// Copyright (C) 2008 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.caja.parser;
import com.google.caja.SomethingWidgyHappenedError;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
/**
 * Provides a similar interface to {@code java.text.Normalizer}, but will
 * compile and run on JDK1.5 in a stricter mode.
 *
 * <p>{@code java.text.Normalizer} is looked up reflectively, once, when this
 * class is initialized. When it is present, {@link #isNormalized} delegates to
 * it. When it is absent, a conservative heuristic is used instead that only
 * accepts strings made up entirely of chars below 256.
 *
 * @author mikesamuel@gmail.com
 */
final class Normalizer {
  /** {@code java.text.Normalizer.isNormalized}, or null if unavailable. */
  private static final Method IS_NORMALIZED;
  /** The {@code java.text.Normalizer.Form.NFC} constant, or null if unavailable. */
  private static final Object NORMAL_FORM_C;

  static {
    Method isNormalized = null;
    Object normalFormC = null;
    try {
      Class<?> normalizer = Class.forName("java.text.Normalizer");
      Class<?> normalizerForm = Class.forName("java.text.Normalizer$Form");
      isNormalized = normalizer.getMethod(
          "isNormalized", CharSequence.class, normalizerForm);
      normalFormC = normalizerForm.getField("NFC").get(null);
    } catch (ClassNotFoundException ex) {
      // java.text.Normalizer was only added in Java 6, so older class
      // libraries do not provide it.  Leave both fields null so that
      // isNormalized falls back to its heuristic.
    } catch (IllegalAccessException ex) {
      // The classes exist, so a failure past this point is not a "missing
      // library" situation; fail class initialization rather than fall back.
      throw new SomethingWidgyHappenedError(
          "Normalizer exists but is unexpectedly inaccessible", ex);
    } catch (NoSuchFieldException ex) {
      // Normalizer.Form was found but has no NFC constant.
      throw new SomethingWidgyHappenedError(
          "Normalizer.Form unexpectedly missing", ex);
    } catch (NoSuchMethodException ex) {
      // Normalizer was found but has no isNormalized(CharSequence, Form).
      throw new SomethingWidgyHappenedError(
          "Normalizer unexpectedly missing methods", ex);
    }
    // Either both are set, or both are still null.
    IS_NORMALIZED = isNormalized;
    NORMAL_FORM_C = normalFormC;
  }

  /**
   * A conservative heuristic as to whether s is normalized according to Unicode
   * Normal Form C.  It is heuristic, because Caja needs to run with versions
   * of the Java standard libraries that do not include normalization.
   *
   * <p>When {@code java.text.Normalizer} is available the answer comes from
   * it.  Otherwise this returns true only when every char of {@code s} is
   * below 256, so it may return false for a string that is in fact normalized.
   *
   * @param s the text to check.
   * @return false if s is not normalized.
   * @throws NullPointerException if {@code s} is null.
   * @throws SomethingWidgyHappenedError if the reflective call to
   *     {@code java.text.Normalizer} cannot be made.
   */
  public static boolean isNormalized(CharSequence s) {
    if (IS_NORMALIZED == null) {
      return containsOnlyLatin1(s);
    }
    try {
      return ((Boolean) IS_NORMALIZED.invoke(null, s, NORMAL_FORM_C))
          .booleanValue();
    } catch (IllegalAccessException ex) {
      throw new SomethingWidgyHappenedError(
          "Normalizer unexpectedly uninvokable", ex);
    } catch (InvocationTargetException ex) {
      Throwable th = ex.getTargetException();
      // The method was invoked and itself threw.  Propagate unchecked
      // failures (e.g. a NullPointerException for a null argument) exactly as
      // a direct call would, instead of mislabelling them as a reflection
      // problem.  This also matches what the fallback path throws for null.
      if (th instanceof RuntimeException) { throw (RuntimeException) th; }
      if (th instanceof Error) { throw (Error) th; }
      throw new SomethingWidgyHappenedError(
          "Normalizer unexpectedly uninvokable", th);
    }
  }

  /**
   * Fallback used when {@code java.text.Normalizer} is unavailable.
   *
   * @return true iff every char in {@code s} is below 256.
   */
  private static boolean containsOnlyLatin1(CharSequence s) {
    // From http://unicode.org/reports/tr15/#D6
    //   Legacy character sets are classified into three categories
    //   based on their normalization behavior with accepted
    //   transcoders.
    //   1. Prenormalized. Any string in the character set is already in
    //      Normalization Form X.
    //      For example, ISO 8859-1 is prenormalized in NFC.
    //   ...
    for (int i = s.length(); --i >= 0;) {
      char ch = s.charAt(i);
      // Codepoints in [32, 126] U [160, 255] are identical in both Unicode and
      // ISO 8859-1.
      // Codepoints in [0, 31] and [127, 159] are not part of ISO 8859-1.  They
      // are control characters in Unicode, and disallowed in identifiers so
      // will never reach here.
      if (ch >= 256) { return false; }
    }
    return true;
  }
}