/*
*******************************************************************************
* Copyright (C) 2003-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.impl;
import com.ibm.icu.text.IDNA;
import com.ibm.icu.text.StringPrep;
import com.ibm.icu.text.StringPrepParseException;
import com.ibm.icu.text.UCharacterIterator;
import com.ibm.icu.text.UForwardCharacterIterator;
/**
* IDNA2003 implementation code, moved out of com.ibm.icu.text.IDNA.java while extending that class to support IDNA2008/UTS #46 as well.
*
* @author Ram Viswanadha
*/
public final class IDNA2003 {
/** The IDNA ACE prefix "xn--". Declared final so the shared reference cannot be reassigned. */
private static final char[] ACE_PREFIX = new char[] { 0x0078, 0x006E, 0x002d, 0x002d };
/** Upper bound on the length of a single converted label (checked in convertToASCII). */
private static final int MAX_LABEL_LENGTH = 63;
/** U+002D HYPHEN-MINUS; a label may not start or end with it under the STD3 rules check. */
private static final int HYPHEN = 0x002D;
/** U+0041 'A', lower bound of the ASCII upper-case range. */
private static final int CAPITAL_A = 0x0041;
/** U+005A 'Z', upper bound of the ASCII upper-case range. */
private static final int CAPITAL_Z = 0x005A;
/** Offset added to an ASCII upper-case letter to obtain its lower-case form. */
private static final int LOWER_CASE_DELTA = 0x0020;
/** U+002E FULL STOP, the separator written between labels of a converted domain name. */
private static final int FULL_STOP = 0x002E;
/** Upper bound on the length of a whole converted domain name (checked in convertIDNToASCII/convertIDNToUnicode). */
private static final int MAX_DOMAIN_NAME_LENGTH = 255;
/** The NamePrep (RFC 3491) profile object used to prepare labels. */
private static final StringPrep namePrep = StringPrep.getInstance(StringPrep.RFC3491_NAMEPREP);
/**
 * Tells whether the buffer begins with the ACE prefix, ignoring the case of ASCII letters.
 *
 * @param src the text to inspect
 * @return true if the leading characters of src, lower-cased, equal ACE_PREFIX
 */
private static boolean startsWithPrefix(StringBuffer src) {
    final int prefixLength = ACE_PREFIX.length;
    // Too short to hold the prefix at all.
    if (src.length() < prefixLength) {
        return false;
    }
    for (int pos = 0; pos < prefixLength; pos++) {
        if (toASCIILower(src.charAt(pos)) != ACE_PREFIX[pos]) {
            return false;
        }
    }
    return true;
}
/**
 * Lower-cases a single ASCII upper-case letter; every other char is returned unchanged.
 *
 * @param ch the char to convert
 * @return ch shifted by LOWER_CASE_DELTA when it lies in CAPITAL_A..CAPITAL_Z, otherwise ch
 */
private static char toASCIILower(char ch) {
    final boolean isAsciiUpper = ch >= CAPITAL_A && ch <= CAPITAL_Z;
    return isAsciiUpper ? (char) (ch + LOWER_CASE_DELTA) : ch;
}
/**
 * Builds a copy of src in which every ASCII upper-case letter is lower-cased.
 *
 * @param src the text to convert
 * @return a new buffer holding the converted chars, in order
 */
private static StringBuffer toASCIILower(CharSequence src) {
    final int length = src.length();
    StringBuffer lowered = new StringBuffer();
    int pos = 0;
    while (pos < length) {
        lowered.append(toASCIILower(src.charAt(pos)));
        pos++;
    }
    return lowered;
}
/**
 * Compares two buffers, treating ASCII letters case-insensitively.
 *
 * <p>The common prefix is compared char by char; if it is identical the shorter
 * buffer orders first. Bounding the loop by the shorter length keeps the method
 * from reading past the end of s2 when s1 is longer, and keeps a proper prefix
 * from being reported as equal to the longer string.
 *
 * @param s1 the first text
 * @param s2 the second text
 * @return a negative value, zero, or a positive value when s1 is less than,
 *         equal to, or greater than s2
 */
private static int compareCaseInsensitiveASCII(StringBuffer s1, StringBuffer s2) {
    final int length1 = s1.length();
    final int length2 = s2.length();
    final int minLength = (length1 < length2) ? length1 : length2;
    for (int i = 0; i < minLength; i++) {
        char c1 = s1.charAt(i);
        char c2 = s2.charAt(i);
        // Only fold case when the raw chars differ.
        if (c1 != c2) {
            int rc = toASCIILower(c1) - toASCIILower(c2);
            if (rc != 0) {
                return rc;
            }
        }
    }
    // The common prefix matches: the result is decided by the lengths alone.
    if (length1 == length2) {
        return 0;
    }
    return (length1 < length2) ? -1 : 1;
}
/**
 * Finds the first label separator in src at or after start and before limit.
 *
 * @param src   the chars to search
 * @param start index at which the search begins
 * @param limit index at which the search stops (exclusive)
 * @return the index of the separator, or the index at which the search stopped
 *         when no separator was found
 */
private static int getSeparatorIndex(char[] src, int start, int limit) {
    int index = start;
    while (index < limit && !isLabelSeparator(src[index])) {
        index++;
    }
    return index;
}
/*
private static int getSeparatorIndex(UCharacterIterator iter){
int currentIndex = iter.getIndex();
int separatorIndex = 0;
int ch;
while((ch=iter.next())!= UCharacterIterator.DONE){
if(isLabelSeparator(ch)){
separatorIndex = iter.getIndex();
iter.setIndex(currentIndex);
return separatorIndex;
}
}
// reset index
iter.setIndex(currentIndex);
// we have not found the separator just return the length
}
*/
/**
 * Tells whether the code point is a letter, digit or hyphen, i.e. a member of
 * [\u002D \u0030-\u0039 \u0041-\u005A \u0061-\u007A].
 *
 * @param ch the code point to test
 * @return true if ch is an ASCII letter, an ASCII digit, or the hyphen
 */
private static boolean isLDHChar(int ch) {
    if (ch == 0x002D) {
        return true;
    }
    final boolean isDigit = ch >= 0x0030 && ch <= 0x0039;
    final boolean isUpper = ch >= 0x0041 && ch <= 0x005A;
    final boolean isLower = ch >= 0x0061 && ch <= 0x007A;
    return isDigit || isUpper || isLower;
}
/**
 * Tells whether the code point is one of the label separators recognised by
 * the IDNA RFC: U+002E, U+3002, U+FF0E or U+FF61.
 *
 * @param ch
 *            the code point to test
 * @return true if ch is a label separator
 */
private static boolean isLabelSeparator(int ch) {
    return ch == 0x002e || ch == 0x3002 || ch == 0xFF0E || ch == 0xFF61;
}
/**
 * Applies the IDNA2003 ToASCII operation to a single label.
 *
 * <p>The whole text behind the iterator is processed: the iterator is rewound
 * before it is scanned, so the decision whether NamePrep is needed is made on
 * the same text that is subsequently prepared and encoded.
 *
 * @param src     iterator over the label to convert
 * @param options option bits; IDNA.USE_STD3_RULES is evaluated here and the
 *                value is also passed on to the NamePrep profile
 * @return the label in ASCII form; for input that is ASCII after preparation
 *         this is the prepared text itself, otherwise the ACE prefix followed
 *         by the lower-cased Punycode encoding
 * @throws StringPrepParseException if the prepared label is empty, violates
 *         the STD3 ASCII rules (when requested), is non-ASCII yet already
 *         starts with the ACE prefix, is longer than 63 chars after
 *         conversion, or if NamePrep or Punycode encoding fails
 */
public static StringBuffer convertToASCII(UCharacterIterator src, int options) throws StringPrepParseException {
    final boolean useSTD3ASCIIRules = ((options & IDNA.USE_STD3_RULES) != 0);

    // step 1: find out whether the source contains only ASCII code points.
    // Rewind first so the scan covers the same text that steps 2..8 work on.
    src.setToStart();
    boolean srcIsASCII = true;
    int ch;
    while ((ch = src.next()) != UForwardCharacterIterator.DONE) {
        if (ch > 0x7f) {
            srcIsASCII = false;
            break;
        }
    }
    src.setToStart();

    // step 2 is performed only if the source contains non-ASCII code points
    StringBuffer processOut;
    if (!srcIsASCII) {
        processOut = namePrep.prepare(src, options);
    } else {
        processOut = new StringBuffer(src.getText());
    }
    final int poLen = processOut.length();
    if (poLen == 0) {
        throw new StringPrepParseException("Found zero length lable after NamePrep.", StringPrepParseException.ZERO_LENGTH_LABEL);
    }

    // steps 3 & 4: re-evaluate ASCII-ness on the prepared text and look for non-LDH chars
    srcIsASCII = true;
    boolean srcIsLDH = true;
    int failPos = -1;
    for (int j = 0; j < poLen; j++) {
        ch = processOut.charAt(j);
        if (ch > 0x7F) {
            srcIsASCII = false;
        } else if (!isLDHChar(ch)) {
            // Surrogates need not be assembled here: LDH code points are all ASCII.
            srcIsLDH = false;
            failPos = j;
        }
    }

    if (useSTD3ASCIIRules) {
        // verify 3a (only LDH chars) and 3b (no leading or trailing hyphen)
        if (!srcIsLDH) {
            throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
                    StringPrepParseException.STD3_ASCII_RULES_ERROR, processOut.toString(), (failPos > 0) ? (failPos - 1) : failPos);
        }
        if (processOut.charAt(0) == HYPHEN) {
            throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
                    StringPrepParseException.STD3_ASCII_RULES_ERROR, processOut.toString(), 0);
        }
        if (processOut.charAt(poLen - 1) == HYPHEN) {
            // poLen is known to be > 0 at this point, so poLen - 1 is a valid position
            throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
                    StringPrepParseException.STD3_ASCII_RULES_ERROR, processOut.toString(), poLen - 1);
        }
    }

    StringBuffer dest;
    if (srcIsASCII) {
        dest = processOut;
    } else {
        // step 5: a non-ASCII label must not already begin with the ACE prefix
        if (startsWithPrefix(processOut)) {
            throw new StringPrepParseException("The input starts with the ACE Prefix.",
                    StringPrepParseException.ACE_PREFIX_ERROR, processOut.toString(), 0);
        }
        // step 6: encode the sequence with Punycode
        boolean[] caseFlags = new boolean[poLen];
        StringBuilder punyout = Punycode.encode(processOut, caseFlags);
        // convert all code points to lower case ASCII
        StringBuffer lowerOut = toASCIILower(punyout);
        // step 7: prepend the ACE prefix, then append the encoded label
        dest = new StringBuffer();
        dest.append(ACE_PREFIX, 0, ACE_PREFIX.length);
        dest.append(lowerOut);
    }

    // step 8: verify the length of the result
    if (dest.length() > MAX_LABEL_LENGTH) {
        throw new StringPrepParseException("The labels in the input are too long. Length > 63.",
                StringPrepParseException.LABEL_TOO_LONG_ERROR, dest.toString(), 0);
    }
    return dest;
}
/**
 * Converts a whole domain name to ASCII by running convertToASCII on each
 * label and joining the results with U+002E.
 *
 * <p>An empty label at the very end of the input (a trailing separator, or an
 * empty input) is skipped rather than converted.
 *
 * @param src     the domain name to convert
 * @param options option bits passed on to convertToASCII
 * @return the converted domain name
 * @throws StringPrepParseException if a label cannot be converted or the
 *         result is longer than 255 chars
 */
public static StringBuffer convertIDNToASCII(String src, int options) throws StringPrepParseException {
    final char[] chars = src.toCharArray();
    final int total = chars.length;
    final StringBuffer out = new StringBuffer();
    int labelStart = 0;
    while (true) {
        final int labelEnd = getSeparatorIndex(chars, labelStart, total);
        final boolean atEnd = (labelEnd == total);
        final String label = src.substring(labelStart, labelEnd);
        // Convert everything except an empty final label (the root label).
        if (label.length() != 0 || !atEnd) {
            out.append(convertToASCII(UCharacterIterator.getInstance(label), options));
        }
        if (atEnd) {
            break;
        }
        // Continue after the separator, which is always written as a full stop.
        labelStart = labelEnd + 1;
        out.append((char) FULL_STOP);
    }
    if (out.length() > MAX_DOMAIN_NAME_LENGTH) {
        throw new StringPrepParseException("The output exceed the max allowed length.",
                StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
    }
    return out;
}
/**
 * Applies the IDNA2003 ToUnicode operation to a single label.
 *
 * <p>When NamePrep fails, the label does not carry the ACE prefix, Punycode
 * decoding fails, or the decoded text does not convert back to the prepared
 * label, the original text of the iterator is returned. The STD3 ASCII rules
 * are not verified by this method itself.
 *
 * @param src     iterator over the label to convert
 * @param options option bits passed on to NamePrep and to convertToASCII
 * @return the decoded label, or the original text when decoding is not possible
 * @throws StringPrepParseException if the round-trip call to convertToASCII fails
 */
public static StringBuffer convertToUnicode(UCharacterIterator src, int options) throws StringPrepParseException {
    final boolean[] caseFlags = null;
    final int startIndex = src.getIndex();

    // step 1: find out if all the code points in src are ASCII
    boolean allAscii = true;
    for (int cp = src.next(); cp != UForwardCharacterIterator.DONE; cp = src.next()) {
        if (cp > 0x7F) {
            allAscii = false;
        }
    }

    StringBuffer prepared;
    if (allAscii) {
        // nothing to prepare, work on the source text directly
        prepared = new StringBuffer(src.getText());
    } else {
        try {
            // step 2: process the string
            src.setIndex(startIndex);
            prepared = namePrep.prepare(src, options);
        } catch (StringPrepParseException ignored) {
            // preparation failed: hand back the input unchanged
            return new StringBuffer(src.getText());
        }
    }

    // TODO:
    // The RFC states that
    // <quote>
    // ToUnicode never fails. If any step fails, then the original input
    // is returned immediately in that step.
    // </quote>

    // step 3: verify ACE prefix
    if (startsWithPrefix(prepared)) {
        // step 4: remove the ACE prefix
        final String encoded = prepared.substring(ACE_PREFIX.length, prepared.length());

        // step 5: decode using Punycode; a failure leaves nothing to return
        StringBuffer decoded;
        try {
            decoded = new StringBuffer(Punycode.decode(encoded, caseFlags));
        } catch (StringPrepParseException ignored) {
            decoded = null;
        }

        if (decoded != null) {
            // step 6: apply ToASCII to the decoded text
            final StringBuffer roundTrip = convertToASCII(UCharacterIterator.getInstance(decoded), options);
            // steps 7 & 8: the decoded text is returned only if it converts back to the prepared label
            if (compareCaseInsensitiveASCII(prepared, roundTrip) == 0) {
                return decoded;
            }
        }
    }

    // fall back to the original input
    return new StringBuffer(src.getText());
}
/**
 * Converts a whole domain name to Unicode by running convertToUnicode on each
 * label. The separators found in the input are copied to the output as they are.
 *
 * @param src     the domain name to convert
 * @param options option bits passed on to convertToUnicode
 * @return the converted domain name
 * @throws StringPrepParseException if an empty label is followed by a
 *         separator, a label cannot be converted, or the result is longer
 *         than 255 chars
 */
public static StringBuffer convertIDNToUnicode(String src, int options) throws StringPrepParseException {
    final char[] chars = src.toCharArray();
    final int total = chars.length;
    final StringBuffer out = new StringBuffer();
    int labelStart = 0;
    while (true) {
        final int labelEnd = getSeparatorIndex(chars, labelStart, total);
        final boolean atEnd = (labelEnd == total);
        final String label = src.substring(labelStart, labelEnd);
        // An empty label is tolerated only at the very end of the input.
        if (!atEnd && label.length() == 0) {
            throw new StringPrepParseException("Found zero length lable after NamePrep.", StringPrepParseException.ZERO_LENGTH_LABEL);
        }
        out.append(convertToUnicode(UCharacterIterator.getInstance(label), options));
        if (atEnd) {
            break;
        }
        // Unlike the ToASCII operation, the label separator is not normalized.
        out.append(chars[labelEnd]);
        // Continue after the separator.
        labelStart = labelEnd + 1;
    }
    if (out.length() > MAX_DOMAIN_NAME_LENGTH) {
        throw new StringPrepParseException("The output exceed the max allowed length.",
                StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
    }
    return out;
}
/**
 * Compares two domain names after converting both to their ASCII form.
 *
 * @param s1      the first domain name
 * @param s2      the second domain name
 * @param options option bits passed on to convertIDNToASCII
 * @return the result of the ASCII case-insensitive comparison of the two converted names
 * @throws StringPrepParseException if either name cannot be converted
 */
public static int compare(String s1, String s2, int options) throws StringPrepParseException {
    // s1 is converted before s2, so a failure in s1 is reported first.
    return compareCaseInsensitiveASCII(convertIDNToASCII(s1, options), convertIDNToASCII(s2, options));
}
}