int lineNum = 0;
Vector<BuilderScriptSet> scriptSets = null;
int rtScriptSetsCount = 2;
Trie2Writable anyCaseTrie = new Trie2Writable(0, 0);
Trie2Writable lowerCaseTrie = new Trie2Writable(0, 0);
// The scriptSets vector provides a mapping from TRIE values to the set
// of scripts.
//
// Reserved TRIE values:
//   0: Code point has no whole script confusables.
//   1: Code point is of script Common or Inherited.
//      These code points do not participate in whole script confusable
//      detection.
//      (This is logically equivalent to saying that they contain
//      confusables in all scripts.)
//
// Because Trie values are indexes into the ScriptSets vector, pre-fill
// vector positions 0 and 1 to avoid conflicts with the reserved values.
scriptSets = new Vector<BuilderScriptSet>();
scriptSets.addElement(null);
scriptSets.addElement(null);
readWholeFileToString(confusablesWS, input);
parseRegexp = Pattern.compile(parseExp);
// Zap any Byte Order Mark at the start of input. Changing it to a space
// is benign given the syntax of the input.
if (input.charAt(0) == 0xfeff) {
input.setCharAt(0, (char) 0x20);
}
// Parse the input, one line per iteration of this loop.
Matcher matcher = parseRegexp.matcher(input);
while (matcher.find()) {
lineNum++;
if (matcher.start(1) >= 0) {
// this was a blank or comment line.
continue;
}
if (matcher.start(8) >= 0) {
// input file syntax error.
throw new ParseException("ConfusablesWholeScript, line " + lineNum + ": Unrecognized input: " + matcher.group(),
matcher.start());
}
// Pick up the start and optional range end code points from the
// parsed line.
int startCodePoint = Integer.parseInt(matcher.group(2), 16);
if (startCodePoint > 0x10ffff) {
throw new ParseException("ConfusablesWholeScript, line " + lineNum + ": out of range code point: "
+ matcher.group(2), matcher.start(2));
}
int endCodePoint = startCodePoint;
if (matcher.start(3) >= 0) {
endCodePoint = Integer.parseInt(matcher.group(3), 16);
}
if (endCodePoint > 0x10ffff) {
throw new ParseException("ConfusablesWholeScript, line " + lineNum + ": out of range code point: "
+ matcher.group(3), matcher.start(3));
}
// Extract the two script names from the source line.
String srcScriptName = matcher.group(4);
String targScriptName = matcher.group(5);
int srcScript = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, srcScriptName);
int targScript = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, targScriptName);
if (srcScript == UScript.INVALID_CODE) {
throw new ParseException(
"ConfusablesWholeScript, line " + lineNum + ": Invalid script code t: " + matcher.group(4),
matcher.start(4));
}
if (targScript == UScript.INVALID_CODE) {
throw new ParseException(
"ConfusablesWholeScript, line " + lineNum + ": Invalid script code t: " + matcher.group(5),
matcher.start(5));
}
// select the table - (A) any case or (L) lower case only
Trie2Writable table = anyCaseTrie;
if (matcher.start(7) >= 0) {
table = lowerCaseTrie;
}
// Build the set of scripts containing confusable characters for
// the code point(s) specified in this input line.
// Sanity check that the script of the source code point is the same
// as the source script indicated in the input file. Failure of this
// check is an error in the input file.
//
// Include the source script in the set (needed for Mixed Script
// Confusable detection).
//
int cp;
for (cp = startCodePoint; cp <= endCodePoint; cp++) {
int setIndex = table.get(cp);
BuilderScriptSet bsset = null;
if (setIndex > 0) {
assert (setIndex < scriptSets.size());
bsset = scriptSets.elementAt(setIndex);
} else {
bsset = new BuilderScriptSet();
bsset.codePoint = cp;
bsset.trie = table;
bsset.sset = new ScriptSet();
setIndex = scriptSets.size();
bsset.index = setIndex;
bsset.rindex = 0;
scriptSets.addElement(bsset);
table.set(cp, setIndex);
}
bsset.sset.Union(targScript);
bsset.sset.Union(srcScript);
int cpScript = UScript.getScript(cp);