/* various tests for consistency of UCD data and API behavior */
public void TestConsistency() {
char[] buffer16 = new char[300];
char[] buffer = new char[300];
UnicodeSet set1, set2, set3, set4;
USerializedSet sset;
int start, end;
int i, length;
String hyphenPattern = "[:Hyphen:]";
String dashPattern = "[:Dash:]";
String lowerPattern = "[:Lowercase:]";
String formatPattern = "[:Cf:]";
String alphaPattern = "[:Alphabetic:]";
/*
* It used to be that UCD.html and its precursors said
* "Those dashes used to mark connections between pieces of words,
* plus the Katakana middle dot."
*
* Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
* but not from Hyphen.
* UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
* Therefore, do not show errors when testing the Hyphen property.
*/
logln("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
+ "known to the UTC and not considered errors.\n");
set1=new UnicodeSet(hyphenPattern);
set2=new UnicodeSet(dashPattern);
/* remove the Katakana middle dot(s) from set1 */
set1.remove(0x30fb);
set2.remove (0xff65); /* halfwidth variant */
showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", false);
/* check that Cf is neither Hyphen nor Dash nor Alphabetic */
set3=new UnicodeSet(formatPattern);
set4=new UnicodeSet(alphaPattern);
showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", false);
showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", true);
showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", true);
/*
* Check that each lowercase character has "small" in its name
* and not "capital".
* There are some lowercase characters that do not follow this pattern, some of which seem odd.
* Use the verbose flag to see these notices.
*/
set1=new UnicodeSet(lowerPattern);
for(i=0;; ++i) {
// try{
// length=set1.getItem(set1, i, &start, &end, NULL, 0, &errorCode);
// }catch(Exception e){
// break;
// }
start = set1.getRangeStart(i);
end = set1.getRangeEnd(i);
length = i<set1.getRangeCount() ? set1.getRangeCount() : 0;
if(length!=0) {
break; /* done with code points, got a string or -1 */
}
while(start<=end) {
String name=UCharacter.getName(start);
if( (name.indexOf("SMALL")< 0 || name.indexOf("CAPITAL")<-1) &&
name.indexOf("SMALL CAPITAL")==-1
) {
logln("info: [:Lowercase:] contains U+"+hex(start) + " whose name does not suggest lowercase: " + name);
}
++start;
}
}
/*
* Test for an example that unorm_getCanonStartSet() delivers
* all characters that compose from the input one,
* even in multiple steps.
* For example, the set for "I" (0049) should contain both
* I-diaeresis (00CF) and I-diaeresis-acute (1E2E).
* In general, the set for the middle such character should be a subset
* of the set for the first.
*/
set1=new UnicodeSet();
set2=new UnicodeSet();
sset = new USerializedSet();
NormalizerImpl.getCanonStartSet(0x49,sset);
_setAddSerialized(set1, sset);
/* enumerate all characters that are plausible to be latin letters */