Examples of com.ibm.icu.impl.USerializedSet

com.ibm.icu.impl.USerializedSet
Simple class for handling serialized USet/UnicodeSet structures without object creation. See ICU4C icu/source/common/uset.c. @internal

   public void TestConsistency() {
       char[] buffer16 = new char[300];
       char[] buffer   = new char[300];
       UnicodeSet set1, set2, set3, set4;


       USerializedSet sset;
       int start, end;
       int i, length;


       String hyphenPattern = "[:Hyphen:]";
       String dashPattern = "[:Dash:]";
       String lowerPattern = "[:Lowercase:]";
       String formatPattern = "[:Cf:]";
       String alphaPattern  =  "[:Alphabetic:]";


       /*
        * It used to be that UCD.html and its precursors said
        * "Those dashes used to mark connections between pieces of words,
        *  plus the Katakana middle dot."
        *
        * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
        * but not from Hyphen.
        * UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
        * Therefore, do not show errors when testing the Hyphen property.
        */
       logln("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
                   + "known to the UTC and not considered errors.\n");


       set1=new UnicodeSet(hyphenPattern);
       set2=new UnicodeSet(dashPattern);


           /* remove the Katakana middle dot(s) from set1 */
           set1.remove(0x30fb);
           set2.remove (0xff65); /* halfwidth variant */
           showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", false);




       /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
       set3=new UnicodeSet(formatPattern);
       set4=new UnicodeSet(alphaPattern);


       showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", false);
       showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", true);
       showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", true);
       /*
        * Check that each lowercase character has "small" in its name
        * and not "capital".
        * There are some such characters, some of which seem odd.
        * Use the verbose flag to see these notices.
        */
       set1=new UnicodeSet(lowerPattern);


       for(i=0;; ++i) {
//               try{
//                   length=set1.getItem(set1, i, &start, &end, NULL, 0, &errorCode);
//               }catch(Exception e){
//                   break;
//               }
            start = set1.getRangeStart(i);
            end = set1.getRangeEnd(i);
            length = i<set1.getRangeCount() ? set1.getRangeCount() : 0;
           if(length!=0) {
               break; /* done with code points, got a string or -1 */
           }


           while(start<=end) {
               String name=UCharacter.getName(start);


               if( (name.indexOf("SMALL")< 0 || name.indexOf("CAPITAL")<-1) &&
                   name.indexOf("SMALL CAPITAL")==-1
               ) {
                   logln("info: [:Lowercase:] contains U+"+hex(start) + " whose name does not suggest lowercase: " + name);
               }
               ++start;
           }
       }




       /*
        * Test for an example that unorm_getCanonStartSet() delivers
        * all characters that compose from the input one,
        * even in multiple steps.
        * For example, the set for "I" (0049) should contain both
        * I-diaeresis (00CF) and I-diaeresis-acute (1E2E).
        * In general, the set for the middle such character should be a subset
        * of the set for the first.
        */
       set1=new UnicodeSet();
       set2=new UnicodeSet();
       sset = new USerializedSet();
       NormalizerImpl.getCanonStartSet(0x49,sset);
       _setAddSerialized(set1, sset);


       /* enumerate all characters that are plausible to be latin letters */
       for(start=0xa0; start<0x2000; ++start) {

View Full Code Here

        int[] range = new int[2];
        for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {


            // see if any character is at the start of some decomposition
            cp = UTF16.charAt(segment, i);
            USerializedSet starts = new USerializedSet();


            if (!NormalizerImpl.getCanonStartSet(cp, starts)) {
              continue;
            }
            int j=0;
            // if so, see which decompositions match
            int rangeCount = starts.countRanges();
            for(j = 0; j < rangeCount; ++j) {
                starts.getRange(j, range);
                int end=range[1];
                for (int cp2 = range[0]; cp2 <= end; ++cp2) {
                    Set remainder = extract(cp2, segment, i, workingBuffer);
                    if (remainder == null) continue;

View Full Code Here

            }
        }
        
        // test cases with i and I to make sure Turkic works
        char[] iI= new char[]{ 0x49, 0x69, 0x130, 0x131 };
        USerializedSet sset=new USerializedSet();
        UnicodeSet set = new UnicodeSet();
    
        String s1, s2;
        int start, end;
    
        // collect all sets into one for contiguous output
        int[] startEnd = new int[2];
        for(i=0; i<iI.length; ++i) {
            if(NormalizerImpl.getCanonStartSet(iI[i], sset)) {
                count=sset.countRanges();
                for(j=0; j<count; ++j) {
                    sset.getRange(j, startEnd);
                    set.add(startEnd[0], startEnd[1]);
                }
            }
        }

View Full Code Here

        if(!set.contains('\uFA20')){
            errln("getNX did not return correct set for NX_CJK_COMPAT");
        }
    }
    public void TestSerializedSet(){
        USerializedSet sset=new USerializedSet();
        UnicodeSet set = new UnicodeSet();
        int start, end;
    
        // collect all sets into one for contiguous output
        int[] startEnd = new int[2];


        if(NormalizerImpl.getCanonStartSet(0x0130, sset)) {
            int count=sset.countRanges();
            for(int j=0; j<count; ++j) {
                sset.getRange(j, startEnd);
                set.add(startEnd[0], startEnd[1]);
            }
        }
       


        // test all of these precomposed characters
        UnicodeSetIterator it = new UnicodeSetIterator(set);
        while(it.nextRange() && it.codepoint!=UnicodeSetIterator.IS_STRING) {
            start=it.codepoint;
            end=it.codepointEnd;
            while(start<=end) {
                if(!sset.contains(start)){
                    errln("USerializedSet.contains failed for "+Utility.hex(start,8));
                }
            }
        }
    }

View Full Code Here

        int[] range = new int[2];
        for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {


            // see if any character is at the start of some decomposition
            cp = UTF16.charAt(segment, i);
            USerializedSet starts = new USerializedSet();


            if (!NormalizerImpl.getCanonStartSet(cp, starts)) {
              continue;
            }
            int j=0;
            // if so, see which decompositions match
            int rangeCount = starts.countRanges();
            for(j = 0; j < rangeCount; ++j) {
              starts.getRange(j, range);
                int end=range[1];
                for (int cp2 = range[0]; cp2 <= end; ++cp2) {
                  Set remainder = extract(cp2, segment, i, workingBuffer);
                  if (remainder == null) continue;

View Full Code Here

TOP

Related Classes of com.ibm.icu.impl.USerializedSet

com.ibm.icu.dev.test.lang.UCharacterTest

com.ibm.icu.dev.test.normalizer.BasicTest

com.ibm.icu.text.CanonicalIterator

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by ORACLE Inc. Contact coftware#gmail.com.