Package com.ibm.icu.text

Examples of com.ibm.icu.text.UnicodeSet$IntPropertyFilter


   /* various tests for consistency of UCD data and API behavior */
   public void TestConsistency() {
       char[] buffer16 = new char[300];
       char[] buffer   = new char[300];
       UnicodeSet set1, set2, set3, set4;

       USerializedSet sset;
       int start, end;
       int i, length;

       String hyphenPattern = "[:Hyphen:]";
       String dashPattern = "[:Dash:]";
       String lowerPattern = "[:Lowercase:]";
       String formatPattern = "[:Cf:]";
       String alphaPattern  =  "[:Alphabetic:]";

       /*
        * It used to be that UCD.html and its precursors said
        * "Those dashes used to mark connections between pieces of words,
        *  plus the Katakana middle dot."
        *
        * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
        * but not from Hyphen.
        * UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
        * Therefore, do not show errors when testing the Hyphen property.
        */
       logln("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
                   + "known to the UTC and not considered errors.\n");

       set1=new UnicodeSet(hyphenPattern);
       set2=new UnicodeSet(dashPattern);

           /* remove the Katakana middle dot(s) from set1 */
           set1.remove(0x30fb);
           set2.remove (0xff65); /* halfwidth variant */
           showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", false);


       /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
       set3=new UnicodeSet(formatPattern);
       set4=new UnicodeSet(alphaPattern);

       showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", false);
       showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", true);
       showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", true);
       /*
        * Check that each lowercase character has "small" in its name
        * and not "capital".
        * There are some such characters, some of which seem odd.
        * Use the verbose flag to see these notices.
        */
       set1=new UnicodeSet(lowerPattern);

       for(i=0;; ++i) {
//               try{
//                   length=set1.getItem(set1, i, &start, &end, NULL, 0, &errorCode);
//               }catch(Exception e){
//                   break;
//               }
            start = set1.getRangeStart(i);
            end = set1.getRangeEnd(i);
            length = i<set1.getRangeCount() ? set1.getRangeCount() : 0;
           if(length!=0) {
               break; /* done with code points, got a string or -1 */
           }

           while(start<=end) {
               String name=UCharacter.getName(start);

               if( (name.indexOf("SMALL")< 0 || name.indexOf("CAPITAL")<-1) &&
                   name.indexOf("SMALL CAPITAL")==-1
               ) {
                   logln("info: [:Lowercase:] contains U+"+hex(start) + " whose name does not suggest lowercase: " + name);
               }
               ++start;
           }
       }


       /*
        * Test for an example that unorm_getCanonStartSet() delivers
        * all characters that compose from the input one,
        * even in multiple steps.
        * For example, the set for "I" (0049) should contain both
        * I-diaeresis (00CF) and I-diaeresis-acute (1E2E).
        * In general, the set for the middle such character should be a subset
        * of the set for the first.
        */
       set1=new UnicodeSet();
       set2=new UnicodeSet();
       sset = new USerializedSet();
       NormalizerImpl.getCanonStartSet(0x49,sset);
       _setAddSerialized(set1, sset);

       /* enumerate all characters that are plausible to be latin letters */
 
View Full Code Here


     * Returns the set for chaining.
     * @param exemplar1
     * @return
     */
    public static UnicodeSet flatten(UnicodeSet exemplar1) {
        UnicodeSet result = new UnicodeSet();
        boolean gotString = false;
        for (UnicodeSetIterator it = new UnicodeSetIterator(exemplar1); it.nextRange();) {
            if (it.codepoint == UnicodeSetIterator.IS_STRING) {
                result.addAll(it.string);
                gotString = true;
            } else {
                result.add(it.codepoint, it.codepointEnd);
            }
        }
        if (gotString) exemplar1.set(result);
        return exemplar1;
    }
View Full Code Here

     * @param uset
     * @return formatted UnicodeSet
     */
    public String toPattern(UnicodeSet uset) {
        first = true;
        UnicodeSet putAtEnd = new UnicodeSet(uset).retainAll(sortAtEnd); // remove all the unassigned gorp for now
        // make sure that comparison separates all strings, even canonically equivalent ones
        Set orderedStrings = new TreeSet(ordering);
        for (UnicodeSetIterator it = new UnicodeSetIterator(uset); it.nextRange();) {
            if (it.codepoint == UnicodeSetIterator.IS_STRING) {
                orderedStrings.add(it.string);
            } else {
                for (int i = it.codepoint; i <= it.codepointEnd; ++i) {
                    if (!putAtEnd.contains(i)) {
                        orderedStrings.add(UTF16.valueOf(i));
                    }
                }
            }
        }
View Full Code Here

        String line = null;
        String[] fields = new String[5];
        StringBuffer buf = new StringBuffer();
        int passCount = 0;
        int failCount = 0;
        UnicodeSet other = new UnicodeSet(0, 0x10ffff);
        int c=0;
        try {
            input = TestUtil.getDataReader("unicode/NormalizationTest.txt");
            for (int count = 0;;++count) {
                line = input.readLine();
                if (line == null) {
                    //read the extra test cases
                    if(count > moreCases.length) {
                        count = 0;
                    } else if(count == moreCases.length) {
                        // all done
                        break;
                    }
                    line = moreCases[count++];
                }
                if (line.length() == 0) continue;

                // Expect 5 columns of this format:
                // 1E0C;1E0C;0044 0323;1E0C;0044 0323; # <comments>

                // Skip comments
                if (line.charAt(0) == '#'  || line.charAt(0)=='@') continue;

                // Parse out the fields
                hexsplit(line, ';', fields, buf);
               
                // Remove a single code point from the "other" UnicodeSet
                if(fields[0].length()==UTF16.moveCodePointOffset(fields[0],0, 1)) {
                    c=UTF16.charAt(fields[0],0);
                    if(0xac20<=c && c<=0xd73f) {
                        // not an exhaustive test run: skip most Hangul syllables
                        if(c==0xac20) {
                            other.remove(0xac20, 0xd73f);
                        }
                        continue;
                    }
                    other.remove(c);
                }
                if (checkConformance(fields, line)) {
                    ++passCount;
                } else {
                    ++failCount;
View Full Code Here

        }

        // generate a list of all caseless characters -- characters whose
        // case closure is themselves.

        UnicodeSet caseless = new UnicodeSet();

        for (int i = 0; i <= 0x10FFFF; ++i) {
            String cp = UTF16.valueOf(i);
            ci.reset(cp);
            int count = 0;
            String fold = null;
            for (String temp = ci.next(); temp != null; temp = ci.next()) {
                fold = temp;
                if (++count > 1) break;
            }
            if (count==1 && fold.equals(cp)) {
                caseless.add(i);
            }
        }

        System.out.println("caseless = " + caseless.toPattern(true));

        UnicodeSet not_lc = new UnicodeSet("[:^lc:]");
       
        UnicodeSet a = new UnicodeSet();
        a.set(not_lc);
        a.removeAll(caseless);
        System.out.println("[:^lc:] - caseless = " + a.toPattern(true));

        a.set(caseless);
        a.removeAll(not_lc);
        System.out.println("caseless - [:^lc:] = " + a.toPattern(true));
    }
View Full Code Here

        }
        logln("Comparing Sets");
        for (Iterator it = values1.iterator(); it.hasNext();) {
            Object value = it.next();
            logln(value == null ? "null" : value.toString());
            UnicodeSet set1 = map1.getSet(value);
            UnicodeSet set2 = TestBoilerplate.getSet(map2, value);
            if (!TestBoilerplate.verifySetsIdentical(this, set1, set2)) {
                throw new IllegalArgumentException("Halting");
            }
        }
       
View Full Code Here

        /*
         * @see com.ibm.icu.dev.test.TestBoilerplate#_createTestObject()
         */
        protected boolean _addTestObject(List list) {
            if (list.size() > 32) return false;
            UnicodeSet result = new UnicodeSet();
            for (int i = 0; i < 50; ++i) {
                result.add(random.nextInt(100));
            }
            list.add(result.toString());
            return true;
        }
View Full Code Here

        }
       
        // test cases with i and I to make sure Turkic works
        char[] iI= new char[]{ 0x49, 0x69, 0x130, 0x131 };
        USerializedSet sset=new USerializedSet();
        UnicodeSet set = new UnicodeSet();
   
        String s1, s2;
        int start, end;
   
        // collect all sets into one for contiguous output
        int[] startEnd = new int[2];
        for(i=0; i<iI.length; ++i) {
            if(NormalizerImpl.getCanonStartSet(iI[i], sset)) {
                count=sset.countRanges();
                for(j=0; j<count; ++j) {
                    sset.getRange(j, startEnd);
                    set.add(startEnd[0], startEnd[1]);
                }
            }
        }

        // test all of these precomposed characters
View Full Code Here

  
        return skipSets;
    }

    public void TestSkippable() {
       UnicodeSet starts;
       UnicodeSet[] skipSets = new UnicodeSet[]{
                                                    new UnicodeSet(), //NFD
                                                    new UnicodeSet(), //NFC
                                                    new UnicodeSet(), //NFKC
                                                    new UnicodeSet(), //NFKD
                                                    new UnicodeSet(), //FCD
                                                    new UnicodeSet(), //NONE
                                               };
       UnicodeSet[] expectSets = new UnicodeSet[]{
                                                    new UnicodeSet(),
                                                    new UnicodeSet(),
                                                    new UnicodeSet(),
                                                    new UnicodeSet(),
                                                    new UnicodeSet(),
                                                    new UnicodeSet(),
                                               };
       StringBuffer s, pattern;
       int start, limit, rangeEnd;
       int i, range, count;
       starts = new UnicodeSet();
       /*
       //[\u0350-\u0357\u035D-\u035F\u0610-\u0615\u0656-\u0658\u0CBC\u17DD\u1939-\u193B]
       for(int ch=0;ch<=0x10FFFF;ch++){
               if(Normalizer.isNFSkippable(ch, Normalizer.NFD)) {
                   skipSets[D].add(ch);
               }
               if(Normalizer.isNFSkippable(ch, Normalizer.NFKD)) {
                   skipSets[KD].add(ch);
               }
               if(Normalizer.isNFSkippable(ch, Normalizer.NFC)) {
                   skipSets[C].add(ch);
               }
               if(Normalizer.isNFSkippable(ch, Normalizer.NFKC)) {
                   skipSets[KC].add(ch);
               }
               if(Normalizer.isNFSkippable(ch, Normalizer.FCD)) {
                   skipSets[FCD].add(ch);
               }
               if(Normalizer.isNFSkippable(ch, Normalizer.NONE)) {
                   skipSets[NONE].add(ch);
               }
       }
       */
       // build NF*Skippable sets from runtime data
       NormalizerImpl.addPropertyStarts(starts);
       count=starts.getRangeCount();
  
       start=limit=0;
       rangeEnd=0;
       range=0;
       for(;;) {
           if(start<limit) {
               // get properties for start and apply them to [start..limit[
               if(Normalizer.isNFSkippable(start, Normalizer.NFD)) {
                   skipSets[D].add(start, limit-1);
               }
               if(Normalizer.isNFSkippable(start, Normalizer.NFKD)) {
                   skipSets[KD].add(start, limit-1);
               }
               if(Normalizer.isNFSkippable(start, Normalizer.NFC)) {
                   skipSets[C].add(start, limit-1);
               }
               if(Normalizer.isNFSkippable(start, Normalizer.NFKC)) {
                   skipSets[KC].add(start, limit-1);
               }
               if(Normalizer.isNFSkippable(start, Normalizer.FCD)) {
                   skipSets[FCD].add(start, limit-1);
               }
               if(Normalizer.isNFSkippable(start, Normalizer.NONE)) {
                   skipSets[NONE].add(start, limit-1);
               }
              
           }
  
           // go to next range of same properties
           start=limit;
           if(++limit>rangeEnd) {
               if(range<count) {
                   limit=starts.getRangeStart(range);
                   rangeEnd=starts.getRangeEnd(range);
                   ++range;
               } else if(range==count) {
                   // additional range to complete the Unicode code space
                   limit=rangeEnd=0x110000;
                   ++range;
               } else {
                   break;
               }
           }
       }
  
       expectSets = initSkippables(expectSets);
       if(expectSets[D].contains(0x0350)){
            errln("expectSets[D] contains 0x0350");
       }
       //expectSets.length for now do not test FCD and NONE since there is no data
       for(i=0; i< 4; ++i) {

           if(!skipSets[i].equals(expectSets[i])) {
               errln("error: TestSkippable skipSets["+i+"]!=expectedSets["+i+"]\n"+
                     "May need to update hardcoded UnicodeSet patterns in com.ibm.icu.dev.test.normalizer.BasicTest.java\n"+
                     "See ICU4J - unicodetools.com.ibm.text.UCD.NFSkippable\n" +
                     "Run com.ibm.text.UCD.Main with the option NFSkippable.");
  
               s=new StringBuffer();
              
               s.append("\n\nskip=       ");
               s.append(skipSets[i].toPattern(true));
               s.append("\n\n");
              
               s.append("skip-expect=");            
               pattern = new StringBuffer(((UnicodeSet)skipSets[i].clone()).removeAll(expectSets[i]).toPattern(true));
               s.append(pattern);
  
               pattern.delete(0,pattern.length());
               s.append("\n\nexpect-skip=");
               pattern = new StringBuffer(((UnicodeSet)expectSets[i].clone()).removeAll(skipSets[i]).toPattern(true));
               s.append(pattern);
               s.append("\n\n");
              
               pattern.delete(0,pattern.length());
               s.append("\n\nintersection(expect,skip)=");
               UnicodeSet intersection  = ((UnicodeSet) expectSets[i].clone()).retainAll(skipSets[i]);
               pattern = new StringBuffer(intersection.toPattern(true));
               s.append(pattern);
               s.append("\n\n");
              

              
View Full Code Here

            }
        }
    } 
   
    public void TestGetNX(){
        UnicodeSet set = NormalizerImpl.getNX(1 /*NormalizerImpl.NX_HANGUL*/);
        if(!set.contains(0xac01)){
            errln("getNX did not return correct set for NX_HANGUL");
        }
       
        set = NormalizerImpl.getNX(2/*NormalizerImpl.NX_CJK_COMPAT*/);
        if(!set.contains('\uFA20')){
            errln("getNX did not return correct set for NX_CJK_COMPAT");
        }
    }
View Full Code Here

TOP

Related Classes of com.ibm.icu.text.UnicodeSet$IntPropertyFilter

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.