Package com.ibm.icu.text

Examples of com.ibm.icu.text.UnicodeSet


            errln("getNX did not return correct set for NX_CJK_COMPAT");
        }
    }
    public void TestSerializedSet(){
        USerializedSet sset=new USerializedSet();
        UnicodeSet set = new UnicodeSet();
        int start, end;
   
        // collect all sets into one for contiguous output
        int[] startEnd = new int[2];

        if(NormalizerImpl.getCanonStartSet(0x0130, sset)) {
            int count=sset.countRanges();
            for(int j=0; j<count; ++j) {
                sset.getRange(j, startEnd);
                set.add(startEnd[0], startEnd[1]);
            }
        }
      

        // test all of these precomposed characters
View Full Code Here


import java.io.*;

public class TransliterationChart {
    public static void main(String[] args) throws IOException {
        System.out.println("Start");
        UnicodeSet lengthMarks = new UnicodeSet("[\u09D7\u0B56-\u0B57\u0BD7\u0C56\u0CD5-\u0CD6\u0D57\u0C55\u0CD5]");
        int[] indicScripts = {
            UScript.LATIN,
            UScript.DEVANAGARI,
            UScript.BENGALI,
            UScript.GURMUKHI,
            UScript.GUJARATI,
            UScript.ORIYA,
            UScript.TAMIL,
            UScript.TELUGU,
            UScript.KANNADA,
            UScript.MALAYALAM,
        };
        String[] names = new String[indicScripts.length];
        UnicodeSet[] sets = new UnicodeSet[indicScripts.length];
        Transliterator[] fallbacks = new Transliterator[indicScripts.length];
        for (int i = 0; i < indicScripts.length; ++i) {
            names[i] = UScript.getName(indicScripts[i]);
            sets[i] = new UnicodeSet("[[:" + names[i] + ":]&[[:L:][:M:]]&[:age=3.1:]]");
            fallbacks[i] = Transliterator.getInstance("any-" + names[i]);
        }
        EquivClass eq = new EquivClass(new ReverseComparator());
        PrintWriter pw = openPrintWriter("transChart.html");
        pw.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
        pw.println("<title>Indic Transliteration Chart</title><style>");
        pw.println("td { text-align: Center; font-size: 200% }");
        pw.println("tt { font-size: 50% }");
        pw.println("td.miss { background-color: #CCCCFF }");
        pw.println("</style></head><body bgcolor='#FFFFFF'>");

        Transliterator anyToLatin = Transliterator.getInstance("any-latin");
       
        String testString = "\u0946\u093E";
       
        UnicodeSet failNorm = new UnicodeSet();
        Set latinFail = new TreeSet();
       
        for (int i = 0; i < indicScripts.length; ++i) {
            if (indicScripts[i] == UScript.LATIN) continue;
            String source = names[i];
            System.out.println(source);
            UnicodeSet sourceChars = sets[i];

            for (int j = 0; j < indicScripts.length; ++j) {
                if (i == j) continue;
                String target = names[j];
                Transliterator forward = Transliterator.getInstance(source + '-' + target);
                Transliterator backward = forward.getInverse();
                UnicodeSetIterator it = new UnicodeSetIterator(sourceChars);
                while (it.next()) {
                    if (lengthMarks.contains(it.codepoint)) continue;
                    String s = Normalizer.normalize(it.codepoint,Normalizer.NFC,0);
                    //if (!Normalizer.isNormalized(s,Normalizer.NFC,0)) continue;
                    if (!s.equals(Normalizer.normalize(s,Normalizer.NFD,0))) {
                        failNorm.add(it.codepoint);
                    }
                    String t = fix(forward.transliterate(s));
                    if (t.equals(testString)) {
                        System.out.println("debug");
                    }

                    String r = fix(backward.transliterate(t));
                    if (Normalizer.compare(s,r,0) == 0) {
                        if (indicScripts[j] != UScript.LATIN) eq.add(s,t);
                    } else {
                        if (indicScripts[j] == UScript.LATIN) {
                            latinFail.add(s + " - " + t + " - " + r);
                        }
                    }
                }
            }
        }
        // collect equivalents
        pw.println("<table border='1' cellspacing='0'><tr>");
        for (int i = 0; i < indicScripts.length; ++i) {
            pw.print("<th width='10%'>" + names[i].substring(0,3) + "</th>");
        }
        pw.println("</tr>");

        Iterator rit = eq.getSetIterator(new MyComparator());
        while(rit.hasNext()) {
            Set equivs = (Set)rit.next();
            pw.print("<tr>");
            Iterator sit = equivs.iterator();
            String source = (String)sit.next();
            String item = anyToLatin.transliterate(source);
            if (item.equals("") || source.equals(item)) item = "&nbsp;";
            pw.print("<td>" + item + "</td>");
            for (int i = 1; i < indicScripts.length; ++i) {
                sit = equivs.iterator();
                item = "";
                while (sit.hasNext()) {
                    String trial = (String)sit.next();
                    if (!sets[i].containsAll(trial)) continue;
                    item = trial;
                    break;
                }
                String classString = "";
                if (item.equals("")) {
                    classString = " class='miss'";
                    String temp = fallbacks[i].transliterate(source);
                    if (!temp.equals("") && !temp.equals(source)) item = temp;
                }
                String backup = item.equals("") ? "&nbsp;" : item;
                pw.print("<td" + classString + " title='" + getName(item, "; ") + "'>"
                    + backup + "<br><tt>" + Utility.hex(item) + "</tt></td>");
            }
            /*
            Iterator sit = equivs.iterator();
            while (sit.hasNext()) {
                String item = (String)sit.next();
                pw.print("<td>" + item + "</td>");
            }
            */
            pw.println("</tr>");
        }
        pw.println("</table>");
        if (true) {
            pw.println("<h2>Failed Normalization</h2>");
   
            UnicodeSetIterator it = new UnicodeSetIterator(failNorm);
            UnicodeSet pieces = new UnicodeSet();
            while (it.next()) {
                String s = UTF16.valueOf(it.codepoint);
                String d = Normalizer.normalize(s,Normalizer.NFD,0);
                pw.println("Norm:" + s + ", " + Utility.hex(s) + " " + UCharacter.getName(it.codepoint)
                     + "; " + d + ", " + Utility.hex(d) + ", ");
                pw.println(UCharacter.getName(d.charAt(1)) + "<br>");
                if (UCharacter.getName(d.charAt(1)).indexOf("LENGTH") >= 0) pieces.add(d.charAt(1));
            }
            pw.println(pieces);
           
            pw.println("<h2>Failed Round-Trip</h2>");
            Iterator cit = latinFail.iterator();
View Full Code Here

        String line = null;
        String[] fields = new String[5];
        StringBuffer buf = new StringBuffer();
        int passCount = 0;
        int failCount = 0;
        UnicodeSet other = new UnicodeSet(0, 0x10ffff);
        int c=0;
        try {
            input = TestUtil.getDataReader(fileName);
            for (int count = 0;;++count) {
                line = input.readLine();
                if (line == null) {
                    //read the extra test cases
                    if(count > moreCases.length) {
                        count = 0;
                    } else if(count == moreCases.length) {
                        // all done
                        break;
                    }
                    line = moreCases[count++];
                }
                if (line.length() == 0) continue;

                // Expect 5 columns of this format:
                // 1E0C;1E0C;0044 0323;1E0C;0044 0323; # <comments>

                // Skip comments
                if (line.charAt(0) == '#'  || line.charAt(0)=='@') continue;

                // Parse out the fields
                hexsplit(line, ';', fields, buf);
               
                // Remove a single code point from the "other" UnicodeSet
                if(fields[0].length()==UTF16.moveCodePointOffset(fields[0],0, 1)) {
                    c=UTF16.charAt(fields[0],0);
                    if(0xac20<=c && c<=0xd73f) {
                        // not an exhaustive test run: skip most Hangul syllables
                        if(c==0xac20) {
                            other.remove(0xac20, 0xd73f);
                        }
                        continue;
                    }
                    other.remove(c);
                }
                if (checkConformance(fields, line,options)) {
                    ++passCount;
                } else {
                    ++failCount;
View Full Code Here

        // load the resource bundle
        ICUResourceBundle bundle = (ICUResourceBundle)ICUResourceBundle.getBundleInstance("com/ibm/icu/dev/data/testdata","idna_rules", NamePrepTransform.class.getClassLoader(), true);
        String  mapRules      = bundle.getString("MapNoNormalization");
        mapRules             += bundle.getString("MapNFKC");
        mapTransform          = Transliterator.createFromRules("CaseMap",mapRules,Transliterator.FORWARD);
        labelSeparatorSet     = new UnicodeSet(bundle.getString("LabelSeparatorSet"));
        prohibitedSet         = new UnicodeSet(bundle.getString("ProhibitedSet"));
        unassignedSet         = new UnicodeSet(bundle.getString("UnassignedSet"));
    }
View Full Code Here

    t = new UnicodeSet(pat);
    checkEqual(s, t, "toPattern(true)");
  }
 
  UnicodeSet copyWithIterator(UnicodeSet s, boolean withRange) {
    UnicodeSet t = new UnicodeSet();
    UnicodeSetIterator it = new UnicodeSetIterator(s);
    if (withRange) {
      while (it.nextRange()) {
        if (it.codepoint == UnicodeSetIterator.IS_STRING) {
          t.add(it.string);
        } else {
          t.add(it.codepoint, it.codepointEnd);
        }
      }
    } else {
      while (it.next()) {
        if (it.codepoint == UnicodeSetIterator.IS_STRING) {
          t.add(it.string);
        } else {
          t.add(it.codepoint);
        }
      }
    }
    return t;
  }
View Full Code Here

    }
    return true;
  }
 
  void expectEqual(String name, String pat1, String pat2) {
    UnicodeSet set1, set2;
    try {
      set1 = new UnicodeSet(pat1);
      set2 = new UnicodeSet(pat2);
    } catch (IllegalArgumentException e) {
      errln("FAIL: Couldn't create UnicodeSet from pattern for \"" + name + "\": " + e.getMessage());
      return;
    }
    if(!set1.equals(set2)) {
      errln("FAIL: Sets built from patterns differ for \"" + name + "\"");
    }
  }
View Full Code Here

  /**
   * Expect the given set to contain the characters in charsIn and
   * to not contain those in charsOut.
   */
  void expectContainment(String pat, String charsIn, String charsOut) {
    UnicodeSet set;
    try {
      set = new UnicodeSet(pat);
    } catch (IllegalArgumentException e) {
      errln("FAIL: Couldn't create UnicodeSet from pattern \"" +
          pat + "\": " + e.getMessage());
      return;
    }
View Full Code Here

    /**
     * Perform an exhaustive test on all Unicode characters to make sure that the UnicodeSet with each
     * character works.
     */
    public void TestCharacters() {
        UnicodeSet requiresQuote = new UnicodeSet("[\\$\\&\\-\\:\\[\\\\\\]\\^\\{\\}[:pattern_whitespace:]]");
        boolean skip = getInclusion() < 10;
        for (int cp = 0; cp < 0x110000; ++cp) {
            if (cp > 0xFF && skip && (cp % 37 != 0)) {
                continue;
            }
            String cpString = UTF16.valueOf(cp);
            String s = requiresQuote.contains(cp) ? "\\" + cpString : cpString;
            String pattern = null;
            final String rawPattern = "[" + s + s + "]";
            try {
                pattern = UnicodeRegex.fix(rawPattern);
            } catch (Exception e) {
View Full Code Here

    /**
     * Check all integer Unicode properties to make sure they work.
     */
    public void TestUnicodeProperties() {
        final boolean skip = getInclusion() < 10;
        UnicodeSet temp = new UnicodeSet();
        for (int propNum = UProperty.INT_START; propNum < UProperty.INT_LIMIT; ++propNum) {
            if (skip && (propNum % 5 != 0)) {
                continue;
            }
            String propName = UCharacter.getPropertyName(propNum, NameChoice.LONG);
            final int intPropertyMinValue = UCharacter.getIntPropertyMinValue(propNum);
            int intPropertyMaxValue = UCharacter.getIntPropertyMaxValue(propNum);
            if (skip) { // only test first if not exhaustive
                intPropertyMaxValue = intPropertyMinValue;
            }
            for (int valueNum = intPropertyMinValue; valueNum <= intPropertyMaxValue; ++valueNum) {
                // hack for getting property value name
                String valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.LONG);
                if (valueName == null) {
                    valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.SHORT);
                    if (valueName == null) {
                        valueName = Integer.toString(valueNum);
                    }
                }
                temp.applyIntPropertyValue(propNum, valueNum);
                if (temp.size() == 0) {
                    continue;
                }
                final String prefix = "a";
                final String suffix = "b";
                String shouldMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix;
                temp.complement();
                String shouldNotMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix;

                // posix style pattern
                String rawPattern = prefix + "[:" + propName + "=" + valueName + ":]" + suffix;
                String rawNegativePattern = prefix + "[:^" + propName + "=" + valueName + ":]" + suffix;
                checkCharPattern(UnicodeRegex.compile(rawPattern), rawPattern, shouldMatch, shouldNotMatch);
View Full Code Here

     * @return A processed Java regex pattern, suitable for input to
     *         Pattern.compile().
     */
    public String transform(String regex) {
        StringBuffer result = new StringBuffer();
        UnicodeSet temp = new UnicodeSet();
        ParsePosition pos = new ParsePosition(0);
        int state = 0; // 1 = after \

        // We add each character unmodified to the output, unless we have a
        // UnicodeSet. Note that we don't worry about supplementary characters,
View Full Code Here

TOP

Related Classes of com.ibm.icu.text.UnicodeSet

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.