Package com.ibm.icu.dev.tool.layout

Source Code of com.ibm.icu.dev.tool.layout.CanonGSUBBuilder

/**
*******************************************************************************
* Copyright (C) 2002-2008, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*/


package com.ibm.icu.dev.tool.layout;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UTF16;

/**
* @author Eric Mader
*
* Notes:
*
* The property \p{Decomposition_Type=Canonical} will match all characters with a canonical
* decomposition.
*
* So "[[\\p{Latin}\\p{Greek}\\p{Cyrillic}] & [\\p{Decomposition_Type=Canonical}]]"
* will match all Latin, Greek and Cyrillic characters with a canonical decomposition.
*
* Are these three scripts enough? Do we want to collect them all at once and distribute by script,
* or process them one script at a time? It's probably a good idea to build a single table for
* however many scripts there are.
*
* It might be better to collect all the characters that have a canonical decomposition and just
* sort them into however many scripts there are... unless we'll get characters in COMMON???
*/
public class CanonGSUBBuilder
{
    static public String convertArabicString(int type, int ligature, String decomp, ClassTable isolClassTable)
    {
        int leftType  = ArabicShaping.VALUE_NONE;
        int rightType = ArabicShaping.VALUE_NONE;
       
        switch (type) {
            case UCharacter.DecompositionType.ISOLATED:
                break;
               
            case UCharacter.DecompositionType.FINAL:
                rightType = ArabicShaping.VALUE_LEFT;
                break;
           
            case UCharacter.DecompositionType.INITIAL:
                leftType = ArabicShaping.VALUE_RIGHT;
                break;
           
            case UCharacter.DecompositionType.MEDIAL:
               rightType = ArabicShaping.VALUE_LEFT;
               leftType  = ArabicShaping.VALUE_RIGHT;
               break;
              
           default:
               return decomp + UCharacter.toString(ligature);
        }
       
        char[] chars = decomp.toCharArray();
             
        ArabicShaping.shape(chars, leftType, rightType, isolClassTable);
 
        return new String(chars) + UCharacter.toString(ligature);
    }
   
    static void buildArabicContextualForms(ArabicCharacterData data, ClassTable initClassTable, ClassTable mediClassTable,
                                     ClassTable finaClassTable, ClassTable isolClassTable)
    {
        System.out.print("Finding Arabic contextual forms... ");
       
        for (int i = 0; i < data.countRecords(); i += 1) {
            ArabicCharacterData.Record record = data.getRecord(i);
            String decomposition = record.getDecomposition();
           
            if (decomposition != null && decomposition.length() == 1) {
                int contextual = record.getCodePoint();
                int isolated   = UTF16.charAt(record.getDecomposition(), 0);
           
                switch (record.getDecompositionType()) {
                case UCharacter.DecompositionType.INITIAL:
                    initClassTable.addMapping(isolated, contextual);
                    break;
               
                case UCharacter.DecompositionType.MEDIAL:
                    mediClassTable.addMapping(isolated, contextual);
                    break;
               
               case UCharacter.DecompositionType.FINAL:
                   finaClassTable.addMapping(isolated, contextual);
                   break;
                  
               case UCharacter.DecompositionType.ISOLATED:
                   isolClassTable.addMapping(isolated, contextual);
                   break;
              
               default:
                   // issue some error message?
                   break;
                }
            }
        }
       
        System.out.println("Done.");
    }

    static LigatureTree buildArabicLigatureTree(ArabicCharacterData data, ClassTable isolClassTable)
    {
        LigatureTree contextualTree = new LigatureTree();
        int ligatureCount = 0;
       
        System.out.print("Building Arabic ligature tree... ");
       
        for (int i = 0; i < data.countRecords(); i += 1) {
            ArabicCharacterData.Record record = data.getRecord(i);
            String decomposition = record.getDecomposition();
           
            if (decomposition != null && decomposition.length() > 1) {
                int ligature   = record.getCodePoint();
                int decompType = record.getDecompositionType();
               
                switch (decompType) {
                case UCharacter.DecompositionType.FINAL:
                case UCharacter.DecompositionType.INITIAL:
                case UCharacter.DecompositionType.MEDIAL:
                case UCharacter.DecompositionType.ISOLATED:
                    contextualTree.insert(convertArabicString(decompType, ligature, decomposition, isolClassTable));
                    ligatureCount += 1;
                    break;
                   
                case UCharacter.DecompositionType.CANONICAL:
                    //cannonicalTree.insert(decomposition + UCharacter.toString(ligature));
                    break;
                }
            }
        }
       
        System.out.println(ligatureCount + " ligatures.");
       
        return contextualTree;
    }
   
    static final int SIMPLE_GLYPH = 1;
    static final int LIGATURE_GLYPH = 2;
    static final int MARK_GLYPH = 3;
    static final int COMPONENT_GLYPH = 4;
   
    static final int categoryClassMap[] = {
    0,              // UNASSIGNED
    SIMPLE_GLYPH,   // UPPERCASE_LETTER
    SIMPLE_GLYPH,   // LOWERCASE_LETTER
    SIMPLE_GLYPH,   // TITLECASE_LETTER
    SIMPLE_GLYPH,   // MODIFIER_LETTER
    SIMPLE_GLYPH,   // OTHER_LETTER
    MARK_GLYPH,     // NON_SPACING_MARK
    MARK_GLYPH,     // ENCLOSING_MARK ??
    MARK_GLYPH,     // COMBINING_SPACING_MARK ??
    SIMPLE_GLYPH,   // DECIMAL_NUMBER
    SIMPLE_GLYPH,   // LETTER_NUMBER
    SIMPLE_GLYPH,   // OTHER_NUMBER;
    0,              // SPACE_SEPARATOR
    0,              // LINE_SEPARATOR
    0,              // PARAGRAPH_SEPARATOR
    0,              // CONTROL
    0,              // FORMAT
    0,              // PRIVATE_USE
    0,              // SURROGATE
    SIMPLE_GLYPH,   // DASH_PUNCTUATION
    SIMPLE_GLYPH,   // START_PUNCTUATION
    SIMPLE_GLYPH,   // END_PUNCTUATION
    SIMPLE_GLYPH,   // CONNECTOR_PUNCTUATION
    SIMPLE_GLYPH,   // OTHER_PUNCTUATION
    SIMPLE_GLYPH,   // MATH_SYMBOL;
    SIMPLE_GLYPH,   // CURRENCY_SYMBOL
    SIMPLE_GLYPH,   // MODIFIER_SYMBOL
    SIMPLE_GLYPH,   // OTHER_SYMBOL
    SIMPLE_GLYPH,   // INITIAL_PUNCTUATION
    SIMPLE_GLYPH    // FINAL_PUNCTUATION
    };

    static int getGlyphClass(ArabicCharacterData.Record record)
    {
        String decomp = record.getDecomposition();
       
        if (decomp != null && decomp.length() > 1) {
            return LIGATURE_GLYPH;
        }
       
        return categoryClassMap[record.getGeneralCategory()];
    }
   
    static void addArabicGlyphClasses(ArabicCharacterData data, ClassTable classTable)
    {
        System.out.print("Adding Arabic glyph classes... ");
       
        for (int i = 0; i < data.countRecords(); i += 1) {
            ArabicCharacterData.Record record = data.getRecord(i);
            classTable.addMapping(record.getCodePoint(), getGlyphClass(record));
        }
       
        System.out.println("Done.");
    }
   
    private static void buildArabicTables(ScriptList scriptList, FeatureList featureList,
                                                LookupList lookupList, ClassTable classTable) {
        // TODO: Might want to have the ligature table builder explicitly check for ligatures
        // which start with space and tatweel rather than pulling them out here...
        UnicodeSet arabicBlock   = new UnicodeSet("[[\\p{block=Arabic}] & [[:Cf:][:Po:][:So:][:Mn:][:Nd:][:Lm:]]]");
        UnicodeSet oddLigatures  = new UnicodeSet("[\\uFC5E-\\uFC63\\uFCF2-\\uFCF4\\uFE70-\\uFE7F]");
        UnicodeSet arabicLetters = new UnicodeSet("[\\p{Arabic}]");
        ArabicCharacterData arabicData = ArabicCharacterData.factory(arabicLetters.addAll(arabicBlock).removeAll(oddLigatures));

        addArabicGlyphClasses(arabicData, classTable);
       
        ClassTable initClassTable = new ClassTable();
        ClassTable mediClassTable = new ClassTable();
        ClassTable finaClassTable = new ClassTable();
        ClassTable isolClassTable = new ClassTable();
       
        buildArabicContextualForms(arabicData, initClassTable, mediClassTable, finaClassTable, isolClassTable);
        isolClassTable.snapshot();
        LigatureTree ligaTree = buildArabicLigatureTree(arabicData, isolClassTable);

        LigatureTreeWalker ligaWalker = new LigatureTreeWalker();

        ligaTree.walk(ligaWalker);
       
        Lookup initLookup, mediLookup, finaLookup, ligaLookup;
       
        initLookup = new Lookup(Lookup.GSST_Single, 0);
        initLookup.addSubtable(initClassTable);
       
        mediLookup = new Lookup(Lookup.GSST_Single, 0);
        mediLookup.addSubtable(mediClassTable);
       
        finaLookup = new Lookup(Lookup.GSST_Single, 0);
        finaLookup.addSubtable(finaClassTable);
       
        ligaLookup = new Lookup(Lookup.GSST_Ligature, Lookup.LF_IgnoreMarks);
        ligaLookup.addSubtable(ligaWalker);
       
        Feature init = new Feature("init");
        Feature medi = new Feature("medi");
        Feature fina = new Feature("fina");
        Feature liga = new Feature("liga");
       
        init.addLookup(lookupList.addLookup(initLookup));
        medi.addLookup(lookupList.addLookup(mediLookup));
        fina.addLookup(lookupList.addLookup(finaLookup));
        liga.addLookup(lookupList.addLookup(ligaLookup));
       
        featureList.addFeature(init);
        featureList.addFeature(medi);
        featureList.addFeature(fina);
        featureList.addFeature(liga);
       
        scriptList.addFeature("arab", "(default)", init);
        scriptList.addFeature("arab", "(default)", medi);
        scriptList.addFeature("arab", "(default)", fina);
        scriptList.addFeature("arab", "(default)", liga);
       
        System.out.println();
    }

    public static void buildLigatureTree(CanonicalCharacterData data, int script, LigatureTree ligatureTree)
    {
        int ligatureCount = 0;
       
        System.out.print("building composition ligature tree for " + UScript.getName(script) + "... ");
       
        for (int i = 0; i < data.countRecords(script); i += 1) {
            CanonicalCharacterData.Record record = data.getRecord(script, i);
            String composed = UCharacter.toString(record.getComposedCharacter());
           
            for (int e = 0; e < record.countEquivalents(); e += 1) {
                String equivalent = record.getEquivalent(e);
               
                ligatureTree.insert(equivalent + composed);
                ligatureCount += 1;
            }
        }
       
        System.out.println(ligatureCount + " ligatures.");
    }
   
    public static DecompTable[] buildDecompTables(CanonicalCharacterData data, int script)
    {
        int maxDecompCount = data.getMaxEquivalents(script);
        DecompTable[] decompTables = new DecompTable[maxDecompCount];
       
        System.out.print("Building decompositon tables for " + UScript.getName(script) +
                         "... total decompositions: " + data.countRecords(script) +
                         ", max: " + maxDecompCount + "...");
       
        for (int i = 0; i < maxDecompCount; i += 1) {
            DecompTable table = new DecompTable();
           
            for (int r = 0; r < data.countRecords(script); r += 1) {
                CanonicalCharacterData.Record record = data.getRecord(script, r);
               
                if (record.countEquivalents() > i) {
                    table.add(record.getComposedCharacter(), record.getEquivalent(i));
                }
            }
           
            decompTables[i] = table;
        }
       
        System.out.println(" Done.");
       
        return decompTables;
    }
   
    public static int[] buildLookups(CanonicalCharacterData data, LookupList lookupList, int script)
    {
        int[] lookups = new int[2];
       
        DecompTable[] decompTables = buildDecompTables(data, script);
       
        LigatureTree compTree = new LigatureTree();
       
        buildLigatureTree(data, script, compTree);
       
        System.out.println();
       
        LigatureTreeWalker compWalker = new LigatureTreeWalker();
       
        compTree.walk(compWalker);
       
        Lookup compLookup, dcmpLookup;
        //int compLookupIndex, dcmpLookupIndex;
       
        compLookup = new Lookup(Lookup.GSST_Ligature, 0);
        compLookup.addSubtable(compWalker);
       
        dcmpLookup = new Lookup(Lookup.GSST_Multiple, 0);
        for (int i = 0; i < decompTables.length; i += 1) {
            dcmpLookup.addSubtable(decompTables[i]);
        }
       
        lookups[0] = lookupList.addLookup(compLookup);
        lookups[1] = lookupList.addLookup(dcmpLookup);
       
        return lookups;
    }
   
    public static void addLookups(Feature feature, int[] lookups)
    {
        for (int i = 0; i < lookups.length; i += 1) {
            feature.addLookup(lookups[i]);
        }
    }
   
    /*
     * Hebrew mark order taken from the SBL Hebrew Font manual
     * Arabic mark order per Thomas Milo: hamza < shadda < combining_alef < sukun, vowel_marks < madda < qur'anic_marks
     */
    public static ClassTable buildCombiningClassTable()
    {
        UnicodeSet markSet = new UnicodeSet("[\\P{CanonicalCombiningClass=0}]");
        ClassTable exceptions = new ClassTable();
        ClassTable combiningClasses = new ClassTable();
        int markCount = markSet.size();
       
        exceptions.addMapping(0x05C110); // Point Shin Dot
        exceptions.addMapping(0x05C211); // Point Sin Dot
        exceptions.addMapping(0x05BC21); // Point Dagesh or Mapiq
        exceptions.addMapping(0x05BF23); // Point Rafe
        exceptions.addMapping(0x05B927); // Point Holam
        exceptions.addMapping(0x0323, 220); // Comb. Dot Below (low punctum)
        exceptions.addMapping(0x0591, 220); // Accent Etnahta
        exceptions.addMapping(0x0596, 220); // Accent Tipeha
        exceptions.addMapping(0x059B, 220); // Accent Tevir
        exceptions.addMapping(0x05A3, 220); // Accent Munah
        exceptions.addMapping(0x05A4, 220); // Accent Mahapakh
        exceptions.addMapping(0x05A5, 220); // Accent Merkha
        exceptions.addMapping(0x05A6, 220); // Accent Merkha Kefula
        exceptions.addMapping(0x05A7, 220); // Accent Darga
        exceptions.addMapping(0x05AA, 220); // Accent Yerah Ben Yomo
        exceptions.addMapping(0x05B0, 220); // Point Sheva
        exceptions.addMapping(0x05B1, 220); // Point Hataf Segol
        exceptions.addMapping(0x05B2, 220); // Point Hataf Patah
        exceptions.addMapping(0x05B3, 220); // Point Hataf Qamats
        exceptions.addMapping(0x05B4, 220); // Point Hiriq
        exceptions.addMapping(0x05B5, 220); // Point Tsere
        exceptions.addMapping(0x05B6, 220); // Point Segol
        exceptions.addMapping(0x05B7, 220); // Point Patah
        exceptions.addMapping(0x05B8, 220); // Point Qamats
        exceptions.addMapping(0x05BB, 220); // Point Qubuts
        exceptions.addMapping(0x05BD, 220); // Point Meteg
        exceptions.addMapping(0x059A, 222); // Accent Yetiv
        exceptions.addMapping(0x05AD, 222); // Accent Dehi
        exceptions.addMapping(0x05C4, 230); // Mark Upper Dot (high punctum)
        exceptions.addMapping(0x0593, 230); // Accent Shalshelet
        exceptions.addMapping(0x0594, 230); // Accent Zaqef Qatan
        exceptions.addMapping(0x0595, 230); // Accent Zaqef Gadol
        exceptions.addMapping(0x0597, 230); // Accent Revia
        exceptions.addMapping(0x0598, 230); // Accent Zarqa
        exceptions.addMapping(0x059F, 230); // Accent Qarney Para
        exceptions.addMapping(0x059E, 230); // Accent Gershayim
        exceptions.addMapping(0x059D, 230); // Accent Geresh Muqdam
        exceptions.addMapping(0x059C, 230); // Accent Geresh
        exceptions.addMapping(0x0592, 230); // Accent Segolta
        exceptions.addMapping(0x05A0, 230); // Accent Telisha Gedola
        exceptions.addMapping(0x05AC, 230); // Accent Iluy
        exceptions.addMapping(0x05A8, 230); // Accent Qadma
        exceptions.addMapping(0x05AB, 230); // Accent Ole
        exceptions.addMapping(0x05AF, 230); // Mark Masora Circle
        exceptions.addMapping(0x05A1, 230); // Accent Pazer
      //exceptions.addMapping(0x0307, 230); // Mark Number/Masora Dot
        exceptions.addMapping(0x05AE, 232); // Accent Zinor
        exceptions.addMapping(0x05A9, 232); // Accent Telisha Qetana
        exceptions.addMapping(0x0599, 232); // Accent Pashta
       
        exceptions.addMapping(0x065527); // ARABIC HAMZA BELOW
        exceptions.addMapping(0x065427); // ARABIC HAMZA ABOVE

        exceptions.addMapping(0x065128); // ARABIC SHADDA

        exceptions.addMapping(0x065629); // ARABIC SUBSCRIPT ALEF
        exceptions.addMapping(0x067029); // ARABIC LETTER SUPERSCRIPT ALEF

        exceptions.addMapping(0x064D30); // ARABIC KASRATAN
        exceptions.addMapping(0x065030); // ARABIC KASRA

        exceptions.addMapping(0x065231); // ARABIC SUKUN
        exceptions.addMapping(0x06E131); // ARABIC SMALL HIGH DOTLESS HEAD OF KHAH

        exceptions.addMapping(0x064B31); // ARABIC FATHATAN
        exceptions.addMapping(0x064C31); // ARABIC DAMMATAN
        exceptions.addMapping(0x064E31); // ARABIC FATHA
        exceptions.addMapping(0x064F31); // ARABIC DAMMA
        exceptions.addMapping(0x065731); // ARABIC INVERTED DAMMA
        exceptions.addMapping(0x065831); // ARABIC MARK NOON GHUNNA

        exceptions.addMapping(0x065332); // ARABIC MADDAH ABOVE
       
        exceptions.snapshot();
       
        for (int i = 0; i < markCount; i += 1) {
            int mark = markSet.charAt(i);
            int markClass = exceptions.getGlyphClassID(mark);
           
            if (markClass == 0) {
                markClass = UCharacter.getCombiningClass(mark);
            }
           
            combiningClasses.addMapping(mark, markClass);
        }
       
        combiningClasses.snapshot();
        return combiningClasses;
    }
   
    public static void buildDecompTables(String fileName)
    {
        // F900 - FAFF are compatibility ideographs. They all decompose to a single other character, and can be ignored.
      //UnicodeSet decompSet = new UnicodeSet("[[[\\P{Hangul}] & [\\p{DecompositionType=Canonical}]] - [\uF900-\uFAFF]]");
        UnicodeSet decompSet = new UnicodeSet("[[\\p{DecompositionType=Canonical}] & [\\P{FullCompositionExclusion}] & [\\P{Hangul}]]");
        CanonicalCharacterData data = CanonicalCharacterData.factory(decompSet);
        ClassTable classTable = new ClassTable();
       
        LookupList  lookupList  = new LookupList();
        FeatureList featureList = new FeatureList();
        ScriptList  scriptList  = new ScriptList();

        // build common, inherited lookups...
//        int[] commonLookups = buildLookups(data, lookupList, UScript.COMMON);
//        int[] inheritedLookups = buildLookups(data, lookupList, UScript.INHERITED);
       
        for (int script = 0; script < UScript.CODE_LIMIT; script += 1) {
           
            // This is a bit lame, but it's the only way I can think of
            // to make this work w/o knowing the values of COMMON and INHERITED...
            if (script == UScript.COMMON || script == UScript.INHERITED ||
                data.getMaxEquivalents(script) == 0) {
                continue;
            }
           
            int[] lookups = buildLookups(data, lookupList, script);

            Feature ccmp = new Feature("ccmp");
           
            addLookups(ccmp, lookups);
//            addLookups(ccmp, commonLookups);
//            addLookups(ccmp, inheritedLookups);
           
            featureList.addFeature(ccmp);
       
            String scriptTag = TagUtilities.tagLabel(UScript.getShortName(script));
           
            scriptList.addFeature(scriptTag, "(default)", ccmp);
           
            if (script == UScript.ARABIC) {
                buildArabicTables(scriptList, featureList, lookupList, classTable);
            }
        }
       
        featureList.finalizeFeatureList();
       
        ClassTable markClassTable = buildCombiningClassTable();
       
        GSUBWriter gsubWriter = new GSUBWriter("Canon", scriptList, featureList, lookupList);
        GDEFWriter gdefWriter = new GDEFWriter("Canon", classTable, markClassTable);
        String[] includeFiles = {"LETypes.h", "CanonShaping.h"};       
       
        LigatureModuleWriter writer = new LigatureModuleWriter();
       
        writer.openFile(fileName);
        writer.writeHeader(null, includeFiles);
        writer.writeTable(gsubWriter);
        writer.writeTable(gdefWriter);
        writer.writeTrailer();
        writer.closeFile();
    }
   
    public static void main(String[] args)
    {
        buildDecompTables(args[0]);
    }
}
TOP

Related Classes of com.ibm.icu.dev.tool.layout.CanonGSUBBuilder

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.