package org.pdf4j.saxon.codenorm;
import org.pdf4j.saxon.sort.IntHashMap;
import org.pdf4j.saxon.sort.IntToIntHashMap;
import org.pdf4j.saxon.sort.IntToIntMap;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.StringTokenizer;
/**
* This class reads the data compiled into class UnicodeData, and builds hash tables
* that can be used by the Unicode normalization routines. This operation is performed
* once only, the first time normalization is attempted after Saxon is loaded.
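* <p>As an illustrative sketch only (the accessor names are assumed from the
* NormalizerData class, not defined here), the tables built by this parser
* would typically be queried along these lines:</p>
* <pre>
*   NormalizerData data = UnicodeDataParser.build();
*   int ccc = data.getCanonicalClass(0x0301);
*       // combining class of U+0301 COMBINING ACUTE ACCENT (230)
*   int composite = data.getPairwiseComposition(0x0041, 0x0300);
*       // 0x00C0 LATIN CAPITAL LETTER A WITH GRAVE, or NOT_COMPOSITE
* </pre>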
*/
class UnicodeDataParser {
// This class is never instantiated
private UnicodeDataParser(){}
/**
* Called exactly once by NormalizerData to build the static data
*/
static NormalizerData build() {
IntToIntMap canonicalClass = new IntToIntHashMap(400);
canonicalClass.setDefaultValue(0);
IntHashMap decompose = new IntHashMap(18000);
IntToIntMap compose = new IntToIntHashMap(15000);
compose.setDefaultValue(NormalizerData.NOT_COMPOSITE);
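// a lookup of any pair never stored in the compose map therefore yields NOT_COMPOSITE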
BitSet isCompatibility = new BitSet(128000);
BitSet isExcluded = new BitSet(128000);
readExclusionList(isExcluded);
readCompatibilityList(isCompatibility);
readCanonicalClassTable(canonicalClass);
readDecompositionTable(decompose, compose, isExcluded, isCompatibility);
return new NormalizerData(canonicalClass, decompose, compose,
isCompatibility, isExcluded);
}
/**
* Reads the composition exclusion list and stores the data
*/
private static void readExclusionList(BitSet isExcluded) {
for (int i=0; i<UnicodeData.exclusionList.length; i++) {
String s = UnicodeData.exclusionList[i];
StringTokenizer st = new StringTokenizer(s, ",");
while (st.hasMoreTokens()) {
String tok = st.nextToken();
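// each token is a character code written in base 32, e.g. "100" denotes 1*32*32 = 1024 (0x400)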
int value = Integer.parseInt(tok, 32);
isExcluded.set(value);
}
}
}
/**
* Reads the compatibility list and stores the data
*/
private static void readCompatibilityList(BitSet isCompatible) {
for (int i=0; i<UnicodeData.compatibilityList.length; i++) {
String s = UnicodeData.compatibilityList[i];
StringTokenizer st = new StringTokenizer(s, ",");
while (st.hasMoreTokens()) {
String tok = st.nextToken();
int value = Integer.parseInt(tok, 32);
isCompatible.set(value);
}
}
}
/**
* Read canonical class table (mapping from character codes to their canonical class)
*/
private static void readCanonicalClassTable(IntToIntMap canonicalClasses) {
ArrayList keys = new ArrayList(5000);
for (int i=0; i<UnicodeData.canonicalClassKeys.length; i++) {
String s = UnicodeData.canonicalClassKeys[i];
StringTokenizer st = new StringTokenizer(s, ",");
while (st.hasMoreTokens()) {
String tok = st.nextToken();
int value = Integer.parseInt(tok, 32);
keys.add(new Integer(value));
}
}
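// the value lists below run in parallel with the keys just collected: keys.get(n) is the
// character code whose canonical combining class is the n-th value (for example,
// U+0300 COMBINING GRAVE ACCENT has combining class 230)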
int k = 0;
for (int i=0; i<UnicodeData.canonicalClassValues.length; i++) {
String s = UnicodeData.canonicalClassValues[i];
StringTokenizer st = new StringTokenizer(s, ",");
while (st.hasMoreTokens()) {
String tok = st.nextToken();
int clss = Integer.parseInt(tok, 32);
canonicalClasses.put(((Integer)keys.get(k++)).intValue(), clss);
}
}
}
/**
* Read the decomposition table (mapping from character codes to their decompositions),
* and build the composition table from the canonical, non-excluded pairs
*/
private static void readDecompositionTable(IntHashMap decompose, IntToIntMap compose,
BitSet isExcluded, BitSet isCompatibility) {
int k = 0;
for (int i=0; i<UnicodeData.decompositionKeys.length; i++) {
String s = UnicodeData.decompositionKeys[i];
StringTokenizer st = new StringTokenizer(s, ",");
while (st.hasMoreTokens()) {
String tok = st.nextToken();
int key = Integer.parseInt(tok, 32);
String value = UnicodeData.decompositionValues[k++];
decompose.put(key, value);
// only canonical (non-compatibility) decompositions give rise to composition pairs;
// characters on the composition exclusion list are also skipped
if (!isCompatibility.get(key) && !isExcluded.get(key)) {
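// a one-character (singleton) decomposition leaves first as NUL, so the
// packed pair below is simply the single character code itself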
char first = '\u0000';
char second = value.charAt(0);
if (value.length() > 1) {
first = second;
second = value.charAt(1);
}
// store composition pair in single integer
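// e.g. first = U+0041 and second = U+0300 pack to 0x00410300, which maps back to the key U+00C0 (A with grave)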
int pair = (first << 16) | second;
compose.put(pair, key);
}
}
}
// Add algorithmic Hangul decompositions
// This code fragment is copied from the normalization code published by the Unicode Consortium.
// See module org.pdf4j.saxon.codenorm.Normalizer for applicable copyright information.
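// Worked example: for SIndex = 1 (syllable U+AC01), TIndex = 1, giving the pairwise
// decomposition first = U+AC00, second = U+11A8 (trailing consonant); for SIndex = 0
// (U+AC00 itself), first = U+1100 (leading consonant) and second = U+1161 (vowel).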
for (int SIndex = 0; SIndex < SCount; ++SIndex) {
int TIndex = SIndex % TCount;
char first, second;
if (TIndex != 0) { // triple
first = (char)(SBase + SIndex - TIndex);
second = (char)(TBase + TIndex);
} else {
first = (char)(LBase + SIndex / NCount);
second = (char)(VBase + (SIndex % NCount) / TCount);
}
int pair = (first << 16) | second;
int key = SIndex + SBase;
decompose.put(key, String.valueOf(first) + second);
compose.put(pair, key);
}
}
/**
* Hangul composition constants
*/
private static final int
SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
LCount = 19, VCount = 21, TCount = 28,
NCount = VCount * TCount, // 588
SCount = LCount * NCount; // 11172
// end of Unicode consortium code
}
//
// The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
// you may not use this file except in compliance with the License. You may obtain a copy of the
// License at http://www.mozilla.org/MPL/
//
// Software distributed under the License is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the License for the specific language governing rights and limitations under the License.
//
// The Original Code is: all this file.
//
// The Initial Developer of the Original Code is Michael H. Kay.
//
// The code for generating Hangul decompositions is Copyright (C) Unicode, Inc. All Rights Reserved.
// See statement below.
//
// Contributor(s): none.
//
// * Copyright (c) 1991-2005 Unicode, Inc.
// * For terms of use, see http://www.unicode.org/terms_of_use.html
// * For documentation, see UAX#15.
// * The Unicode Consortium makes no expressed or implied warranty of any
// * kind, and assumes no liability for errors or omissions.
// * No liability is assumed for incidental and consequential damages
// * in connection with or arising out of the use of the information here.