Source Code of client.net.sf.saxon.ce.functions.codenorm.UnicodeDataParserFromXML

package client.net.sf.saxon.ce.functions.codenorm;

import client.net.sf.saxon.ce.Configuration;
import client.net.sf.saxon.ce.om.Axis;
import client.net.sf.saxon.ce.om.DocumentInfo;
import client.net.sf.saxon.ce.om.NodeInfo;
import client.net.sf.saxon.ce.pattern.NodeKindTest;
import client.net.sf.saxon.ce.trans.XPathException;
import client.net.sf.saxon.ce.tree.iter.AxisIterator;
import client.net.sf.saxon.ce.tree.util.StringTokenizer;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
* This class reads the normalization data held in the XML document normalizationData.xml, and builds hash tables
* that can be used by the Unicode normalization routines. This operation is performed
* once only, the first time normalization is attempted after Saxon is loaded.
*/

class UnicodeDataParserFromXML {

    // This class is never instantiated
    private UnicodeDataParserFromXML(){}

    /**
     * Called exactly once by NormalizerData to build the static data
     */

    static NormalizerData build(Configuration config) throws XPathException {

        DocumentInfo doc = config.buildDocument("normalizationData.xml");
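        // The document is expected to contain six elements: CanonicalClassKeys/CanonicalClassValues
        // and DecompositionKeys/DecompositionValues (parallel, tokenized lists), plus
        // ExclusionList and CompatibilityList (tokenized lists of code points)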

        BitSet isExcluded = new BitSet(128000);
        BitSet isCompatibility = new BitSet(128000);

        NodeInfo canonicalClassKeys = null;
        NodeInfo canonicalClassValues = null;
        NodeInfo decompositionKeys = null;
        NodeInfo decompositionValues = null;

        AxisIterator iter = doc.iterateAxis(Axis.DESCENDANT, NodeKindTest.ELEMENT);
        while (true) {
            NodeInfo item = (NodeInfo)iter.next();
            if (item == null) {
                break;
            }
            if (item.getLocalPart().equals("CanonicalClassKeys")) {
                canonicalClassKeys = item;
            } else if (item.getLocalPart().equals("CanonicalClassValues")) {
                canonicalClassValues = item;
            } else if (item.getLocalPart().equals("DecompositionKeys")) {
                decompositionKeys = item;
            } else if (item.getLocalPart().equals("DecompositionValues")) {
                decompositionValues = item;
            } else if (item.getLocalPart().equals("ExclusionList")) {
                readExclusionList(item.getStringValue(), isExcluded);
            } else if (item.getLocalPart().equals("CompatibilityList")) {
                readCompatibilityList(item.getStringValue(), isCompatibility);
            }
        }

        Map<Integer, Integer> canonicalClass = new HashMap<Integer, Integer>(400);
        readCanonicalClassTable(canonicalClassKeys.getStringValue(), canonicalClassValues.getStringValue(), canonicalClass);


        Map<Integer, String> decompose = new HashMap<Integer, String>(18000);
        Map<Integer, Integer> compose = new HashMap<Integer, Integer>(15000);

        readDecompositionTable(decompositionKeys.getStringValue(), decompositionValues.getStringValue(),
                decompose, compose, isExcluded, isCompatibility);

        return new NormalizerData(canonicalClass, decompose, compose,
              isCompatibility, isExcluded);
    }

    /**
     * Reads exclusion list and stores the data
     */

    private static void readExclusionList(String s, BitSet isExcluded) {
        StringTokenizer st = new StringTokenizer(s);
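        // Each token is a code point written in base 32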
        while (st.hasMoreTokens()) {
            String tok = st.nextToken();
            int value = Integer.parseInt(tok, 32);
            isExcluded.set(value);
        }
    }

    /**
     * Reads compatibility list and stores the data
     */

    private static void readCompatibilityList(String s, BitSet isCompatible) {
        StringTokenizer st = new StringTokenizer(s);
        while (st.hasMoreTokens()) {
            String tok = st.nextToken();
            int value = Integer.parseInt(tok, 32);
            isCompatible.set(value);
        }
    }

    /**
     * Read canonical class table (mapping from character codes to their canonical class)
     */

    private static void readCanonicalClassTable(String keyString, String valueString, Map<Integer, Integer> canonicalClasses) {
        List<Integer> keys = new ArrayList<Integer>(5000);

        StringTokenizer st = new StringTokenizer(keyString);
        while (st.hasMoreTokens()) {
            String tok = st.nextToken();
            int value = Integer.parseInt(tok, 32);
            keys.add(Integer.valueOf(value));
        }

        int k = 0;
        st = new StringTokenizer(valueString);
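        // Values are run-length encoded: each token is either a base-32 class, or "repeat*class"
        // with a decimal repeat count; for example, "3*76" assigns canonical class 230
        // (base-32 "76") to the next three keys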
        while (st.hasMoreTokens()) {
            String tok = st.nextToken();
            int clss;
            int repeat = 1;
            int star = tok.indexOf('*');
            if (star < 0) {
                clss = Integer.parseInt(tok, 32);
            } else {
                repeat = Integer.parseInt(tok.substring(0, star));
                clss = Integer.parseInt(tok.substring(star+1), 32);
            }
            for (int i=0; i<repeat; i++) {
                canonicalClasses.put(keys.get(k++), clss);
            }
        }
    }

    /**
     * Read the decomposition table (mapping from character codes to their decompositions),
     * and build the composition table for canonical (non-compatibility, non-excluded) pairs
     */

    private static void readDecompositionTable(
            String decompositionKeyString, String decompositionValuesString,
            Map<Integer, String> decompose, Map<Integer, Integer> compose,
            BitSet isExcluded, BitSet isCompatibility) {
        int k = 0;

        List<String> values = new ArrayList<String>(1000);
        StringTokenizer st = new StringTokenizer(decompositionValuesString);
        while (st.hasMoreTokens()) {
            String tok = st.nextToken();
            String value = "";
            for (int c=0; c<tok.length();) {
                char h0 = tok.charAt(c++);
                char h1 = tok.charAt(c++);
                char h2 = tok.charAt(c++);
                char h3 = tok.charAt(c++);
                int code = ("0123456789abcdef".indexOf(h0)<<12) +
                     ("0123456789abcdef".indexOf(h1)<<8) +
                     ("0123456789abcdef".indexOf(h2)<<4) +
                     ("0123456789abcdef".indexOf(h3)); // was <<12
                value += (char)code;
            }
            values.add(value);
        }


        st = new StringTokenizer(decompositionKeyString);
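        // Keys are base-32 code points, paired positionally with the values decoded above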
        while (st.hasMoreTokens()) {
            String tok = st.nextToken();
            int key = Integer.parseInt(tok, 32);
            String value = values.get(k++);
            decompose.put(key, value);
            // only compositions are canonical pairs
            // skip if script exclusion

            if (!isCompatibility.get(key) && !isExcluded.get(key)) {
                char first = '\u0000';
                char second = value.charAt(0);
                if (value.length() > 1) {
                    first = second;
                    second = value.charAt(1);
                }

                // store composition pair in single integer
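                // (both halves are 16-bit code units, so the pair fits losslessly in one int)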

                int pair = (first << 16) | second;
                compose.put(pair, key);
            }
        }

        // Add algorithmic Hangul decompositions
        // This code fragment is copied from the normalization code published by the Unicode Consortium.
        // See module net.sf.saxon.serialize.codenorm.Normalizer for applicable copyright information.
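        // A syllable index decomposes as LV + T when TIndex != 0, otherwise as L + V,
        // following SIndex = (LIndex * VCount + VIndex) * TCount + TIndex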

        for (int SIndex = 0; SIndex < SCount; ++SIndex) {
            int TIndex = SIndex % TCount;
            char first, second;
            if (TIndex != 0) { // triple
                first = (char)(SBase + SIndex - TIndex);
                second = (char)(TBase + TIndex);
            } else {
                first = (char)(LBase + SIndex / NCount);
                second = (char)(VBase + (SIndex % NCount) / TCount);
            }
            int pair = (first << 16) | second;
            int key = SIndex + SBase;
            decompose.put(key, String.valueOf(first) + second);
            compose.put(pair, key);
        }
    }

    /**
     * Hangul composition constants
     */
    private static final int
        SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
        LCount = 19, VCount = 21, TCount = 28,
        NCount = VCount * TCount,   // 588
        SCount = LCount * NCount;   // 11172

}

// This class has its origins in the normalization software published
// by the Unicode Consortium.

// Modified by Michael Kay (Saxonica) to change the way in which the data files are stored.

// * Copyright (c) 1991-2005 Unicode, Inc.
// * For terms of use, see http://www.unicode.org/terms_of_use.html
// * For documentation, see UAX #15.
// * The Unicode Consortium makes no expressed or implied warranty of any
// * kind, and assumes no liability for errors or omissions.
// * No liability is assumed for incidental and consequential damages
// * in connection with or arising out of the use of the information here.