Home | History | Annotate | Download | only in gennorm
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 1999-2008, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  store.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2001may25
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Store Unicode normalization data in a memory-mappable file.
     17 */
     18 
     19 #include <stdio.h>
     20 #include <stdlib.h>
     21 #include "unicode/utypes.h"
     22 #include "unicode/uchar.h"
     23 #include "unicode/ustring.h"
     24 #include "cmemory.h"
     25 #include "cstring.h"
     26 #include "filestrm.h"
     27 #include "unicode/udata.h"
     28 #include "utrie.h"
     29 #include "utrie2.h"
     30 #include "unicode/uset.h"
     31 #include "toolutil.h"
     32 #include "unewdata.h"
     33 #include "writesrc.h"
     34 #include "unormimp.h"
     35 #include "gennorm.h"
     36 
     37 #define DO_DEBUG_OUT 0
     38 
     39 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     40 
     41 /*
     42  * The new implementation of the normalization code loads its data from
     43  * unorm.icu, which is generated with this gennorm tool.
     44  * The format of that file is described in unormimp.h .
     45  */
     46 
     47 /* file data ---------------------------------------------------------------- */
     48 
     49 #if UCONFIG_NO_NORMALIZATION
     50 
     51 /* dummy UDataInfo cf. udata.h */
     52 static UDataInfo dataInfo = {
     53     sizeof(UDataInfo),
     54     0,
     55 
     56     U_IS_BIG_ENDIAN,
     57     U_CHARSET_FAMILY,
     58     U_SIZEOF_UCHAR,
     59     0,
     60 
     61     { 0, 0, 0, 0 },                 /* dummy dataFormat */
     62     { 0, 0, 0, 0 },                 /* dummy formatVersion */
     63     { 0, 0, 0, 0 }                  /* dummy dataVersion */
     64 };
     65 
     66 #else
     67 
     68 /* UDataInfo cf. udata.h */
     69 static UDataInfo dataInfo={
     70     sizeof(UDataInfo),
     71     0,
     72 
     73     U_IS_BIG_ENDIAN,
     74     U_CHARSET_FAMILY,
     75     U_SIZEOF_UCHAR,
     76     0,
     77 
     78     { 0x4e, 0x6f, 0x72, 0x6d },   /* dataFormat="Norm" */
     79     { 2, 3, UTRIE_SHIFT, UTRIE_INDEX_SHIFT },   /* formatVersion */
     80     { 3, 2, 0, 0 }                /* dataVersion (Unicode version) */
     81 };
     82 
     83 extern void
     84 setUnicodeVersion(const char *v) {
     85     UVersionInfo version;
     86     u_versionFromString(version, v);
     87     uprv_memcpy(dataInfo.dataVersion, version, 4);
     88 }
     89 
     90 static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
     91 
     92 /* builder data ------------------------------------------------------------- */
     93 
     94 /* modularization flags, see gennorm.h (default to "store everything") */
     95 uint32_t gStoreFlags=0xffffffff;
     96 
     97 typedef void EnumTrieFn(void *context, uint32_t code, Norm *norm);
     98 
     99 static UNewTrie
    100     *normTrie,
    101     *norm32Trie,
    102     *fcdTrie,
    103     *auxTrie;
    104 
    105 static UToolMemory *normMem, *utf32Mem, *extraMem, *combiningTriplesMem;
    106 
    107 static Norm *norms;
    108 
    109 /*
    110  * set a flag for each code point that was seen in decompositions -
    111  * avoid to decompose ones that have not been used before
    112  */
    113 static uint32_t haveSeenFlags[256];
    114 
    115 /* set of characters with NFD_QC=No (i.e., those with canonical decompositions) */
    116 static USet *nfdQCNoSet;
    117 
    118 /* see addCombiningCP() for details */
    119 static uint32_t combiningCPs[2000];
    120 
    121 /*
    122  * after processCombining() this contains for each code point in combiningCPs[]
    123  * the runtime combining index
    124  */
    125 static uint16_t combiningIndexes[2000];
    126 
    127 /* section limits for combiningCPs[], see addCombiningCP() */
    128 static uint16_t combineFwdTop=0, combineBothTop=0, combineBackTop=0;
    129 
    130 /**
    131  * Structure for a triple of code points, stored in combiningTriplesMem.
    132  * The lead and trail code points combine into the the combined one,
    133  * i.e., there is a canonical decomposition of combined-> <lead, trail>.
    134  *
    135  * Before processCombining() is called, leadIndex and trailIndex are 0.
    136  * After processCombining(), they contain the indexes of the lead and trail
    137  * code point in the combiningCPs[] array.
    138  * They are then sorted by leadIndex, then trailIndex.
    139  * They are not sorted by code points.
    140  */
    141 typedef struct CombiningTriple {
    142     uint16_t leadIndex, trailIndex;
    143     uint32_t lead, trail, combined;
    144 } CombiningTriple;
    145 
    146 /* 15b in the combining index -> <=0x8000 uint16_t values in the combining table */
    147 static uint16_t combiningTable[0x8000];
    148 static uint16_t combiningTableTop=0;
    149 
    150 #define _NORM_MAX_SET_SEARCH_TABLE_LENGTH 0x4000
    151 static uint16_t canonStartSets[_NORM_MAX_CANON_SETS+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH
    152                                +10000]; /* +10000 for exclusion sets */
    153 static int32_t canonStartSetsTop=_NORM_SET_INDEX_TOP;
    154 static int32_t canonSetsCount=0;
    155 
    156 /* allocate and initialize a Norm unit */
    157 static Norm *
    158 allocNorm() {
    159     /* allocate Norm */
    160     Norm *p=(Norm *)utm_alloc(normMem);
    161     /*
    162      * The combiningIndex must not be initialized to 0 because 0 is the
    163      * combiningIndex of the first forward-combining character.
    164      */
    165     p->combiningIndex=0xffff;
    166     return p;
    167 }
    168 
    169 extern void
    170 init() {
    171     uint16_t *p16;
    172 
    173     normTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
    174     uprv_memset(normTrie, 0, sizeof(UNewTrie));
    175     norm32Trie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
    176     uprv_memset(norm32Trie, 0, sizeof(UNewTrie));
    177     fcdTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
    178     uprv_memset(fcdTrie, 0, sizeof(UNewTrie));
    179     auxTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
    180     uprv_memset(auxTrie, 0, sizeof(UNewTrie));
    181 
    182     /* initialize the two tries */
    183     if(NULL==utrie_open(normTrie, NULL, 30000, 0, 0, FALSE)) {
    184         fprintf(stderr, "error: failed to initialize tries\n");
    185         exit(U_MEMORY_ALLOCATION_ERROR);
    186     }
    187 
    188     /* allocate Norm structures and reset the first one */
    189     normMem=utm_open("gennorm normalization structs", 20000, 20000, sizeof(Norm));
    190     norms=allocNorm();
    191 
    192     /* allocate UTF-32 string memory */
    193     utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 30000, 4);
    194 
    195     /* reset all "have seen" flags */
    196     uprv_memset(haveSeenFlags, 0, sizeof(haveSeenFlags));
    197 
    198     /* open an empty set */
    199     nfdQCNoSet=uset_open(1, 0);
    200 
    201     /* allocate extra data memory for UTF-16 decomposition strings and other values */
    202     extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, _NORM_EXTRA_INDEX_TOP, 2);
    203     /* initialize the extraMem counter for the top of FNC strings */
    204     p16=(uint16_t *)utm_alloc(extraMem);
    205     *p16=1;
    206 
    207     /* allocate temporary memory for combining triples */
    208     combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, 0x4000, sizeof(CombiningTriple));
    209 
    210     /* set the minimum code points for no/maybe quick check values to the end of the BMP */
    211     indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=0xffff;
    212     indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=0xffff;
    213     indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=0xffff;
    214     indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=0xffff;
    215 
    216     /* preset the indexes portion of canonStartSets */
    217     uprv_memset(canonStartSets, 0, _NORM_SET_INDEX_TOP*2);
    218 }
    219 
    220 /*
    221  * get or create a Norm unit;
    222  * get or create the intermediate trie entries for it as well
    223  */
    224 static Norm *
    225 createNorm(uint32_t code) {
    226     Norm *p;
    227     uint32_t i;
    228 
    229     i=utrie_get32(normTrie, (UChar32)code, NULL);
    230     if(i!=0) {
    231         p=norms+i;
    232     } else {
    233         /* allocate Norm */
    234         p=allocNorm();
    235         if(!utrie_set32(normTrie, (UChar32)code, (uint32_t)(p-norms))) {
    236             fprintf(stderr, "error: too many normalization entries\n");
    237             exit(U_BUFFER_OVERFLOW_ERROR);
    238         }
    239     }
    240     return p;
    241 }
    242 
    243 /* get an existing Norm unit */
    244 static Norm *
    245 getNorm(uint32_t code) {
    246     uint32_t i;
    247 
    248     i=utrie_get32(normTrie, (UChar32)code, NULL);
    249     if(i==0) {
    250         return NULL;
    251     }
    252     return norms+i;
    253 }
    254 
    255 /* get the canonical combining class of a character */
    256 static uint8_t
    257 getCCFromCP(uint32_t code) {
    258     Norm *norm=getNorm(code);
    259     if(norm==NULL) {
    260         return 0;
    261     } else {
    262         return norm->udataCC;
    263     }
    264 }
    265 
    266 /*
    267  * enumerate all code points with their Norm structs and call a function for each
    268  * return the number of code points with data
    269  */
    270 static uint32_t
    271 enumTrie(EnumTrieFn *fn, void *context) {
    272     uint32_t count, i;
    273     UChar32 code;
    274     UBool isInBlockZero;
    275 
    276     count=0;
    277     for(code=0; code<=0x10ffff;) {
    278         i=utrie_get32(normTrie, code, &isInBlockZero);
    279         if(isInBlockZero) {
    280             code+=UTRIE_DATA_BLOCK_LENGTH;
    281         } else {
    282             if(i!=0) {
    283                 fn(context, (uint32_t)code, norms+i);
    284                 ++count;
    285             }
    286             ++code;
    287         }
    288     }
    289     return count;
    290 }
    291 
    292 static void
    293 setHaveSeenString(const uint32_t *s, int32_t length) {
    294     uint32_t c;
    295 
    296     while(length>0) {
    297         c=*s++;
    298         haveSeenFlags[(c>>5)&0xff]|=(1<<(c&0x1f));
    299         --length;
    300     }
    301 }
    302 
    303 #define HAVE_SEEN(c) (haveSeenFlags[((c)>>5)&0xff]&(1<<((c)&0x1f)))
    304 
    305 /* handle combining data ---------------------------------------------------- */
    306 
    307 /*
    308  * Insert an entry into combiningCPs[] for the new code point code with its flags.
    309  * The flags indicate if code combines forward, backward, or both.
    310  *
    311  * combiningCPs[] contains three sections:
    312  * 1. code points that combine forward
    313  * 2. code points that combine forward and backward
    314  * 3. code points that combine backward
    315  *
    316  * Search for code in the entire array.
    317  * If it is found and already is in the right section (old flags==new flags)
    318  * then we are done.
    319  * If it is found but the flags are different, then remove it,
    320  * union the old and new flags, and reinsert it into its correct section.
    321  * If it is not found, then just insert it.
    322  *
    323  * Within each section, the code points are not sorted.
    324  */
    325 static void
    326 addCombiningCP(uint32_t code, uint8_t flags) {
    327     uint32_t newEntry;
    328     uint16_t i;
    329 
    330     newEntry=code|((uint32_t)flags<<24);
    331 
    332     /* search for this code point */
    333     for(i=0; i<combineBackTop; ++i) {
    334         if(code==(combiningCPs[i]&0xffffff)) {
    335             /* found it */
    336             if(newEntry==combiningCPs[i]) {
    337                 return; /* no change */
    338             }
    339 
    340             /* combine the flags, remove the old entry from the old place, and insert the new one */
    341             newEntry|=combiningCPs[i];
    342             if(i!=--combineBackTop) {
    343                 uprv_memmove(combiningCPs+i, combiningCPs+i+1, (combineBackTop-i)*4);
    344             }
    345             if(i<combineBothTop) {
    346                 --combineBothTop;
    347             }
    348             if(i<combineFwdTop) {
    349                 --combineFwdTop;
    350             }
    351             break;
    352         }
    353     }
    354 
    355     /* not found or modified, insert it */
    356     if(combineBackTop>=sizeof(combiningCPs)/4) {
    357         fprintf(stderr, "error: gennorm combining code points - trying to use more than %ld units\n",
    358                 (long)(sizeof(combiningCPs)/4));
    359         exit(U_MEMORY_ALLOCATION_ERROR);
    360     }
    361 
    362     /* set i to the insertion point */
    363     flags=(uint8_t)(newEntry>>24);
    364     if(flags==1) {
    365         i=combineFwdTop++;
    366         ++combineBothTop;
    367     } else if(flags==3) {
    368         i=combineBothTop++;
    369     } else /* flags==2 */ {
    370         i=combineBackTop;
    371     }
    372 
    373     /* move the following code points up one and insert newEntry at i */
    374     if(i<combineBackTop) {
    375         uprv_memmove(combiningCPs+i+1, combiningCPs+i, (combineBackTop-i)*4);
    376     }
    377     combiningCPs[i]=newEntry;
    378 
    379     /* finally increment the total counter */
    380     ++combineBackTop;
    381 }
    382 
    383 /**
    384  * Find the index in combiningCPs[] where code point code is stored.
    385  * @param code code point to look for
    386  * @param isLead is code a forward combining code point?
    387  * @return index in combiningCPs[] where code is stored
    388  */
    389 static uint16_t
    390 findCombiningCP(uint32_t code, UBool isLead) {
    391     uint16_t i, limit;
    392 
    393     if(isLead) {
    394         i=0;
    395         limit=combineBothTop;
    396     } else {
    397         i=combineFwdTop;
    398         limit=combineBackTop;
    399     }
    400 
    401     /* search for this code point */
    402     for(; i<limit; ++i) {
    403         if(code==(combiningCPs[i]&0xffffff)) {
    404             /* found it */
    405             return i;
    406         }
    407     }
    408 
    409     /* not found */
    410     return 0xffff;
    411 }
    412 
    413 static void
    414 addCombiningTriple(uint32_t lead, uint32_t trail, uint32_t combined) {
    415     CombiningTriple *triple;
    416 
    417     if(DO_NOT_STORE(UGENNORM_STORE_COMPOSITION)) {
    418         return;
    419     }
    420 
    421     /*
    422      * set combiningFlags for the two code points
    423      * do this after decomposition so that getNorm() above returns NULL
    424      * if we do not have actual sub-decomposition data for the initial NFD here
    425      */
    426     createNorm(lead)->combiningFlags|=1;    /* combines forward */
    427     createNorm(trail)->combiningFlags|=2;    /* combines backward */
    428 
    429     addCombiningCP(lead, 1);
    430     addCombiningCP(trail, 2);
    431 
    432     triple=(CombiningTriple *)utm_alloc(combiningTriplesMem);
    433     triple->lead=lead;
    434     triple->trail=trail;
    435     triple->combined=combined;
    436 }
    437 
    438 static int
    439 compareTriples(const void *l, const void *r) {
    440     int diff;
    441     diff=(int)((CombiningTriple *)l)->leadIndex-
    442          (int)((CombiningTriple *)r)->leadIndex;
    443     if(diff==0) {
    444         diff=(int)((CombiningTriple *)l)->trailIndex-
    445              (int)((CombiningTriple *)r)->trailIndex;
    446     }
    447     return diff;
    448 }
    449 
    450 static void
    451 processCombining() {
    452     CombiningTriple *triples;
    453     uint16_t *p;
    454     uint32_t combined;
    455     uint16_t i, j, count, tableTop, finalIndex, combinesFwd;
    456 
    457     triples=utm_getStart(combiningTriplesMem);
    458 
    459     /* add lead and trail indexes to the triples for sorting */
    460     count=(uint16_t)utm_countItems(combiningTriplesMem);
    461     for(i=0; i<count; ++i) {
    462         /* findCombiningCP() must always find the code point */
    463         triples[i].leadIndex=findCombiningCP(triples[i].lead, TRUE);
    464         triples[i].trailIndex=findCombiningCP(triples[i].trail, FALSE);
    465     }
    466 
    467     /* sort them by leadIndex, trailIndex */
    468     qsort(triples, count, sizeof(CombiningTriple), compareTriples);
    469 
    470     /* calculate final combining indexes and store them in the Norm entries */
    471     tableTop=0;
    472     j=0; /* triples counter */
    473 
    474     /* first, combining indexes of fwd/both characters are indexes into the combiningTable */
    475     for(i=0; i<combineBothTop; ++i) {
    476         /* start a new table */
    477 
    478         /* assign combining index */
    479         createNorm(combiningCPs[i]&0xffffff)->combiningIndex=combiningIndexes[i]=tableTop;
    480 
    481         /* calculate the length of the combining data for this lead code point in the combiningTable */
    482         while(j<count && i==triples[j].leadIndex) {
    483             /* count 2 to 3 16-bit units per composition entry (back-index, code point) */
    484             combined=triples[j++].combined;
    485             if(combined<=0x1fff) {
    486                 tableTop+=2;
    487             } else {
    488                 tableTop+=3;
    489             }
    490         }
    491     }
    492 
    493     /* second, combining indexes of back-only characters are simply incremented from here to be unique */
    494     finalIndex=tableTop;
    495     for(; i<combineBackTop; ++i) {
    496         createNorm(combiningCPs[i]&0xffffff)->combiningIndex=combiningIndexes[i]=finalIndex++;
    497     }
    498 
    499     /* it must be finalIndex<=0x8000 because bit 15 is used in combiningTable as an end-for-this-lead marker */
    500     if(finalIndex>0x8000) {
    501         fprintf(stderr, "error: gennorm combining table - trying to use %u units, more than the %ld units available\n",
    502                 tableTop, (long)(sizeof(combiningTable)/4));
    503         exit(U_MEMORY_ALLOCATION_ERROR);
    504     }
    505 
    506     combiningTableTop=tableTop;
    507 
    508     /* store the combining data in the combiningTable, with the final indexes from above */
    509     p=combiningTable;
    510     j=0; /* triples counter */
    511 
    512     /*
    513      * this is essentially the same loop as above, but
    514      * it writes the table data instead of calculating and setting the final indexes;
    515      * it is necessary to have two passes so that all the final indexes are known before
    516      * they are written into the table
    517      */
    518     for(i=0; i<combineBothTop; ++i) {
    519         /* start a new table */
    520 
    521         combined=0; /* avoid compiler warning */
    522 
    523         /* store the combining data for this lead code point in the combiningTable */
    524         while(j<count && i==triples[j].leadIndex) {
    525             Norm *normPtr;
    526             finalIndex=combiningIndexes[triples[j].trailIndex];
    527             combined=triples[j++].combined;
    528             normPtr = getNorm(combined);
    529 
    530             if (normPtr == NULL) {
    531                 fprintf(stderr, "error: processCombining did not get expected result. combined=%d\n", combined);
    532                 exit(U_INTERNAL_PROGRAM_ERROR);
    533             }
    534 
    535             /* is combined a starter? (i.e., cc==0 && combines forward) */
    536             combinesFwd=(uint16_t)((normPtr->combiningFlags&1)<<13);
    537 
    538             *p++=finalIndex;
    539             if(combined<=0x1fff) {
    540                 *p++=(uint16_t)(combinesFwd|combined);
    541             } else if(combined<=0xffff) {
    542                 *p++=(uint16_t)(0x8000|combinesFwd);
    543                 *p++=(uint16_t)combined;
    544             } else {
    545                 *p++=(uint16_t)(0xc000|combinesFwd|((combined-0x10000)>>10));
    546                 *p++=(uint16_t)(0xdc00|(combined&0x3ff));
    547             }
    548         }
    549 
    550         /* set a marker on the last final trail index in this lead's table */
    551         if(combined<=0x1fff) {
    552             *(p-2)|=0x8000;
    553         } else {
    554             *(p-3)|=0x8000;
    555         }
    556     }
    557 
    558     /* post condition: tableTop==(p-combiningTable) */
    559 }
    560 
    561 /* processing incoming normalization data ----------------------------------- */
    562 
    563 /*
    564  * Decompose Hangul syllables algorithmically and fill a pseudo-Norm struct.
    565  * c must be a Hangul syllable code point.
    566  */
    567 static void
    568 getHangulDecomposition(uint32_t c, Norm *pHangulNorm, uint32_t hangulBuffer[3]) {
    569     /* Hangul syllable: decompose algorithmically */
    570     uint32_t c2;
    571     uint8_t length;
    572 
    573     uprv_memset(pHangulNorm, 0, sizeof(Norm));
    574 
    575     c-=HANGUL_BASE;
    576 
    577     c2=c%JAMO_T_COUNT;
    578     c/=JAMO_T_COUNT;
    579     if(c2>0) {
    580         hangulBuffer[2]=JAMO_T_BASE+c2;
    581         length=3;
    582     } else {
    583         hangulBuffer[2]=0;
    584         length=2;
    585     }
    586 
    587     hangulBuffer[1]=JAMO_V_BASE+c%JAMO_V_COUNT;
    588     hangulBuffer[0]=JAMO_L_BASE+c/JAMO_V_COUNT;
    589 
    590     pHangulNorm->nfd=hangulBuffer;
    591     pHangulNorm->lenNFD=length;
    592     if(DO_STORE(UGENNORM_STORE_COMPAT)) {
    593         pHangulNorm->nfkd=hangulBuffer;
    594         pHangulNorm->lenNFKD=length;
    595     }
    596 }
    597 
    598 /*
    599  * decompose the one decomposition further, may generate two decompositions
    600  * apply all previous characters' decompositions to this one
    601  */
    602 static void
    603 decompStoreNewNF(uint32_t code, Norm *norm) {
    604     uint32_t nfd[40], nfkd[40], hangulBuffer[3];
    605     Norm hangulNorm;
    606 
    607     uint32_t *s32;
    608     Norm *p;
    609     uint32_t c;
    610     int32_t i, length;
    611     uint8_t lenNFD=0, lenNFKD=0;
    612     UBool changedNFD=FALSE, changedNFKD=FALSE;
    613 
    614     if((length=norm->lenNFD)!=0) {
    615         /* always allocate the original string */
    616         changedNFD=TRUE;
    617         s32=norm->nfd;
    618     } else if((length=norm->lenNFKD)!=0) {
    619         /* always allocate the original string */
    620         changedNFKD=TRUE;
    621         s32=norm->nfkd;
    622     } else {
    623         /* no decomposition here, nothing to do */
    624         return;
    625     }
    626 
    627     /* decompose each code point */
    628     for(i=0; i<length; ++i) {
    629         c=s32[i];
    630         p=getNorm(c);
    631         if(p==NULL) {
    632             if(HANGUL_BASE<=c && c<(HANGUL_BASE+HANGUL_COUNT)) {
    633                 getHangulDecomposition(c, &hangulNorm, hangulBuffer);
    634                 p=&hangulNorm;
    635             } else {
    636                 /* no data, no decomposition */
    637                 nfd[lenNFD++]=c;
    638                 nfkd[lenNFKD++]=c;
    639                 continue;
    640             }
    641         }
    642 
    643         /* canonically decompose c */
    644         if(changedNFD) {
    645             if(p->lenNFD!=0) {
    646                 uprv_memcpy(nfd+lenNFD, p->nfd, p->lenNFD*4);
    647                 lenNFD+=p->lenNFD;
    648             } else {
    649                 nfd[lenNFD++]=c;
    650             }
    651         }
    652 
    653         /* compatibility-decompose c */
    654         if(p->lenNFKD!=0) {
    655             uprv_memcpy(nfkd+lenNFKD, p->nfkd, p->lenNFKD*4);
    656             lenNFKD+=p->lenNFKD;
    657             changedNFKD=TRUE;
    658         } else if(p->lenNFD!=0) {
    659             uprv_memcpy(nfkd+lenNFKD, p->nfd, p->lenNFD*4);
    660             lenNFKD+=p->lenNFD;
    661             /*
    662              * not  changedNFKD=TRUE;
    663              * so that we do not store a new nfkd if there was no nfkd string before
    664              * and we only see canonical decompositions
    665              */
    666         } else {
    667             nfkd[lenNFKD++]=c;
    668         }
    669     }
    670 
    671     /* assume that norm->lenNFD==1 or ==2 */
    672     if(norm->lenNFD==2 && !(norm->combiningFlags&0x80)) {
    673         addCombiningTriple(s32[0], s32[1], code);
    674     }
    675 
    676     if(changedNFD) {
    677         if(lenNFD!=0) {
    678             s32=utm_allocN(utf32Mem, lenNFD);
    679             uprv_memcpy(s32, nfd, lenNFD*4);
    680         } else {
    681             s32=NULL;
    682         }
    683         norm->lenNFD=lenNFD;
    684         norm->nfd=s32;
    685         setHaveSeenString(nfd, lenNFD);
    686     }
    687     if(changedNFKD) {
    688         if(lenNFKD!=0) {
    689             s32=utm_allocN(utf32Mem, lenNFKD);
    690             uprv_memcpy(s32, nfkd, lenNFKD*4);
    691         } else {
    692             s32=NULL;
    693         }
    694         norm->lenNFKD=lenNFKD;
    695         norm->nfkd=s32;
    696         setHaveSeenString(nfkd, lenNFKD);
    697     }
    698 }
    699 
    700 typedef struct DecompSingle {
    701     uint32_t c;
    702     Norm *norm;
    703 } DecompSingle;
    704 
    705 /*
    706  * apply this one character's decompositions (there is at least one!) to
    707  * all previous characters' decompositions to decompose them further
    708  */
    709 static void
    710 decompWithSingleFn(void *context, uint32_t code, Norm *norm) {
    711     uint32_t nfd[40], nfkd[40];
    712     uint32_t *s32;
    713     DecompSingle *me=(DecompSingle *)context;
    714     uint32_t c, myC;
    715     int32_t i, length;
    716     uint8_t lenNFD=0, lenNFKD=0, myLenNFD, myLenNFKD;
    717     UBool changedNFD=FALSE, changedNFKD=FALSE;
    718 
    719     /* get the new character's data */
    720     myC=me->c;
    721     myLenNFD=me->norm->lenNFD;
    722     myLenNFKD=me->norm->lenNFKD;
    723     /* assume that myC has at least one decomposition */
    724 
    725     if((length=norm->lenNFD)!=0 && myLenNFD!=0) {
    726         /* apply NFD(myC) to norm->nfd */
    727         s32=norm->nfd;
    728         for(i=0; i<length; ++i) {
    729             c=s32[i];
    730             if(c==myC) {
    731                 uprv_memcpy(nfd+lenNFD, me->norm->nfd, myLenNFD*4);
    732                 lenNFD+=myLenNFD;
    733                 changedNFD=TRUE;
    734             } else {
    735                 nfd[lenNFD++]=c;
    736             }
    737         }
    738     }
    739 
    740     if((length=norm->lenNFKD)!=0) {
    741         /* apply NFD(myC) and NFKD(myC) to norm->nfkd */
    742         s32=norm->nfkd;
    743         for(i=0; i<length; ++i) {
    744             c=s32[i];
    745             if(c==myC) {
    746                 if(myLenNFKD!=0) {
    747                     uprv_memcpy(nfkd+lenNFKD, me->norm->nfkd, myLenNFKD*4);
    748                     lenNFKD+=myLenNFKD;
    749                 } else /* assume myLenNFD!=0 */ {
    750                     uprv_memcpy(nfkd+lenNFKD, me->norm->nfd, myLenNFD*4);
    751                     lenNFKD+=myLenNFD;
    752                 }
    753                 changedNFKD=TRUE;
    754             } else {
    755                 nfkd[lenNFKD++]=c;
    756             }
    757         }
    758     } else if((length=norm->lenNFD)!=0 && myLenNFKD!=0) {
    759         /* apply NFKD(myC) to norm->nfd, forming a new norm->nfkd */
    760         s32=norm->nfd;
    761         for(i=0; i<length; ++i) {
    762             c=s32[i];
    763             if(c==myC) {
    764                 uprv_memcpy(nfkd+lenNFKD, me->norm->nfkd, myLenNFKD*4);
    765                 lenNFKD+=myLenNFKD;
    766                 changedNFKD=TRUE;
    767             } else {
    768                 nfkd[lenNFKD++]=c;
    769             }
    770         }
    771     }
    772 
    773     /* set the new decompositions, forget the old ones */
    774     if(changedNFD) {
    775         if(lenNFD!=0) {
    776             if(lenNFD>norm->lenNFD) {
    777                 s32=utm_allocN(utf32Mem, lenNFD);
    778             } else {
    779                 s32=norm->nfd;
    780             }
    781             uprv_memcpy(s32, nfd, lenNFD*4);
    782         } else {
    783             s32=NULL;
    784         }
    785         norm->lenNFD=lenNFD;
    786         norm->nfd=s32;
    787     }
    788     if(changedNFKD) {
    789         if(lenNFKD!=0) {
    790             if(lenNFKD>norm->lenNFKD) {
    791                 s32=utm_allocN(utf32Mem, lenNFKD);
    792             } else {
    793                 s32=norm->nfkd;
    794             }
    795             uprv_memcpy(s32, nfkd, lenNFKD*4);
    796         } else {
    797             s32=NULL;
    798         }
    799         norm->lenNFKD=lenNFKD;
    800         norm->nfkd=s32;
    801     }
    802 }
    803 
    804 /*
    805  * process the data for one code point listed in UnicodeData;
    806  * UnicodeData itself never maps a code point to both NFD and NFKD
    807  */
    808 extern void
    809 storeNorm(uint32_t code, Norm *norm) {
    810     DecompSingle decompSingle;
    811     Norm *p;
    812 
    813     if(DO_NOT_STORE(UGENNORM_STORE_COMPAT)) {
    814         /* ignore compatibility decomposition */
    815         norm->lenNFKD=0;
    816     }
    817 
    818     /* copy existing derived normalization properties */
    819     p=createNorm(code);
    820     norm->qcFlags=p->qcFlags;
    821     norm->combiningFlags=p->combiningFlags;
    822     norm->fncIndex=p->fncIndex;
    823 
    824     /* process the decomposition if there is one here */
    825     if((norm->lenNFD|norm->lenNFKD)!=0) {
    826         /* decompose this one decomposition further, may generate two decompositions */
    827         decompStoreNewNF(code, norm);
    828 
    829         /* has this code point been used in previous decompositions? */
    830         if(HAVE_SEEN(code)) {
    831             /* use this decomposition to decompose other decompositions further */
    832             decompSingle.c=code;
    833             decompSingle.norm=norm;
    834             enumTrie(decompWithSingleFn, &decompSingle);
    835         }
    836     }
    837 
    838     /* store the data */
    839     uprv_memcpy(p, norm, sizeof(Norm));
    840 }
    841 
    842 extern void
    843 setQCFlags(uint32_t code, uint8_t qcFlags) {
    844     if(DO_NOT_STORE(UGENNORM_STORE_COMPAT)) {
    845         /* ignore compatibility decomposition: unset the KC/KD flags */
    846         qcFlags&=~(_NORM_QC_NFKC|_NORM_QC_NFKD);
    847 
    848         /* set the KC/KD flags to the same values as the C/D flags */
    849         qcFlags|=qcFlags<<1;
    850     }
    851     if(DO_NOT_STORE(UGENNORM_STORE_COMPOSITION)) {
    852         /* ignore composition data: unset the C/KC flags */
    853         qcFlags&=~(_NORM_QC_NFC|_NORM_QC_NFKC);
    854 
    855         /* set the C/KC flags to the same values as the D/KD flags */
    856         qcFlags|=qcFlags>>2;
    857     }
    858 
    859     createNorm(code)->qcFlags|=qcFlags;
    860 
    861     /* adjust the minimum code point for quick check no/maybe */
    862     if(code<0xffff) {
    863         if((qcFlags&_NORM_QC_NFC) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]) {
    864             indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=(uint16_t)code;
    865         }
    866         if((qcFlags&_NORM_QC_NFKC) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]) {
    867             indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=(uint16_t)code;
    868         }
    869         if((qcFlags&_NORM_QC_NFD) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]) {
    870             indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=(uint16_t)code;
    871         }
    872         if((qcFlags&_NORM_QC_NFKD) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]) {
    873             indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=(uint16_t)code;
    874         }
    875     }
    876 
    877     if(qcFlags&_NORM_QC_NFD) {
    878         uset_add(nfdQCNoSet, (UChar32)code);
    879     }
    880 }
    881 
    882 extern void
    883 setCompositionExclusion(uint32_t code) {
    884     if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
    885         createNorm(code)->combiningFlags|=0x80;
    886     }
    887 }
    888 
    889 static void
    890 setHangulJamoSpecials() {
    891     Norm *norm;
    892     uint32_t c, hangul;
    893 
    894     /*
    895      * Hangul syllables are algorithmically decomposed into Jamos,
    896      * and Jamos are algorithmically composed into Hangul syllables.
    897      * The quick check flags are parsed, except for Hangul.
    898      */
    899 
    900     /* set Jamo L specials */
    901     hangul=0xac00;
    902     for(c=0x1100; c<=0x1112; ++c) {
    903         norm=createNorm(c);
    904         norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_L;
    905         if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
    906             norm->combiningFlags=1;
    907         }
    908 
    909         /* for each Jamo L create a set with its associated Hangul block */
    910         norm->canonStart=uset_open(hangul, hangul+21*28-1);
    911         hangul+=21*28;
    912     }
    913 
    914     /* set Jamo V specials */
    915     for(c=0x1161; c<=0x1175; ++c) {
    916         norm=createNorm(c);
    917         norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_V;
    918         if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
    919             norm->combiningFlags=2;
    920         }
    921         norm->unsafeStart=TRUE;
    922     }
    923 
    924     /* set Jamo T specials */
    925     for(c=0x11a8; c<=0x11c2; ++c) {
    926         norm=createNorm(c);
    927         norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_T;
    928         if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
    929             norm->combiningFlags=2;
    930         }
    931         norm->unsafeStart=TRUE;
    932     }
    933 
    934     /* set Hangul specials, precompacted */
    935     norm=allocNorm();
    936     norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL;
    937     if(DO_STORE(UGENNORM_STORE_COMPAT)) {
    938         norm->qcFlags=_NORM_QC_NFD|_NORM_QC_NFKD;
    939     } else {
    940         norm->qcFlags=_NORM_QC_NFD;
    941     }
    942 
    943     if(!utrie_setRange32(normTrie, 0xac00, 0xd7a4, (uint32_t)(norm-norms), TRUE)) {
    944         fprintf(stderr, "error: too many normalization entries (setting Hangul)\n");
    945         exit(U_BUFFER_OVERFLOW_ERROR);
    946     }
    947 }
    948 
    949 /*
    950  * set FC-NFKC-Closure string
    951  * s contains the closure string; s[0]==length, s[1..length] is the actual string
    952  * may modify s[0]
    953  */
    954 U_CFUNC void
    955 setFNC(uint32_t c, UChar *s) {
    956     uint16_t *p;
    957     int32_t length, i, count;
    958     UChar first;
    959 
    960     if( DO_NOT_STORE(UGENNORM_STORE_COMPAT) ||
    961         DO_NOT_STORE(UGENNORM_STORE_COMPOSITION) ||
    962         DO_NOT_STORE(UGENNORM_STORE_AUX)
    963     ) {
    964         return;
    965     }
    966 
    967     count=utm_countItems(extraMem);
    968     length=s[0];
    969     first=s[1];
    970 
    971     /* try to overlay single-unit strings with existing ones */
    972     if(length==1 && first<0xff00) {
    973         p=utm_getStart(extraMem);
    974         for(i=1; i<count; ++i) {
    975             if(first==p[i]) {
    976                 break;
    977             }
    978         }
    979     } else {
    980         i=count;
    981     }
    982 
    983     /* append the new string if it cannot be overlayed with an old one */
    984     if(i==count) {
    985         if(count>_NORM_AUX_MAX_FNC) {
    986             fprintf(stderr, "gennorm error: too many FNC strings\n");
    987             exit(U_INDEX_OUTOFBOUNDS_ERROR);
    988         }
    989 
    990         /* prepend 0xffxx with xx==length */
    991         s[0]=(uint16_t)(0xff00+length);
    992         ++length;
    993         p=(uint16_t *)utm_allocN(extraMem, length);
    994         uprv_memcpy(p, s, length*2);
    995 
    996         /* update the top index in extraMem[0] */
    997         count+=length;
    998         ((uint16_t *)utm_getStart(extraMem))[0]=(uint16_t)count;
    999     }
   1000 
   1001     /* store the index to the string */
   1002     createNorm(c)->fncIndex=i;
   1003 }
   1004 
   1005 /* build runtime structures ------------------------------------------------- */
   1006 
   1007 /* canonically reorder a UTF-32 string; return { leadCC, trailCC } */
   1008 static uint16_t
   1009 reorderString(uint32_t *s, int32_t length) {
   1010     uint8_t ccs[40];
   1011     uint32_t c;
   1012     int32_t i, j;
   1013     uint8_t cc, prevCC;
   1014 
   1015     if(length<=0) {
   1016         return 0;
   1017     }
   1018 
   1019     for(i=0; i<length; ++i) {
   1020         /* get the i-th code point and its combining class */
   1021         c=s[i];
   1022         cc=getCCFromCP(c);
   1023         if(cc!=0 && i!=0) {
   1024             /* it is a combining mark, see if it needs to be moved back */
   1025             j=i;
   1026             do {
   1027                 prevCC=ccs[j-1];
   1028                 if(prevCC<=cc) {
   1029                     break;  /* found the right place */
   1030                 }
   1031                 /* move the previous code point here and go back */
   1032                 s[j]=s[j-1];
   1033                 ccs[j]=prevCC;
   1034             } while(--j!=0);
   1035             s[j]=c;
   1036             ccs[j]=cc;
   1037         } else {
   1038             /* just store the combining class */
   1039             ccs[i]=cc;
   1040         }
   1041     }
   1042 
   1043     return (uint16_t)(((uint16_t)ccs[0]<<8)|ccs[length-1]);
   1044 }
   1045 
   1046 #if 0
   1047 static UBool combineAndQC[64]={ 0 };
   1048 #endif
   1049 
   1050 /*
   1051  * canonically reorder the up to two decompositions
   1052  * and store the leading and trailing combining classes accordingly
   1053  *
   1054  * also process canonical decompositions for canonical closure
   1055  */
   1056 static void
   1057 postParseFn(void *context, uint32_t code, Norm *norm) {
   1058     int32_t length;
   1059 
   1060     /* canonically order the NFD */
   1061     length=norm->lenNFD;
   1062     if(length>0) {
   1063         norm->canonBothCCs=reorderString(norm->nfd, length);
   1064     }
   1065 
   1066     /* canonically reorder the NFKD */
   1067     length=norm->lenNFKD;
   1068     if(length>0) {
   1069         norm->compatBothCCs=reorderString(norm->nfkd, length);
   1070     }
   1071 
   1072     /* verify that code has a decomposition if and only if the quick check flags say "no" on NF(K)D */
   1073     if((norm->lenNFD!=0) != ((norm->qcFlags&_NORM_QC_NFD)!=0)) {
   1074         fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->qcFlags);
   1075     }
   1076     if(((norm->lenNFD|norm->lenNFKD)!=0) != ((norm->qcFlags&(_NORM_QC_NFD|_NORM_QC_NFKD))!=0)) {
   1077         fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] NFKD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->lenNFKD, norm->qcFlags);
   1078     }
   1079 
   1080     /* see which combinations of combiningFlags and qcFlags are used for NFC/NFKC */
   1081 #if 0
   1082     combineAndQC[(norm->qcFlags&0x33)|((norm->combiningFlags&3)<<2)]=1;
   1083 #endif
   1084 
   1085     if(norm->combiningFlags&1) {
   1086         if(norm->udataCC!=0) {
   1087             /* illegal - data-derivable composition exclusion */
   1088             fprintf(stderr, "gennorm warning: U+%04lx combines forward but udataCC==%u\n", (long)code, norm->udataCC);
   1089         }
   1090     }
   1091     if(norm->combiningFlags&2) {
   1092         if((norm->qcFlags&0x11)==0) {
   1093             fprintf(stderr, "gennorm warning: U+%04lx combines backward but qcNF?C==0\n", (long)code);
   1094         }
   1095 #if 0
   1096         /* occurs sometimes, this one is ok (therefore #if 0) - still here for documentation */
   1097         if(norm->udataCC==0) {
   1098             printf("U+%04lx combines backward but udataCC==0\n", (long)code);
   1099         }
   1100 #endif
   1101     }
   1102     if((norm->combiningFlags&3)==3 && beVerbose) {
   1103         printf("U+%04lx combines both ways\n", (long)code);
   1104     }
   1105 
   1106     /*
   1107      * process canonical decompositions for canonical closure
   1108      *
   1109      * in each canonical decomposition:
   1110      *   add the current character (code) to the set of canonical starters of its norm->nfd[0]
   1111      *   set the "unsafe starter" flag for each norm->nfd[1..]
   1112      */
   1113     length=norm->lenNFD;
   1114     if(length>0) {
   1115         Norm *otherNorm;
   1116         UChar32 c;
   1117         int32_t i;
   1118 
   1119         /* nfd[0].canonStart.add(code) */
   1120         c=norm->nfd[0];
   1121         otherNorm=createNorm(c);
   1122         if(otherNorm->canonStart==NULL) {
   1123             otherNorm->canonStart=uset_open(code, code);
   1124             if(otherNorm->canonStart==NULL) {
   1125                 fprintf(stderr, "gennorm error: out of memory in uset_open()\n");
   1126                 exit(U_MEMORY_ALLOCATION_ERROR);
   1127             }
   1128         } else {
   1129             uset_add(otherNorm->canonStart, code);
   1130             if(!uset_contains(otherNorm->canonStart, code)) {
   1131                 fprintf(stderr, "gennorm error: uset_add(setOf(U+%4x), U+%4x)\n", (int)c, (int)code);
   1132                 exit(U_INTERNAL_PROGRAM_ERROR);
   1133             }
   1134         }
   1135 
   1136         /* for(i=1..length-1) nfd[i].unsafeStart=TRUE */
   1137         for(i=1; i<length; ++i) {
   1138             createNorm(norm->nfd[i])->unsafeStart=TRUE;
   1139         }
   1140     }
   1141 }
   1142 
   1143 static uint32_t
   1144 make32BitNorm(Norm *norm) {
   1145     UChar extra[100];
   1146     const Norm *other;
   1147     uint32_t word;
   1148     int32_t i, length, beforeZero=0, count, start;
   1149 
   1150     /*
   1151      * Check for assumptions:
   1152      *
   1153      * Test that if a "true starter" (cc==0 && NF*C_YES) decomposes,
   1154      * then the decomposition also begins with a true starter.
   1155      */
   1156     if(norm->udataCC==0) {
   1157         /* this is a starter */
   1158         if((norm->qcFlags&_NORM_QC_NFC)==0 && norm->lenNFD>0) {
   1159             /* a "true" NFC starter with a canonical decomposition */
   1160             if( norm->canonBothCCs>=0x100 || /* lead cc!=0 or */
   1161                 ((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFC)!=0) /* nfd[0] not NFC_YES */
   1162             ) {
   1163                 fprintf(stderr,
   1164                     "error: true NFC starter canonical decomposition[%u] does not begin\n"
   1165                     "    with a true NFC starter: U+%04lx U+%04lx%s\n",
   1166                     norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1],
   1167                     norm->lenNFD<=2 ? "" : " ...");
   1168                 exit(U_INVALID_TABLE_FILE);
   1169             }
   1170         }
   1171 
   1172         if((norm->qcFlags&_NORM_QC_NFKC)==0) {
   1173             if(norm->lenNFKD>0) {
   1174                 /* a "true" NFKC starter with a compatibility decomposition */
   1175                 if( norm->compatBothCCs>=0x100 || /* lead cc!=0 or */
   1176                     ((other=getNorm(norm->nfkd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfkd[0] not NFKC_YES */
   1177                 ) {
   1178                     fprintf(stderr,
   1179                         "error: true NFKC starter compatibility decomposition[%u] does not begin\n"
   1180                         "    with a true NFKC starter: U+%04lx U+%04lx%s\n",
   1181                         norm->lenNFKD, (long)norm->nfkd[0], (long)norm->nfkd[1],
   1182                         norm->lenNFKD<=2 ? "" : " ...");
   1183                     exit(U_INVALID_TABLE_FILE);
   1184                 }
   1185             } else if(norm->lenNFD>0) {
   1186                 /* a "true" NFKC starter with only a canonical decomposition */
   1187                 if( norm->canonBothCCs>=0x100 || /* lead cc!=0 or */
   1188                     ((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfd[0] not NFKC_YES */
   1189                 ) {
   1190                     fprintf(stderr,
   1191                         "error: true NFKC starter canonical decomposition[%u] does not begin\n"
   1192                         "    with a true NFKC starter: U+%04lx U+%04lx%s\n",
   1193                         norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1],
   1194                         norm->lenNFD<=2 ? "" : " ...");
   1195                     exit(U_INVALID_TABLE_FILE);
   1196                 }
   1197             }
   1198         }
   1199     }
   1200 
   1201     /* reset the 32-bit word and set the quick check flags */
   1202     word=norm->qcFlags;
   1203 
   1204     /* set the UnicodeData combining class */
   1205     word|=(uint32_t)norm->udataCC<<_NORM_CC_SHIFT;
   1206 
   1207     /* set the combining flag and index */
   1208     if(norm->combiningFlags&3) {
   1209         word|=(uint32_t)(norm->combiningFlags&3)<<6;
   1210     }
   1211 
   1212     /* set the combining index value into the extra data */
   1213     /* 0xffff: no combining index; 0..0x7fff: combining index */
   1214     if(norm->combiningIndex!=0xffff) {
   1215         extra[0]=norm->combiningIndex;
   1216         beforeZero=1;
   1217     }
   1218 
   1219     count=beforeZero;
   1220 
   1221     /* write the decompositions */
   1222     if((norm->lenNFD|norm->lenNFKD)!=0) {
   1223         extra[count++]=0; /* set the pieces when available, into extra[beforeZero] */
   1224 
   1225         length=norm->lenNFD;
   1226         if(length>0) {
   1227             if(norm->canonBothCCs!=0) {
   1228                 extra[beforeZero]|=0x80;
   1229                 extra[count++]=norm->canonBothCCs;
   1230             }
   1231             start=count;
   1232             for(i=0; i<length; ++i) {
   1233                 UTF_APPEND_CHAR_UNSAFE(extra, count, norm->nfd[i]);
   1234             }
   1235             extra[beforeZero]|=(UChar)(count-start); /* set the decomp length as the number of UTF-16 code units */
   1236         }
   1237 
   1238         length=norm->lenNFKD;
   1239         if(length>0) {
   1240             if(norm->compatBothCCs!=0) {
   1241                 extra[beforeZero]|=0x8000;
   1242                 extra[count++]=norm->compatBothCCs;
   1243             }
   1244             start=count;
   1245             for(i=0; i<length; ++i) {
   1246                 UTF_APPEND_CHAR_UNSAFE(extra, count, norm->nfkd[i]);
   1247             }
   1248             extra[beforeZero]|=(UChar)((count-start)<<8); /* set the decomp length as the number of UTF-16 code units */
   1249         }
   1250     }
   1251 
   1252     /* allocate and copy the extra data */
   1253     if(count!=0) {
   1254         UChar *p;
   1255 
   1256         if(norm->specialTag!=0) {
   1257             fprintf(stderr, "error: gennorm - illegal to have both extra data and a special tag (0x%x)\n", norm->specialTag);
   1258             exit(U_ILLEGAL_ARGUMENT_ERROR);
   1259         }
   1260 
   1261         p=(UChar *)utm_allocN(extraMem, count);
   1262         uprv_memcpy(p, extra, count*2);
   1263 
   1264         /* set the extra index, offset by beforeZero */
   1265         word|=(uint32_t)(beforeZero+(p-(UChar *)utm_getStart(extraMem)))<<_NORM_EXTRA_SHIFT;
   1266     } else if(norm->specialTag!=0) {
   1267         /* set a special tag instead of an extra index */
   1268         word|=(uint32_t)norm->specialTag<<_NORM_EXTRA_SHIFT;
   1269     }
   1270 
   1271     return word;
   1272 }
   1273 
   1274 /* turn all Norm structs into corresponding 32-bit norm values */
   1275 static void
   1276 makeAll32() {
   1277     uint32_t *pNormData;
   1278     uint32_t n;
   1279     int32_t i, normLength, count;
   1280 
   1281     count=(int32_t)utm_countItems(normMem);
   1282     for(i=0; i<count; ++i) {
   1283         norms[i].value32=make32BitNorm(norms+i);
   1284     }
   1285 
   1286     pNormData=utrie_getData(norm32Trie, &normLength);
   1287 
   1288     count=0; /* count is now just used for debugging */
   1289     for(i=0; i<normLength; ++i) {
   1290         n=pNormData[i];
   1291         if(0!=(pNormData[i]=norms[n].value32)) {
   1292             ++count;
   1293         }
   1294     }
   1295 }
   1296 
   1297 /*
   1298  * extract all Norm.canonBothCCs into the FCD table
   1299  * set 32-bit values to use the common fold and compact functions
   1300  */
   1301 static void
   1302 makeFCD() {
   1303     uint32_t *pFCDData;
   1304     uint32_t n;
   1305     int32_t i, count, fcdLength;
   1306     uint16_t bothCCs;
   1307 
   1308     count=utm_countItems(normMem);
   1309     for(i=0; i<count; ++i) {
   1310         bothCCs=norms[i].canonBothCCs;
   1311         if(bothCCs==0) {
   1312             /* if there are no decomposition cc's then use the udataCC twice */
   1313             bothCCs=norms[i].udataCC;
   1314             bothCCs|=bothCCs<<8;
   1315         }
   1316         norms[i].value32=bothCCs;
   1317     }
   1318 
   1319     pFCDData=utrie_getData(fcdTrie, &fcdLength);
   1320 
   1321     for(i=0; i<fcdLength; ++i) {
   1322         n=pFCDData[i];
   1323         pFCDData[i]=norms[n].value32;
   1324     }
   1325 }
   1326 
   1327 /**
   1328  * If the given set contains exactly one character, then return it.
   1329  * Otherwise return -1.
   1330  */
   1331 static int32_t
   1332 usetContainsOne(const USet* set) {
   1333     if(uset_getItemCount(set)==1) {
   1334         /* there is a single item (a single range) */
   1335         UChar32 start, end;
   1336         UErrorCode ec=U_ZERO_ERROR;
   1337         int32_t len=uset_getItem(set, 0, &start, &end, NULL, 0, &ec);
   1338         if (len==0 && start==end) { /* a range (len==0) with a single code point */
   1339             return start;
   1340         }
   1341     }
   1342     return -1;
   1343 }
   1344 
   1345 static void
   1346 makeCanonSetFn(void *context, uint32_t code, Norm *norm) {
   1347     if(norm->canonStart!=NULL && !uset_isEmpty(norm->canonStart)) {
   1348         uint16_t *table;
   1349         int32_t c, tableLength;
   1350         UErrorCode errorCode=U_ZERO_ERROR;
   1351 
   1352         /* does the set contain exactly one code point? */
   1353         c=usetContainsOne(norm->canonStart);
   1354 
   1355         /* add an entry to the BMP or supplementary search table */
   1356         if(code<=0xffff) {
   1357             table=canonStartSets+_NORM_MAX_CANON_SETS;
   1358             tableLength=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
   1359 
   1360             table[tableLength++]=(uint16_t)code;
   1361 
   1362             if(c>=0 && c<=0xffff && (c&_NORM_CANON_SET_BMP_MASK)!=_NORM_CANON_SET_BMP_IS_INDEX) {
   1363                 /* single-code point BMP result for BMP code point */
   1364                 table[tableLength++]=(uint16_t)c;
   1365             } else {
   1366                 table[tableLength++]=(uint16_t)(_NORM_CANON_SET_BMP_IS_INDEX|canonStartSetsTop);
   1367                 c=-1;
   1368             }
   1369             canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]=(uint16_t)tableLength;
   1370         } else {
   1371             table=canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH;
   1372             tableLength=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
   1373 
   1374             table[tableLength++]=(uint16_t)(code>>16);
   1375             table[tableLength++]=(uint16_t)code;
   1376 
   1377             if(c>=0) {
   1378                 /* single-code point result for supplementary code point */
   1379                 table[tableLength-2]|=(uint16_t)(0x8000|((c>>8)&0x1f00));
   1380                 table[tableLength++]=(uint16_t)c;
   1381             } else {
   1382                 table[tableLength++]=(uint16_t)canonStartSetsTop;
   1383             }
   1384             canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]=(uint16_t)tableLength;
   1385         }
   1386 
   1387         if(c<0) {
   1388             /* write a USerializedSet */
   1389             ++canonSetsCount;
   1390             canonStartSetsTop+=
   1391                     uset_serialize(norm->canonStart,
   1392                             canonStartSets+canonStartSetsTop,
   1393                             _NORM_MAX_CANON_SETS-canonStartSetsTop,
   1394                             &errorCode);
   1395         }
   1396         canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]=(uint16_t)canonStartSetsTop;
   1397 
   1398         if(U_FAILURE(errorCode)) {
   1399             fprintf(stderr, "gennorm error: uset_serialize()->%s (canonStartSetsTop=%d)\n", u_errorName(errorCode), (int)canonStartSetsTop);
   1400             exit(errorCode);
   1401         }
   1402         if(tableLength>_NORM_MAX_SET_SEARCH_TABLE_LENGTH) {
   1403             fprintf(stderr, "gennorm error: search table for canonical starter sets too long\n");
   1404             exit(U_INDEX_OUTOFBOUNDS_ERROR);
   1405         }
   1406     }
   1407 }
   1408 
   1409 /* for getSkippableFlags ---------------------------------------------------- */
   1410 
   1411 /* combine the lead and trail code points; return <0 if they do not combine */
   1412 static int32_t
   1413 combine(uint32_t lead, uint32_t trail) {
   1414     CombiningTriple *triples;
   1415     uint32_t i, count;
   1416 
   1417     /* search for all triples with c as lead code point */
   1418     triples=utm_getStart(combiningTriplesMem);
   1419     count=utm_countItems(combiningTriplesMem);
   1420 
   1421     /* triples are not sorted by code point but for each lead CP there is one contiguous block */
   1422     for(i=0; i<count && lead!=triples[i].lead; ++i) {}
   1423 
   1424     /* check each triple for this code point */
   1425     for(; i<count && lead==triples[i].lead; ++i) {
   1426         if(trail==triples[i].trail) {
   1427             return (int32_t)triples[i].combined;
   1428         }
   1429     }
   1430 
   1431     return -1;
   1432 }
   1433 
   1434 /*
   1435  * Starting from the canonical decomposition s[0..length[ of a single code point,
   1436  * is the code point c consumed in an NFC/FCC recomposition?
   1437  *
   1438  * No need to handle discontiguous composition because that would not consume some
   1439  * intermediate character, so would not compose back to the original character.
   1440  * See comments in canChangeWithFollowing().
   1441  *
   1442  * No need to compose beyond where c canonically orders because if it is consumed
   1443  * then the result differs from the original anyway.
   1444  *
   1445  * Possible optimization:
   1446  * - Verify that there are no cases of the same combining mark stacking twice.
   1447  * - return FALSE right away if c inserts after a copy of itself
   1448  *   without attempting to recompose; will happen because each mark in
   1449  *   the decomposition will be enumerated and passed in as c.
   1450  *   More complicated and fragile though than it is already.
   1451  *
   1452  * markus 2002nov04
   1453  */
   1454 static UBool
   1455 doesComposeConsume(const uint32_t *s, int32_t length, uint32_t c, uint8_t cc) {
   1456     int32_t starter, i;
   1457 
   1458     /* ignore trailing characters where cc<prevCC */
   1459     while(length>1 && cc<getCCFromCP(s[length-1])) {
   1460         --length;
   1461     }
   1462 
   1463     /* start consuming/combining from the beginning */
   1464     starter=(int32_t)s[0];
   1465     for(i=1; i<length; ++i) {
   1466         starter=combine((uint32_t)starter, s[i]);
   1467         if(starter<0) {
   1468             fprintf(stderr, "error: unable to consume normal decomposition in doesComposeConsume(<%04x, %04x, ...>[%d], U+%04x, %u)\n",
   1469                 (int)s[0], (int)s[1], (int)length, (int)c, cc);
   1470             exit(U_INTERNAL_PROGRAM_ERROR);
   1471         }
   1472     }
   1473 
   1474     /* try to combine/consume c, return TRUE if it is consumed */
   1475     return combine((uint32_t)starter, c)>=0;
   1476 }
   1477 
   1478 /* does the starter s[0] combine forward with another char that is below trailCC? */
   1479 static UBool
   1480 canChangeWithFollowing(const uint32_t *s, int32_t length, uint8_t trailCC) {
   1481     if(trailCC<=1) {
   1482         /* no character will combine ahead of the trailing char of the decomposition */
   1483         return FALSE;
   1484     }
   1485 
   1486     /*
   1487      * We are only checking skippable condition (f).
   1488      * Therefore, the original character does not have quick check flag NFC_NO (c),
   1489      * i.e., the decomposition recomposes completely back into the original code point.
   1490      * So s[0] must be a true starter with cc==0 and
   1491      * combining with following code points.
   1492      *
   1493      * Similarly, length==1 is not possible because that would be a singleton
   1494      * decomposition which is marked with NFC_NO and does not pass (c).
   1495      *
   1496      * Only a character with cc<trailCC can change the composition.
   1497      * Reason: A char with cc>=trailCC would order after decomposition s[],
   1498      * composition would consume all of the decomposition, and here we know that
   1499      * the original char passed check d), i.e., it does not combine forward,
   1500      * therefore does not combine with anything after the decomposition is consumed.
   1501      *
   1502      * Now see if there is a character that
   1503      * 1. combines backward
   1504      * 2. has cc<trailCC
   1505      * 3. is consumed in recomposition
   1506      *
   1507      * length==2 is simple:
   1508      *
   1509      * Characters that fulfill these conditions are exactly the ones that combine directly
   1510      * with the starter c==s[0] because there is no intervening character after
   1511      * reordering.
   1512      * We can just enumerate all chars with which c combines (they all pass 1. and 3.)
   1513      * and see if one has cc<trailCC (passes 2.).
   1514      *
   1515      * length>2 is a little harder:
   1516      *
   1517      * Since we will get different starters during recomposition, we need to
   1518      * enumerate each backward-combining character (1.)
   1519      * with cc<trailCC (2.) and
   1520      * see if it gets consumed in recomposition. (3.)
   1521      * No need to enumerate both-ways combining characters because they must have cc==0.
   1522      */
   1523     if(length==2) {
   1524         /* enumerate all chars that combine with this one and check their cc */
   1525         CombiningTriple *triples;
   1526         uint32_t c, i, count;
   1527         uint8_t cc;
   1528 
   1529         /* search for all triples with c as lead code point */
   1530         triples=utm_getStart(combiningTriplesMem);
   1531         count=utm_countItems(combiningTriplesMem);
   1532         c=s[0];
   1533 
   1534         /* triples are not sorted by code point but for each lead CP there is one contiguous block */
   1535         for(i=0; i<count && c!=triples[i].lead; ++i) {}
   1536 
   1537         /* check each triple for this code point */
   1538         for(; i<count && c==triples[i].lead; ++i) {
   1539             cc=getCCFromCP(triples[i].trail);
   1540             if(cc>0 && cc<trailCC) {
   1541                 /* this trail code point combines with c and has cc<trailCC */
   1542                 return TRUE;
   1543             }
   1544         }
   1545     } else {
   1546         /* enumerate all chars that combine backward */
   1547         uint32_t c2;
   1548         uint16_t i;
   1549         uint8_t cc;
   1550 
   1551         for(i=combineBothTop; i<combineBackTop; ++i) {
   1552             c2=combiningCPs[i]&0xffffff;
   1553             cc=getCCFromCP(c2);
   1554             /* pass in length-1 because we already know that c2 will insert before the last character with trailCC */
   1555             if(cc>0 && cc<trailCC && doesComposeConsume(s, length-1, c2, cc)) {
   1556                 return TRUE;
   1557             }
   1558         }
   1559     }
   1560 
   1561     /* this decomposition is not modified by any appended character */
   1562     return FALSE;
   1563 }
   1564 
   1565 /* see unormimp.h for details on NF*C Skippable flags */
   1566 static uint32_t
   1567 getSkippableFlags(const Norm *norm) {
   1568     /* ignore NF*D skippable properties because they are covered by norm32, test at runtime */
   1569 
   1570     /* ignore Hangul, test those at runtime (LV Hangul are not skippable) */
   1571     if(norm->specialTag==_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL) {
   1572         return 0;
   1573     }
   1574 
   1575     /* ### TODO check other data generation functions whether they should & do ignore Hangul/Jamo specials */
   1576 
   1577     /*
   1578      * Note:
   1579      * This function returns a non-zero flag only if (a)..(e) indicate skippable but (f) does not.
   1580      *
   1581      * This means that (a)..(e) must always be derived from the runtime norm32 value,
   1582      * and (f) be checked from the auxTrie if the character is skippable per (a)..(e),
   1583      * the form is NF*C and there is a canonical decomposition (NFD_NO).
   1584      *
   1585      * (a) unassigned code points get "not skippable"==false because they
   1586      * don't have a Norm struct so they won't get here
   1587      */
   1588 
   1589     /* (b) not skippable if cc!=0 */
   1590     if(norm->udataCC!=0) {
   1591         return 0; /* non-zero flag for (f) only */
   1592     }
   1593 
   1594     /*
   1595      * not NFC_Skippable if
   1596      * (c) quick check flag == NO  or
   1597      * (d) combines forward  or
   1598      * (e) combines back or
   1599      * (f) can change if another character is added
   1600      *
   1601      * for (f):
   1602      * For NF*C: Get corresponding decomposition, get its last starter (cc==0),
   1603      *           check its composition list,
   1604      *           see if any of the second code points in the list
   1605      *           has cc less than the trailCC of the decomposition.
   1606      *
   1607      * For FCC: Test at runtime if the decomposition has a trailCC>1
   1608      *          -> there are characters with cc==1, they would order before the trail char
   1609      *          and prevent contiguous combination with the trail char.
   1610      */
   1611     if( (norm->qcFlags&(_NORM_QC_NFC&_NORM_QC_ANY_NO))!=0 ||
   1612         (norm->combiningFlags&3)!=0) {
   1613         return 0; /* non-zero flag for (f) only */
   1614     }
   1615     if(norm->lenNFD!=0 && canChangeWithFollowing(norm->nfd, norm->lenNFD, (uint8_t)norm->canonBothCCs)) {
   1616         return _NORM_AUX_NFC_SKIP_F_MASK;
   1617     }
   1618 
   1619     return 0; /* skippable */
   1620 }
   1621 
   1622 static void
   1623 makeAux() {
   1624     Norm *norm;
   1625     uint32_t *pData;
   1626     int32_t i, length;
   1627 
   1628     pData=utrie_getData(auxTrie, &length);
   1629 
   1630     for(i=0; i<length; ++i) {
   1631         norm=norms+pData[i];
   1632         /*
   1633          * 16-bit auxiliary normalization properties
   1634          * see unormimp.h
   1635          */
   1636         pData[i]=
   1637             ((uint32_t)(norm->combiningFlags&0x80)<<(_NORM_AUX_COMP_EX_SHIFT-7))|
   1638             (uint32_t)norm->fncIndex;
   1639 
   1640         if(norm->unsafeStart || norm->udataCC!=0) {
   1641             pData[i]|=_NORM_AUX_UNSAFE_MASK;
   1642         }
   1643 
   1644         pData[i]|=getSkippableFlags(norm);
   1645     }
   1646 }
   1647 
   1648 /* folding value for normalization: just store the offset (16 bits) if there is any non-0 entry */
   1649 static uint32_t U_CALLCONV
   1650 getFoldedNormValue(UNewTrie *trie, UChar32 start, int32_t offset) {
   1651     uint32_t value, leadNorm32=0;
   1652     UChar32 limit;
   1653     UBool inBlockZero;
   1654 
   1655     limit=start+0x400;
   1656     while(start<limit) {
   1657         value=utrie_get32(trie, start, &inBlockZero);
   1658         if(inBlockZero) {
   1659             start+=UTRIE_DATA_BLOCK_LENGTH;
   1660         } else {
   1661             if(value!=0) {
   1662                 leadNorm32|=value;
   1663             }
   1664             ++start;
   1665         }
   1666     }
   1667 
   1668     /* turn multi-bit fields into the worst-case value */
   1669     if(leadNorm32&_NORM_CC_MASK) {
   1670         leadNorm32|=_NORM_CC_MASK;
   1671     }
   1672 
   1673     /* clean up unnecessarily ored bit fields */
   1674     leadNorm32&=~((uint32_t)0xffffffff<<_NORM_EXTRA_SHIFT);
   1675 
   1676     if(leadNorm32==0) {
   1677         /* nothing to do (only composition exclusions?) */
   1678         return 0;
   1679     }
   1680 
   1681     /* add the extra surrogate index, offset by the BMP top, for the new stage 1 location */
   1682     leadNorm32|=(
   1683         (uint32_t)_NORM_EXTRA_INDEX_TOP+
   1684         (uint32_t)((offset-UTRIE_BMP_INDEX_LENGTH)>>UTRIE_SURROGATE_BLOCK_BITS)
   1685     )<<_NORM_EXTRA_SHIFT;
   1686 
   1687     return leadNorm32;
   1688 }
   1689 
   1690 /* folding value for FCD: use default function (just store the offset (16 bits) if there is any non-0 entry) */
   1691 
   1692 /*
   1693  * folding value for auxiliary data:
   1694  * store the non-zero offset in bits 9..0 (FNC bits)
   1695  * if there is any non-0 entry;
   1696  * "or" [verb!] together data bits 15..10 of all of the 1024 supplementary code points
   1697  */
   1698 static uint32_t U_CALLCONV
   1699 getFoldedAuxValue(UNewTrie *trie, UChar32 start, int32_t offset) {
   1700     uint32_t value, oredValues;
   1701     UChar32 limit;
   1702     UBool inBlockZero;
   1703 
   1704     oredValues=0;
   1705     limit=start+0x400;
   1706     while(start<limit) {
   1707         value=utrie_get32(trie, start, &inBlockZero);
   1708         if(inBlockZero) {
   1709             start+=UTRIE_DATA_BLOCK_LENGTH;
   1710         } else {
   1711             oredValues|=value;
   1712             ++start;
   1713         }
   1714     }
   1715 
   1716     if(oredValues!=0) {
   1717         /* move the 10 significant offset bits into bits 9..0 */
   1718         offset>>=UTRIE_SURROGATE_BLOCK_BITS;
   1719         if(offset>_NORM_AUX_FNC_MASK) {
   1720             fprintf(stderr, "gennorm error: folding offset too large (auxTrie)\n");
   1721             exit(U_INDEX_OUTOFBOUNDS_ERROR);
   1722         }
   1723         return (uint32_t)offset|(oredValues&~_NORM_AUX_FNC_MASK);
   1724     } else {
   1725         return 0;
   1726     }
   1727 }
   1728 
   1729 extern void
   1730 processData() {
   1731 #if 0
   1732     uint16_t i;
   1733 #endif
   1734 
   1735     processCombining();
   1736 
   1737     /* canonically reorder decompositions and assign combining classes for decompositions */
   1738     enumTrie(postParseFn, NULL);
   1739 
   1740 #if 0
   1741     for(i=1; i<64; ++i) {
   1742         if(combineAndQC[i]) {
   1743             printf("combiningFlags==0x%02x  qcFlags(NF?C)==0x%02x\n", (i&0xc)>>2, i&0x33);
   1744         }
   1745     }
   1746 #endif
   1747 
   1748     /* add hangul/jamo specials */
   1749     setHangulJamoSpecials();
   1750 
   1751     /* set this value; will be updated as makeCanonSetFn() adds sets (if there are any, see gStoreFlags) */
   1752     canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]=(uint16_t)canonStartSetsTop;
   1753 
   1754     /* store search tables and USerializedSets for canonical starters (after Hangul/Jamo specials!) */
   1755     if(DO_STORE(UGENNORM_STORE_AUX) && DO_STORE(UGENNORM_STORE_COMPOSITION)) {
   1756         enumTrie(makeCanonSetFn, NULL);
   1757     }
   1758 
   1759     /* clone the normalization builder trie to make the final data tries */
   1760     if( NULL==utrie_clone(norm32Trie, normTrie, NULL, 0) ||
   1761         NULL==utrie_clone(fcdTrie, normTrie, NULL, 0) ||
   1762         NULL==utrie_clone(auxTrie, normTrie, NULL, 0)
   1763     ) {
   1764         fprintf(stderr, "error: unable to clone the normalization trie\n");
   1765         exit(U_MEMORY_ALLOCATION_ERROR);
   1766     }
   1767 
   1768     /* --- finalize data for quick checks & normalization --- */
   1769 
   1770     /* turn the Norm structs (stage2, norms) into 32-bit data words */
   1771     makeAll32();
   1772 
   1773     /* --- finalize data for FCD checks --- */
   1774 
   1775     /* FCD data: take Norm.canonBothCCs and store them in the FCD table */
   1776     makeFCD();
   1777 
   1778     /* --- finalize auxiliary normalization data --- */
   1779     makeAux();
   1780 
   1781     if(beVerbose) {
   1782 #if 0
   1783         printf("number of stage 2 entries: %ld\n", stage2Mem->index);
   1784         printf("size of stage 1 (BMP) & 2 (uncompacted) + extra data: %ld bytes\n", _NORM_STAGE_1_BMP_COUNT*2+stage2Mem->index*4+extraMem->index*2);
   1785 #endif
   1786         printf("combining CPs tops: fwd %u  both %u  back %u\n", combineFwdTop, combineBothTop, combineBackTop);
   1787         printf("combining table count: %u\n", combiningTableTop);
   1788     }
   1789 }
   1790 
   1791 /* is this a norm32 with a special index for a lead surrogate? */
   1792 static U_INLINE UBool
   1793 isNorm32LeadSurrogate(uint32_t norm32) {
   1794     return _NORM_MIN_SPECIAL<=norm32 && norm32<_NORM_SURROGATES_TOP;
   1795 }
   1796 
   1797 /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
   1798 static int32_t U_CALLCONV
   1799 getFoldingNormOffset(uint32_t norm32) {
   1800     if(isNorm32LeadSurrogate(norm32)) {
   1801         return
   1802             UTRIE_BMP_INDEX_LENGTH+
   1803                 (((int32_t)norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))&
   1804                  (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS));
   1805     } else {
   1806         return 0;
   1807     }
   1808 }
   1809 
   1810 /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
   1811 static int32_t U_CALLCONV
   1812 getFoldingAuxOffset(uint32_t data) {
   1813     return (int32_t)(data&_NORM_AUX_FNC_MASK)<<UTRIE_SURROGATE_BLOCK_BITS;
   1814 }
   1815 
   1816 #endif /* #if !UCONFIG_NO_NORMALIZATION */
   1817 
   1818 extern void
   1819 generateData(const char *dataDir, UBool csource) {
   1820     static uint8_t normTrieBlock[100000], fcdTrieBlock[100000], auxTrieBlock[100000];
   1821 
   1822     UNewDataMemory *pData;
   1823     UErrorCode errorCode=U_ZERO_ERROR;
   1824     int32_t size, dataLength;
   1825 
   1826 #if UCONFIG_NO_NORMALIZATION
   1827 
   1828     size=0;
   1829 
   1830 #else
   1831 
   1832     U_STRING_DECL(nxCJKCompatPattern, "[:Ideographic:]", 15);
   1833     U_STRING_DECL(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
   1834     USet *set;
   1835     int32_t normTrieSize, fcdTrieSize, auxTrieSize;
   1836 
   1837     normTrieSize=utrie_serialize(norm32Trie, normTrieBlock, sizeof(normTrieBlock), getFoldedNormValue, FALSE, &errorCode);
   1838     if(U_FAILURE(errorCode)) {
   1839         fprintf(stderr, "error: utrie_serialize(normalization properties) failed, %s\n", u_errorName(errorCode));
   1840         exit(errorCode);
   1841     }
   1842 
   1843     if(DO_STORE(UGENNORM_STORE_FCD)) {
   1844         fcdTrieSize=utrie_serialize(fcdTrie, fcdTrieBlock, sizeof(fcdTrieBlock), NULL, TRUE, &errorCode);
   1845         if(U_FAILURE(errorCode)) {
   1846             fprintf(stderr, "error: utrie_serialize(FCD data) failed, %s\n", u_errorName(errorCode));
   1847             exit(errorCode);
   1848         }
   1849     } else {
   1850         fcdTrieSize=0;
   1851     }
   1852 
   1853     if(DO_STORE(UGENNORM_STORE_AUX)) {
   1854         auxTrieSize=utrie_serialize(auxTrie, auxTrieBlock, sizeof(auxTrieBlock), getFoldedAuxValue, TRUE, &errorCode);
   1855         if(U_FAILURE(errorCode)) {
   1856             fprintf(stderr, "error: utrie_serialize(auxiliary data) failed, %s\n", u_errorName(errorCode));
   1857             exit(errorCode);
   1858         }
   1859     } else {
   1860         auxTrieSize=0;
   1861     }
   1862 
   1863     /* move the parts of canonStartSets[] together into a contiguous block */
   1864     if( canonStartSetsTop<_NORM_MAX_CANON_SETS &&
   1865         canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]!=0
   1866     ) {
   1867         uprv_memmove(canonStartSets+canonStartSetsTop,
   1868                      canonStartSets+_NORM_MAX_CANON_SETS,
   1869                      canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]*2);
   1870     }
   1871     canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
   1872 
   1873     if( canonStartSetsTop<(_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH) &&
   1874         canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]!=0
   1875     ) {
   1876         uprv_memmove(canonStartSets+canonStartSetsTop,
   1877                      canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH,
   1878                      canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]*2);
   1879     }
   1880     canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
   1881 
   1882     /* create the normalization exclusion sets */
   1883     /*
   1884      * nxCJKCompatPattern should be [[:Ideographic:]&[:NFD_QC=No:]]
   1885      * but we cannot use NFD_QC from the pattern because that would require
   1886      * unorm.icu which we are just going to generate.
   1887      * Therefore we have manually collected nfdQCNoSet and intersect Ideographic
   1888      * with that.
   1889      */
   1890     U_STRING_INIT(nxCJKCompatPattern, "[:Ideographic:]", 15);
   1891     U_STRING_INIT(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
   1892 
   1893     canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]=canonStartSetsTop;
   1894     set=uset_openPattern(nxCJKCompatPattern, -1, &errorCode);
   1895     if(U_FAILURE(errorCode)) {
   1896         fprintf(stderr, "error: uset_openPattern([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode));
   1897         exit(errorCode);
   1898     }
   1899     uset_retainAll(set, nfdQCNoSet);
   1900     if(DO_NOT_STORE(UGENNORM_STORE_EXCLUSIONS)) {
   1901         uset_clear(set);
   1902     }
   1903     canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
   1904     if(U_FAILURE(errorCode)) {
   1905         fprintf(stderr, "error: uset_serialize([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode));
   1906         exit(errorCode);
   1907     }
   1908     uset_close(set);
   1909 
   1910     canonStartSets[_NORM_SET_INDEX_NX_UNICODE32_OFFSET]=canonStartSetsTop;
   1911     set=uset_openPattern(nxUnicode32Pattern, -1, &errorCode);
   1912     if(U_FAILURE(errorCode)) {
   1913         fprintf(stderr, "error: uset_openPattern([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
   1914         exit(errorCode);
   1915     }
   1916     if(DO_NOT_STORE(UGENNORM_STORE_EXCLUSIONS)) {
   1917         uset_clear(set);
   1918     }
   1919     canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
   1920     if(U_FAILURE(errorCode)) {
   1921         fprintf(stderr, "error: uset_serialize([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
   1922         exit(errorCode);
   1923     }
   1924     uset_close(set);
   1925 
   1926     canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]=canonStartSetsTop;
   1927 
   1928     /* make sure that the FCD trie is 4-aligned */
   1929     if((utm_countItems(extraMem)+combiningTableTop)&1) {
   1930         combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */
   1931     }
   1932 
   1933     /* pad canonStartSets to 4-alignment, too */
   1934     if(canonStartSetsTop&1) {
   1935         canonStartSets[canonStartSetsTop++]=0x1235;
   1936     }
   1937 
   1938     size=
   1939         _NORM_INDEX_TOP*4+
   1940         normTrieSize+
   1941         utm_countItems(extraMem)*2+
   1942         combiningTableTop*2+
   1943         fcdTrieSize+
   1944         auxTrieSize+
   1945         canonStartSetsTop*2;
   1946 
   1947     if(beVerbose) {
   1948         printf("size of normalization trie              %5u bytes\n", (int)normTrieSize);
   1949         printf("size of 16-bit extra memory             %5u UChars/uint16_t\n", (int)utm_countItems(extraMem));
   1950         printf("  of that: FC_NFKC_Closure size         %5u UChars/uint16_t\n", ((uint16_t *)utm_getStart(extraMem))[0]);
   1951         printf("size of combining table                 %5u uint16_t\n", combiningTableTop);
   1952         printf("size of FCD trie                        %5u bytes\n", (int)fcdTrieSize);
   1953         printf("size of auxiliary trie                  %5u bytes\n", (int)auxTrieSize);
   1954         printf("size of canonStartSets[]                %5u uint16_t\n", (int)canonStartSetsTop);
   1955         printf("  number of indexes                     %5u uint16_t\n", _NORM_SET_INDEX_TOP);
   1956         printf("  size of sets                          %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP);
   1957         printf("  number of sets                        %5d\n", (int)canonSetsCount);
   1958         printf("  size of BMP search table              %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]);
   1959         printf("  size of supplementary search table    %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]);
   1960         printf("  length of exclusion sets              %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]-canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]);
   1961         printf("size of " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " contents: %ld bytes\n", (long)size);
   1962     }
   1963 
   1964     indexes[_NORM_INDEX_TRIE_SIZE]=normTrieSize;
   1965     indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)utm_countItems(extraMem);
   1966 
   1967     indexes[_NORM_INDEX_COMBINE_DATA_COUNT]=combiningTableTop;
   1968     indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop;
   1969     indexes[_NORM_INDEX_COMBINE_BOTH_COUNT]=(uint16_t)(combineBothTop-combineFwdTop);
   1970     indexes[_NORM_INDEX_COMBINE_BACK_COUNT]=(uint16_t)(combineBackTop-combineBothTop);
   1971 
   1972     /* the quick check minimum code points are already set */
   1973 
   1974     indexes[_NORM_INDEX_FCD_TRIE_SIZE]=fcdTrieSize;
   1975     indexes[_NORM_INDEX_AUX_TRIE_SIZE]=auxTrieSize;
   1976     indexes[_NORM_INDEX_CANON_SET_COUNT]=canonStartSetsTop;
   1977 
   1978 #endif
   1979 
   1980     if(csource) {
   1981 #if UCONFIG_NO_NORMALIZATION
   1982         /* no csource for dummy mode..? */
   1983         fprintf(stderr, "gennorm error: UCONFIG_NO_NORMALIZATION is on in csource mode.\n");
   1984         exit(1);
   1985 #else
   1986         /* write .c file for hardcoded data */
   1987         UTrie normRuntimeTrie={ NULL }, fcdRuntimeTrie={ NULL }, auxRuntimeTrie={ NULL };
   1988         UTrie2 *normRuntimeTrie2, *fcdRuntimeTrie2=NULL, *auxRuntimeTrie2=NULL;
   1989         FILE *f;
   1990 
   1991         utrie_unserialize(&normRuntimeTrie, normTrieBlock, normTrieSize, &errorCode);
   1992         normRuntimeTrie.getFoldingOffset=getFoldingNormOffset;
   1993         if(fcdTrieSize>0) {
   1994             utrie_unserialize(&fcdRuntimeTrie, fcdTrieBlock, fcdTrieSize, &errorCode);
   1995         }
   1996         if(auxTrieSize>0) {
   1997             utrie_unserialize(&auxRuntimeTrie, auxTrieBlock, auxTrieSize, &errorCode);
   1998             auxRuntimeTrie.getFoldingOffset=getFoldingAuxOffset;
   1999         }
   2000         if(U_FAILURE(errorCode)) {
   2001             fprintf(
   2002                 stderr,
   2003                 "gennorm error: failed to utrie_unserialize() one of the tries - %s\n",
   2004                 u_errorName(errorCode));
   2005             exit(errorCode);
   2006         }
   2007 
   2008         /* use UTrie2 */
   2009         dataInfo.formatVersion[0]=3;
   2010         dataInfo.formatVersion[2]=0;
   2011         dataInfo.formatVersion[3]=0;
   2012         normRuntimeTrie2=utrie2_fromUTrie(&normRuntimeTrie, 0, &errorCode);
   2013         if(fcdTrieSize>0) {
   2014             fcdRuntimeTrie2=utrie2_fromUTrie(&fcdRuntimeTrie, 0, &errorCode);
   2015         }
   2016         if(auxTrieSize>0) {
   2017             auxRuntimeTrie2=utrie2_fromUTrie(&auxRuntimeTrie, 0, &errorCode);
   2018         }
   2019         if(U_FAILURE(errorCode)) {
   2020             fprintf(
   2021                 stderr,
   2022                 "gennorm error: utrie2_fromUTrie() failed - %s\n",
   2023                 u_errorName(errorCode));
   2024             exit(errorCode);
   2025         }
   2026         if(auxTrieSize>0) {
   2027             /* delete lead surrogate code unit values */
   2028             UChar lead;
   2029             auxRuntimeTrie2=utrie2_cloneAsThawed(auxRuntimeTrie2, &errorCode);
   2030             for(lead=0xd800; lead<0xdc00; ++lead) {
   2031                 utrie2_set32ForLeadSurrogateCodeUnit(auxRuntimeTrie2, lead, auxRuntimeTrie2->initialValue, &errorCode);
   2032             }
   2033             utrie2_freeze(auxRuntimeTrie2, UTRIE2_16_VALUE_BITS, &errorCode);
   2034             if(U_FAILURE(errorCode)) {
   2035                 fprintf(
   2036                     stderr,
   2037                     "gennorm error: deleting lead surrogate code unit values failed - %s\n",
   2038                     u_errorName(errorCode));
   2039                 exit(errorCode);
   2040             }
   2041         }
   2042 
   2043         f=usrc_create(dataDir, "unorm_props_data.c");
   2044         if(f!=NULL) {
   2045             usrc_writeArray(f,
   2046                 "static const UVersionInfo formatVersion={ ",
   2047                 dataInfo.formatVersion, 8, 4,
   2048                 " };\n\n");
   2049             usrc_writeArray(f,
   2050                 "static const UVersionInfo dataVersion={ ",
   2051                 dataInfo.dataVersion, 8, 4,
   2052                 " };\n\n");
   2053             usrc_writeArray(f,
   2054                 "static const int32_t indexes[_NORM_INDEX_TOP]={\n",
   2055                 indexes, 32, _NORM_INDEX_TOP,
   2056                 "\n};\n\n");
   2057             usrc_writeUTrie2Arrays(f,
   2058                 "static const uint16_t normTrie_index[%ld]={\n",
   2059                 "static const uint32_t normTrie_data32[%ld]={\n",
   2060                 normRuntimeTrie2,
   2061                 "\n};\n\n");
   2062             usrc_writeUTrie2Struct(f,
   2063                 "static const UTrie2 normTrie={\n",
   2064                 normRuntimeTrie2, "normTrie_index", "normTrie_data32",
   2065                 "};\n\n");
   2066             usrc_writeArray(f,
   2067                 "static const uint16_t extraData[%ld]={\n",
   2068                 utm_getStart(extraMem), 16, utm_countItems(extraMem),
   2069                 "\n};\n\n");
   2070             usrc_writeArray(f,
   2071                 "static const uint16_t combiningTable[%ld]={\n",
   2072                 combiningTable, 16, combiningTableTop,
   2073                 "\n};\n\n");
   2074             if(fcdTrieSize>0) {
   2075                 usrc_writeUTrie2Arrays(f,
   2076                     "static const uint16_t fcdTrie_index[%ld]={\n", NULL,
   2077                     fcdRuntimeTrie2,
   2078                     "\n};\n\n");
   2079                 usrc_writeUTrie2Struct(f,
   2080                     "static const UTrie2 fcdTrie={\n",
   2081                     fcdRuntimeTrie2, "fcdTrie_index", NULL,
   2082                     "};\n\n");
   2083             } else {
   2084                 fputs( "static const UTrie2 fcdTrie={ NULL };\n\n", f);
   2085             }
   2086             if(auxTrieSize>0) {
   2087                 usrc_writeUTrie2Arrays(f,
   2088                     "static const uint16_t auxTrie_index[%ld]={\n", NULL,
   2089                     auxRuntimeTrie2,
   2090                     "\n};\n\n");
   2091                 usrc_writeUTrie2Struct(f,
   2092                     "static const UTrie2 auxTrie={\n",
   2093                     auxRuntimeTrie2, "auxTrie_index", NULL,
   2094                     "};\n\n");
   2095             } else {
   2096                 fputs( "static const UTrie2 auxTrie={ NULL };\n\n", f);
   2097             }
   2098             usrc_writeArray(f,
   2099                 "static const uint16_t canonStartSets[%ld]={\n",
   2100                 canonStartSets, 16, canonStartSetsTop,
   2101                 "\n};\n\n");
   2102             fclose(f);
   2103         }
   2104         utrie2_close(normRuntimeTrie2);
   2105         utrie2_close(fcdRuntimeTrie2);
   2106         utrie2_close(auxRuntimeTrie2);
   2107 #endif
   2108     } else {
   2109         /* write the data */
   2110         pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
   2111                         haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
   2112         if(U_FAILURE(errorCode)) {
   2113             fprintf(stderr, "gennorm: unable to create the output file, error %d\n", errorCode);
   2114             exit(errorCode);
   2115         }
   2116 
   2117 #if !UCONFIG_NO_NORMALIZATION
   2118 
   2119         udata_writeBlock(pData, indexes, sizeof(indexes));
   2120         udata_writeBlock(pData, normTrieBlock, normTrieSize);
   2121         udata_writeBlock(pData, utm_getStart(extraMem), utm_countItems(extraMem)*2);
   2122         udata_writeBlock(pData, combiningTable, combiningTableTop*2);
   2123         udata_writeBlock(pData, fcdTrieBlock, fcdTrieSize);
   2124         udata_writeBlock(pData, auxTrieBlock, auxTrieSize);
   2125         udata_writeBlock(pData, canonStartSets, canonStartSetsTop*2);
   2126 
   2127 #endif
   2128 
   2129         /* finish up */
   2130         dataLength=udata_finish(pData, &errorCode);
   2131         if(U_FAILURE(errorCode)) {
   2132             fprintf(stderr, "gennorm: error %d writing the output file\n", errorCode);
   2133             exit(errorCode);
   2134         }
   2135 
   2136         if(dataLength!=size) {
   2137             fprintf(stderr, "gennorm error: data length %ld != calculated size %ld\n",
   2138                 (long)dataLength, (long)size);
   2139             exit(U_INTERNAL_PROGRAM_ERROR);
   2140         }
   2141     }
   2142 }
   2143 
   2144 #if !UCONFIG_NO_NORMALIZATION
   2145 
   2146 extern void
   2147 cleanUpData(void) {
   2148     int32_t i, count;
   2149 
   2150     count=utm_countItems(normMem);
   2151     for(i=0; i<count; ++i) {
   2152         uset_close(norms[i].canonStart);
   2153     }
   2154 
   2155     utm_close(normMem);
   2156     utm_close(utf32Mem);
   2157     utm_close(extraMem);
   2158     utm_close(combiningTriplesMem);
   2159     utrie_close(normTrie);
   2160     utrie_close(norm32Trie);
   2161     utrie_close(fcdTrie);
   2162     utrie_close(auxTrie);
   2163 
   2164     uset_close(nfdQCNoSet);
   2165 
   2166     uprv_free(normTrie);
   2167     uprv_free(norm32Trie);
   2168     uprv_free(fcdTrie);
   2169     uprv_free(auxTrie);
   2170 }
   2171 
   2172 #endif /* #if !UCONFIG_NO_NORMALIZATION */
   2173 
   2174 /*
   2175  * Hey, Emacs, please set the following:
   2176  *
   2177  * Local Variables:
   2178  * indent-tabs-mode: nil
   2179  * End:
   2180  *
   2181  */
   2182