1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2001-2008, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: unormimp.h 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2001may25 14 * created by: Markus W. Scherer 15 */ 16 17 #ifndef __UNORMIMP_H__ 18 #define __UNORMIMP_H__ 19 20 #include "unicode/utypes.h" 21 22 #if !UCONFIG_NO_NORMALIZATION 23 24 #ifdef XP_CPLUSPLUS 25 #include "unicode/uniset.h" 26 #endif 27 28 #include "unicode/uiter.h" 29 #include "unicode/unorm.h" 30 #include "unicode/uset.h" 31 #include "utrie2.h" 32 #include "ustr_imp.h" 33 #include "udataswp.h" 34 35 /* 36 * This new implementation of the normalization code loads its data from 37 * unorm.icu, which is generated with the gennorm tool. 38 * The format of that file is described at the end of this file. 39 */ 40 41 /* norm32 value constants */ 42 enum { 43 /* quick check flags 0..3 set mean "no" for their forms */ 44 _NORM_QC_NFC=0x11, /* no|maybe */ 45 _NORM_QC_NFKC=0x22, /* no|maybe */ 46 _NORM_QC_NFD=4, /* no */ 47 _NORM_QC_NFKD=8, /* no */ 48 49 _NORM_QC_ANY_NO=0xf, 50 51 /* quick check flags 4..5 mean "maybe" for their forms; test flags>=_NORM_QC_MAYBE */ 52 _NORM_QC_MAYBE=0x10, 53 _NORM_QC_ANY_MAYBE=0x30, 54 55 _NORM_QC_MASK=0x3f, 56 57 _NORM_COMBINES_FWD=0x40, 58 _NORM_COMBINES_BACK=0x80, 59 _NORM_COMBINES_ANY=0xc0, 60 61 _NORM_CC_SHIFT=8, /* UnicodeData.txt combining class in bits 15..8 */ 62 _NORM_CC_MASK=0xff00, 63 64 _NORM_EXTRA_SHIFT=16, /* 16 bits for the index to UChars and other extra data */ 65 _NORM_EXTRA_INDEX_TOP=0xfc00, /* start of surrogate specials after shift */ 66 67 _NORM_EXTRA_SURROGATE_MASK=0x3ff, 68 _NORM_EXTRA_SURROGATE_TOP=0x3f0, /* hangul etc. 
*/ 69 70 _NORM_EXTRA_HANGUL=_NORM_EXTRA_SURROGATE_TOP, 71 _NORM_EXTRA_JAMO_L, 72 _NORM_EXTRA_JAMO_V, 73 _NORM_EXTRA_JAMO_T 74 }; 75 76 /* norm32 value constants using >16 bits */ 77 #define _NORM_MIN_SPECIAL 0xfc000000 78 #define _NORM_SURROGATES_TOP 0xfff00000 79 #define _NORM_MIN_HANGUL 0xfff00000 80 #define _NORM_MIN_JAMO_V 0xfff20000 81 #define _NORM_JAMO_V_TOP 0xfff30000 82 83 /* value constants for auxTrie */ 84 enum { 85 _NORM_AUX_COMP_EX_SHIFT=10, 86 _NORM_AUX_UNSAFE_SHIFT=11, 87 _NORM_AUX_NFC_SKIPPABLE_F_SHIFT=12 88 }; 89 90 #define _NORM_AUX_MAX_FNC ((int32_t)1<<_NORM_AUX_COMP_EX_SHIFT) 91 92 #define _NORM_AUX_FNC_MASK (uint32_t)(_NORM_AUX_MAX_FNC-1) 93 #define _NORM_AUX_COMP_EX_MASK ((uint32_t)1<<_NORM_AUX_COMP_EX_SHIFT) 94 #define _NORM_AUX_UNSAFE_MASK ((uint32_t)1<<_NORM_AUX_UNSAFE_SHIFT) 95 #define _NORM_AUX_NFC_SKIP_F_MASK ((uint32_t)1<<_NORM_AUX_NFC_SKIPPABLE_F_SHIFT) 96 97 /* canonStartSets[0..31] contains indexes for what is in the array */ 98 enum { 99 _NORM_SET_INDEX_CANON_SETS_LENGTH, /* number of uint16_t in canonical starter sets */ 100 _NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH, /* number of uint16_t in the BMP search table (contains pairs) */ 101 _NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH,/* number of uint16_t in the supplementary search table (contains triplets) */ 102 103 /* from formatVersion 2.3: */ 104 _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET, /* uint16_t offset from canonStartSets[0] to the 105 exclusion set for CJK compatibility characters */ 106 _NORM_SET_INDEX_NX_UNICODE32_OFFSET, /* uint16_t offset from canonStartSets[0] to the 107 exclusion set for Unicode 3.2 characters */ 108 _NORM_SET_INDEX_NX_RESERVED_OFFSET, /* uint16_t offset from canonStartSets[0] to the 109 end of the previous exclusion set */ 110 111 _NORM_SET_INDEX_TOP=32 /* changing this requires a new formatVersion */ 112 }; 113 114 /* more constants for canonical starter sets */ 115 116 /* 14 bit indexes to canonical USerializedSets */ 117 #define _NORM_MAX_CANON_SETS 0x4000 
118 119 /* single-code point BMP sets are encoded directly in the search table except if result=0x4000..0x7fff */ 120 #define _NORM_CANON_SET_BMP_MASK 0xc000 121 #define _NORM_CANON_SET_BMP_IS_INDEX 0x4000 122 123 /* indexes[] value names */ 124 enum { 125 _NORM_INDEX_TRIE_SIZE, /* number of bytes in normalization trie */ 126 _NORM_INDEX_UCHAR_COUNT, /* number of UChars in extra data */ 127 128 _NORM_INDEX_COMBINE_DATA_COUNT, /* number of uint16_t words for combining data */ 129 _NORM_INDEX_COMBINE_FWD_COUNT, /* number of code points that combine forward */ 130 _NORM_INDEX_COMBINE_BOTH_COUNT, /* number of code points that combine forward and backward */ 131 _NORM_INDEX_COMBINE_BACK_COUNT, /* number of code points that combine backward */ 132 133 _NORM_INDEX_MIN_NFC_NO_MAYBE, /* first code point with quick check NFC NO/MAYBE */ 134 _NORM_INDEX_MIN_NFKC_NO_MAYBE, /* first code point with quick check NFKC NO/MAYBE */ 135 _NORM_INDEX_MIN_NFD_NO_MAYBE, /* first code point with quick check NFD NO/MAYBE */ 136 _NORM_INDEX_MIN_NFKD_NO_MAYBE, /* first code point with quick check NFKD NO/MAYBE */ 137 138 _NORM_INDEX_FCD_TRIE_SIZE, /* number of bytes in FCD trie */ 139 140 _NORM_INDEX_AUX_TRIE_SIZE, /* number of bytes in the auxiliary trie */ 141 _NORM_INDEX_CANON_SET_COUNT, /* number of uint16_t in the array of serialized USet */ 142 143 _NORM_INDEX_TOP=32 /* changing this requires a new formatVersion */ 144 }; 145 146 enum { 147 /* FCD check: everything below this code point is known to have a 0 lead combining class */ 148 _NORM_MIN_WITH_LEAD_CC=0x300 149 }; 150 151 enum { 152 /** 153 * Bit 7 of the length byte for a decomposition string in extra data is 154 * a flag indicating whether the decomposition string is 155 * preceded by a 16-bit word with the leading and trailing cc 156 * of the decomposition (like for A-umlaut); 157 * if not, then both cc's are zero (like for compatibility ideographs). 
158 */ 159 _NORM_DECOMP_FLAG_LENGTH_HAS_CC=0x80, 160 /** 161 * Bits 6..0 of the length byte contain the actual length. 162 */ 163 _NORM_DECOMP_LENGTH_MASK=0x7f 164 }; 165 166 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 167 168 /* Korean Hangul and Jamo constants */ 169 enum { 170 JAMO_L_BASE=0x1100, /* "lead" jamo */ 171 JAMO_V_BASE=0x1161, /* "vowel" jamo */ 172 JAMO_T_BASE=0x11a7, /* "trail" jamo */ 173 174 HANGUL_BASE=0xac00, 175 176 JAMO_L_COUNT=19, 177 JAMO_V_COUNT=21, 178 JAMO_T_COUNT=28, 179 180 HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT 181 }; 182 183 #if !UCONFIG_NO_NORMALIZATION 184 185 /* Constants for options flags for normalization. @draft ICU 2.6 */ 186 enum { 187 /** Options bit 0, do not decompose Hangul syllables. @draft ICU 2.6 */ 188 UNORM_NX_HANGUL=1, 189 /** Options bit 1, do not decompose CJK compatibility characters. @draft ICU 2.6 */ 190 UNORM_NX_CJK_COMPAT=2, 191 /** 192 * Options bit 8, use buggy recomposition described in 193 * Unicode Public Review Issue #29 194 * at http://www.unicode.org/review/resolved-pri.html#pri29 195 * 196 * Used in IDNA implementation according to strict interpretation 197 * of IDNA definition based on Unicode 3.2 which predates PRI #29. 198 */ 199 UNORM_BEFORE_PRI_29=0x100 200 }; 201 202 /** 203 * Is the normalizer data loaded? 204 * This is used internally before other internal normalizer functions 205 * are called. 206 * It saves this check in each of many normalization calls that 207 * are made for, e.g., collation. 208 * 209 * @param pErrorCode as usual 210 * @return boolean value for whether the normalization data is loaded 211 * 212 * @internal 213 */ 214 U_CAPI UBool U_EXPORT2 215 unorm_haveData(UErrorCode *pErrorCode); 216 217 /** 218 * Internal API for normalizing. 219 * Does not check for bad input. 
220 * @internal 221 */ 222 U_CAPI int32_t U_EXPORT2 223 unorm_internalNormalize(UChar *dest, int32_t destCapacity, 224 const UChar *src, int32_t srcLength, 225 UNormalizationMode mode, int32_t options, 226 UErrorCode *pErrorCode); 227 228 #ifdef XP_CPLUSPLUS 229 230 /** 231 * Internal API for normalizing. 232 * Does not check for bad input. 233 * Requires _haveData() to be true. 234 * @internal 235 */ 236 U_CFUNC int32_t 237 unorm_internalNormalizeWithNX(UChar *dest, int32_t destCapacity, 238 const UChar *src, int32_t srcLength, 239 UNormalizationMode mode, int32_t options, const U_NAMESPACE_QUALIFIER UnicodeSet *nx, 240 UErrorCode *pErrorCode); 241 242 #endif 243 244 /** 245 * internal API, used by normlzr.cpp 246 * @internal 247 */ 248 U_CAPI int32_t U_EXPORT2 249 unorm_decompose(UChar *dest, int32_t destCapacity, 250 const UChar *src, int32_t srcLength, 251 UBool compat, int32_t options, 252 UErrorCode *pErrorCode); 253 254 /** 255 * internal API, used by normlzr.cpp 256 * @internal 257 */ 258 U_CAPI int32_t U_EXPORT2 259 unorm_compose(UChar *dest, int32_t destCapacity, 260 const UChar *src, int32_t srcLength, 261 UBool compat, int32_t options, 262 UErrorCode *pErrorCode); 263 264 #ifdef XP_CPLUSPLUS 265 266 /** 267 * internal API, used by unormcmp.cpp 268 * @internal 269 */ 270 U_CFUNC UNormalizationCheckResult 271 unorm_internalQuickCheck(const UChar *src, 272 int32_t srcLength, 273 UNormalizationMode mode, 274 UBool allowMaybe, 275 const U_NAMESPACE_QUALIFIER UnicodeSet *nx, 276 UErrorCode *pErrorCode); 277 278 #endif 279 280 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 281 282 /** 283 * Internal option for unorm_cmpEquivFold() for decomposing. 284 * If not set, just do strcasecmp(). 285 * @internal 286 */ 287 #define _COMPARE_EQUIV 0x80000 288 289 #ifndef U_COMPARE_IGNORE_CASE 290 /* see also unorm.h */ 291 /** 292 * Option bit for unorm_compare: 293 * Perform case-insensitive comparison. 
294 * @draft ICU 2.2 295 */ 296 #define U_COMPARE_IGNORE_CASE 0x10000 297 #endif 298 299 /** 300 * Internal option for unorm_cmpEquivFold() for strncmp style. 301 * If set, checks for both string length and terminating NUL. 302 * @internal 303 */ 304 #define _STRNCMP_STYLE 0x1000 305 306 #if !UCONFIG_NO_NORMALIZATION 307 308 /** 309 * Internal API to get the 16-bit FCD value (lccc + tccc) for c, 310 * for u_getIntPropertyValue(). 311 * @internal 312 */ 313 U_CFUNC uint16_t U_EXPORT2 314 unorm_getFCD16FromCodePoint(UChar32 c); 315 316 #ifdef XP_CPLUSPLUS 317 318 /** 319 * Internal API, used by collation code. 320 * Get access to the internal FCD trie table to be able to perform 321 * incremental, per-code unit, FCD checks in collation. 322 * One pointer is sufficient because the trie index values are offset 323 * by the index size, so that the same pointer is used to access the trie data. 324 * Code points at fcdHighStart and above have a zero FCD value. 325 * @internal 326 */ 327 U_CAPI const uint16_t * U_EXPORT2 328 unorm_getFCDTrieIndex(UChar32 &fcdHighStart, UErrorCode *pErrorCode); 329 330 /** 331 * Internal API, used by collation code. 332 * Get the FCD value for a code unit, with 333 * bits 15..8 lead combining class 334 * bits 7..0 trail combining class 335 * 336 * If c is a lead surrogate and the value is not 0, 337 * then some of c's associated supplementary code points have a non-zero FCD value. 338 * 339 * @internal 340 */ 341 static inline uint16_t 342 unorm_getFCD16(const uint16_t *fcdTrieIndex, UChar c) { 343 return fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)]; 344 } 345 346 /** 347 * Internal API, used by collation code. 
348 * Get the FCD value of the next code point (post-increment), with 349 * bits 15..8 lead combining class 350 * bits 7..0 trail combining class 351 * 352 * @internal 353 */ 354 static inline uint16_t 355 unorm_nextFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart, 356 const UChar *&s, const UChar *limit) { 357 UChar32 c=*s++; 358 uint16_t fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)]; 359 if(fcd!=0 && U16_IS_LEAD(c)) { 360 UChar c2; 361 if(s!=limit && U16_IS_TRAIL(c2=*s)) { 362 ++s; 363 c=U16_GET_SUPPLEMENTARY(c, c2); 364 if(c<fcdHighStart) { 365 fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_SUPP(fcdTrieIndex, c)]; 366 } else { 367 fcd=0; 368 } 369 } else /* unpaired lead surrogate */ { 370 fcd=0; 371 } 372 } 373 return fcd; 374 } 375 376 /** 377 * Internal API, used by collation code. 378 * Get the FCD value of the previous code point (pre-decrement), with 379 * bits 15..8 lead combining class 380 * bits 7..0 trail combining class 381 * 382 * @internal 383 */ 384 static inline uint16_t 385 unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart, 386 const UChar *start, const UChar *&s) { 387 UChar32 c=*--s; 388 uint16_t fcd; 389 if(!U16_IS_SURROGATE(c)) { 390 fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)]; 391 } else { 392 UChar c2; 393 if(U16_IS_SURROGATE_TRAIL(c) && s!=start && U16_IS_LEAD(c2=*(s-1))) { 394 --s; 395 c=U16_GET_SUPPLEMENTARY(c2, c); 396 if(c<fcdHighStart) { 397 fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_SUPP(fcdTrieIndex, c)]; 398 } else { 399 fcd=0; 400 } 401 } else /* unpaired surrogate */ { 402 fcd=0; 403 } 404 } 405 return fcd; 406 } 407 408 #endif 409 410 /** 411 * internal API, used by StringPrep 412 * @internal 413 */ 414 U_CAPI void U_EXPORT2 415 unorm_getUnicodeVersion(UVersionInfo *versionInfo, UErrorCode *pErrorCode); 416 417 /** 418 * Get the canonical decomposition for one code point. 419 * Requires unorm_haveData() and buffer!=NULL and pLength!=NULL. 
420 * @param c code point 421 * @param buffer out-only buffer for algorithmic decompositions of Hangul 422 * @param length out-only, takes the length of the decomposition, if any 423 * @return pointer to decomposition, or 0 if none 424 * @internal 425 */ 426 U_CFUNC const UChar * 427 unorm_getCanonicalDecomposition(UChar32 c, UChar buffer[4], int32_t *pLength); 428 429 /** 430 * internal API, used by the canonical iterator 431 * TODO Consider using signature similar to unorm_getCanonicalDecomposition() 432 * for more efficiency 433 * @internal 434 */ 435 U_CAPI int32_t U_EXPORT2 436 unorm_getDecomposition(UChar32 c, UBool compat, 437 UChar *dest, int32_t destCapacity); 438 439 /** 440 * internal API, used by uprops.cpp 441 * @internal 442 */ 443 U_CFUNC UBool U_EXPORT2 444 unorm_internalIsFullCompositionExclusion(UChar32 c); 445 446 /** 447 * Internal API, used by enumeration of canonically equivalent strings 448 * @internal 449 */ 450 U_CFUNC UBool U_EXPORT2 451 unorm_isCanonSafeStart(UChar32 c); 452 453 /** 454 * Internal API, used by enumeration of canonically equivalent strings 455 * @internal 456 */ 457 U_CAPI UBool U_EXPORT2 458 unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet); 459 460 /** 461 * Is c an NF<mode>-skippable code point? See unormimp.h. 462 * @internal 463 */ 464 U_CAPI UBool U_EXPORT2 465 unorm_isNFSkippable(UChar32 c, UNormalizationMode mode); 466 467 #ifdef XP_CPLUSPLUS 468 469 /** 470 * Get normalization exclusion set for the options. 471 * Requires unorm_haveData(). 472 * @internal 473 */ 474 U_CFUNC const U_NAMESPACE_QUALIFIER UnicodeSet * 475 unorm_getNX(int32_t options, UErrorCode *pErrorCode); 476 477 #endif 478 479 /** 480 * Enumerate each normalization data trie and add the 481 * start of each range of same properties to the set. 482 * @internal 483 */ 484 U_CAPI void U_EXPORT2 485 unorm_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode); 486 487 /** 488 * Swap unorm.icu. See udataswp.h. 
489 * @internal 490 */ 491 U_CAPI int32_t U_EXPORT2 492 unorm_swap(const UDataSwapper *ds, 493 const void *inData, int32_t length, void *outData, 494 UErrorCode *pErrorCode); 495 496 /** 497 * Get the NF*_QC property for a code point, for u_getIntPropertyValue(). 498 * @internal 499 */ 500 U_CFUNC UNormalizationCheckResult U_EXPORT2 501 unorm_getQuickCheck(UChar32 c, UNormalizationMode mode); 502 503 /** 504 * Description of the format of unorm.icu version 2.3. 505 * 506 * Main change from version 1 to version 2: 507 * Use of new, common UTrie instead of normalization-specific tries. 508 * Change to version 2.1: add third/auxiliary trie with associated data. 509 * Change to version 2.2: add skippable (f) flag data (_NORM_AUX_NFC_SKIP_F_MASK). 510 * Change to version 2.3: add serialized sets for normalization exclusions 511 * stored inside canonStartSets[] 512 * 513 * For more details of how to use the data structures see the code 514 * in unorm.cpp (runtime normalization code) and 515 * in gennorm.c and gennorm/store.c (build-time data generation). 516 * 517 * For the serialized format of UTrie see utrie.c/UTrieHeader. 518 * 519 * - Overall partition 520 * 521 * unorm.dat customarily begins with a UDataInfo structure, see udata.h and .c. 
 * After that there are the following structures:
 *
 * int32_t indexes[_NORM_INDEX_TOP];           -- _NORM_INDEX_TOP=32, see enum in this file
 *
 * UTrie normTrie;                             -- size in bytes=indexes[_NORM_INDEX_TRIE_SIZE]
 *
 * uint16_t extraData[extraDataTop];           -- extraDataTop=indexes[_NORM_INDEX_UCHAR_COUNT]
 *                                                extraData[0] contains the number of units for
 *                                                FC_NFKC_Closure (formatVersion>=2.1)
 *
 * uint16_t combiningTable[combiningTableTop]; -- combiningTableTop=indexes[_NORM_INDEX_COMBINE_DATA_COUNT]
 *                                                combiningTableTop may include one 16-bit padding unit
 *                                                to make sure that fcdTrie is 32-bit-aligned
 *
 * UTrie fcdTrie;                              -- size in bytes=indexes[_NORM_INDEX_FCD_TRIE_SIZE]
 *
 * UTrie auxTrie;                              -- size in bytes=indexes[_NORM_INDEX_AUX_TRIE_SIZE]
 *
 * uint16_t canonStartSets[canonStartSetsTop]  -- canonStartSetsTop=indexes[_NORM_INDEX_CANON_SET_COUNT]
 *                                                serialized USets and binary search tables, see below
 *
 *
 * The indexes array contains lengths and sizes of the following arrays and structures
 * as well as the following values:
 *  indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop
 *      -- one more than the highest combining index computed for forward-only-combining characters
 *  indexes[_NORM_INDEX_COMBINE_BOTH_COUNT]=combineBothTop-combineFwdTop
 *      -- number of combining indexes computed for both-ways-combining characters
 *  indexes[_NORM_INDEX_COMBINE_BACK_COUNT]=combineBackTop-combineBothTop
 *      -- number of combining indexes computed for backward-only-combining characters
 *
 *  indexes[_NORM_INDEX_MIN_NF*_NO_MAYBE] (where *={ C, D, KC, KD })
 *      -- first code point with a quick check NF* value of NO/MAYBE
 *
 *
 * - Tries
 *
 * The main structures are two UTrie tables ("compact arrays"),
 * each with one index array and one data array.
 * See utrie.h and utrie.c.
 *
 *
 * - Tries in unorm.icu
 *
 * The first trie (normTrie above)
 * provides data for the NF* quick checks and normalization.
 * The second trie (fcdTrie above) provides data just for FCD checks.
 *
 *
 * - norm32 data words from the first trie
 *
 * The norm32Table contains one 32-bit word "norm32" per code point.
 * It contains the following bit fields:
 * 31..16   extra data index, _NORM_EXTRA_SHIFT is used to shift this field down
 *          if this index is <_NORM_EXTRA_INDEX_TOP then it is an index into
 *              extraData[] where variable-length normalization data for this
 *              code point is found
 *          if this index is <_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_SURROGATE_TOP
 *              then this is a norm32 for a leading surrogate, and the index
 *              value is used together with the following trailing surrogate
 *              code unit in the second trie access
 *          if this index is >=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_SURROGATE_TOP
 *              then this is a norm32 for a "special" character,
 *              i.e., the character is a Hangul syllable or a Jamo
 *              see _NORM_EXTRA_HANGUL etc.
 *          generally, instead of extracting this index from the norm32 and
 *              comparing it with the above constants,
 *              the normalization code compares the entire norm32 value
 *              with _NORM_MIN_SPECIAL, _NORM_SURROGATES_TOP, _NORM_MIN_HANGUL etc.
 *
 * 15..8    combining class (cc) according to UnicodeData.txt
 *
 *  7..6    _NORM_COMBINES_ANY flags, used in composition to see if a character
 *              combines with any following or preceding character(s)
 *              at all
 *     7    _NORM_COMBINES_BACK
 *     6    _NORM_COMBINES_FWD
 *
 *  5..0    quick check flags, set for "no" or "maybe", with separate flags for
 *              each normalization form
 *              the higher bits are "maybe" flags; for NF*D there are no such flags
 *              the lower bits are "no" flags for all forms, in the same order
 *              as the "maybe" flags,
 *              which is (MSB to LSB): NFKD NFD NFKC NFC
 *  5..4    _NORM_QC_ANY_MAYBE
 *  3..0    _NORM_QC_ANY_NO
 *              see further related constants
 *
 *
 * - Extra data per code point
 *
 * "Extra data" is referenced by the index in norm32.
 * It is variable-length data. It is only present, and only those parts
 * of it are, as needed for a given character.
 * The norm32 extra data index is added to the beginning of extraData[]
 * to get to a vector of 16-bit words with data at the following offsets:
 *
 * [-1]     Combining index for composition.
 *              Stored only if norm32&_NORM_COMBINES_ANY .
 * [0]      Lengths of the canonical and compatibility decomposition strings.
 *              Stored only if there are decompositions, i.e.,
 *              if norm32&(_NORM_QC_NFD|_NORM_QC_NFKD)
 *          High byte: length of NFKD, or 0 if none
 *          Low byte: length of NFD, or 0 if none
 *          Each length byte also has another flag:
 *              Bit 7 of a length byte is set if there are non-zero
 *              combining classes (cc's) associated with the respective
 *              decomposition. If this flag is set, then the decomposition
 *              is preceded by a 16-bit word that contains the
 *              leading and trailing cc's.
 *              Bits 6..0 of a length byte are the length of the
 *              decomposition string, not counting the cc word.
 * [1..n]   NFD
 * [n+1..]  NFKD
 *
 * Each of the two decompositions consists of up to two parts:
 * - The 16-bit word with the leading and trailing cc's.
 *   This is only stored if bit 7 of the corresponding length byte
 *   is set. In this case, at least one of the cc's is not zero.
 *   High byte: leading cc==cc of the first code point in the decomposition string
 *   Low byte: trailing cc==cc of the last code point in the decomposition string
 * - The decomposition string in UTF-16, with length code units.
 *
 *
 * - Combining indexes and combiningTable[]
 *
 * Combining indexes are stored at the [-1] offset of the extra data
 * if the character combines forward or backward with any other characters.
 * They are used for (re)composition in NF*C.
 * Values of combining indexes are arranged according to whether a character
 * combines forward, backward, or both ways:
 *    forward-only < both ways < backward-only
 *
 * The index values for forward-only and both-ways combining characters
 * are indexes into the combiningTable[].
 * The index values for backward-only combining characters are simply
 * incremented from the preceding index values to be unique.
 *
 * In the combiningTable[], a variable-length list
 * of variable-length (back-index, code point) pair entries is stored
 * for each forward-combining character.
 *
 * These back-indexes are the combining indexes of both-ways or backward-only
 * combining characters that the forward-combining character combines with.
 *
 * Each list is sorted in ascending order of back-indexes.
 * Each list is terminated with the last back-index having bit 15 set.
 *
 * Each pair (back-index, code point) takes up either 2 or 3
 * 16-bit words.
 * The first word of a list entry is the back-index, with its bit 15 set if
 * this is the last pair in the list.
 *
 * The second word contains flags in bits 15..13 that determine
 * if there is a third word and how the combined character is encoded:
 * 15   set if there is a third word in this list entry
 * 14   set if the result is a supplementary character
 * 13   set if the result itself combines forward
 *
 * According to these bits 15..14 of the second word,
 * the result character is encoded as follows:
 * 00 or 01 The result is <=0x1fff and stored in bits 12..0 of
 *          the second word.
 * 10       The result is 0x2000..0xffff and stored in the third word.
 *          Bits 12..0 of the second word are not used.
 * 11       The result is a supplementary character.
 *          Bits 9..0 of the leading surrogate are in bits 9..0 of
 *          the second word.
 *          Add 0xd800 to these bits to get the complete surrogate.
 *          Bits 12..10 of the second word are not used.
 *          The trailing surrogate is stored in the third word.
 *
 *
 * - FCD trie
 *
 * The FCD trie is very simple.
 * It is a folded trie with 16-bit data words.
 * In each word, the high byte contains the leading cc of the character,
 * and the low byte contains the trailing cc of the character.
 * These cc's are the cc's of the first and last code points in the
 * canonical decomposition of the character.
 *
 * Since all 16 bits are used for cc's, lead surrogates must be tested
 * by checking the code unit instead of the trie data.
 * This is done only if the 16-bit data word is not zero.
 * If the code unit is a leading surrogate and the data word is not zero,
 * then instead of cc's it contains the offset for the second trie lookup.
 *
 *
 * - Auxiliary trie and data
 *
 * The auxiliary 16-bit trie contains data for additional properties.
 * Bits
 * 15..13   reserved
 *     12   not NFC_Skippable (f) (formatVersion>=2.2)
 *     11   flag: not a safe starter for canonical closure
 *     10   composition exclusion
 *  9..0    index into extraData[] to FC_NFKC_Closure string
 *          (not for lead surrogate),
 *          or lead surrogate offset (for lead surrogate, if 9..0 not zero)
 *
 * - FC_NFKC_Closure strings in extraData[]
 *
 * Strings are either stored as a single code unit or as the length
 * followed by that many units.
 *   const UChar *s=extraData+(index from auxTrie data bits 9..0);
 *   int32_t length;
 *   if(*s<0xff00) {
 *     // s points to the single-unit string
 *     length=1;
 *   } else {
 *     length=*s&0xff;
 *     ++s;
 *   }
 *
 * Conditions for "NF* Skippable" from Mark Davis' com.ibm.text.UCD.NFSkippable:
 * (used in NormalizerTransliterator)
 *
 * A skippable character is
 * a) unassigned, or ALL of the following:
 * b) of combining class 0.
 * c) not decomposed by this normalization form.
 * AND if NFC or NFKC,
 * d) can never compose with a previous character.
 * e) can never compose with a following character.
 * f) can never change if another character is added.
 *    Example: a-breve might satisfy all but f, but if you
 *    add an ogonek it changes to a-ogonek + breve
 *
 * a)..e) must be tested from norm32.
 * Since f) is more complicated, the (not-)NFC_Skippable flag (f) is built
 * into the auxiliary trie.
 * The same bit is used for NFC and NFKC; (c) differs for them.
 * As usual, we build the "not skippable" flags so that unassigned
 * code points get a 0 bit.
 * This bit is only valid after (a)..(e) test FALSE; test NFD_NO before (f) as well.
 * Test Hangul LV syllables entirely in code.
 *
 *
 * - structure inside canonStartSets[]
 *
 * This array maps from code points c to sets of code points (USerializedSet).
 * The result sets are the code points whose canonical decompositions start
 * with c.
 *
 * canonStartSets[] contains the following sub-arrays:
 *
 * indexes[_NORM_SET_INDEX_TOP]
 *   - contains lengths of sub-arrays etc.
 *
 * startSets[indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP]
 *   - contains serialized sets (USerializedSet) of canonical starters for
 *     enumerating canonically equivalent strings
 *     indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH] includes _NORM_SET_INDEX_TOP
 *     for details about the structure see uset.c
 *
 * bmpTable[indexes[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]]
 *   - a sorted search table for BMP code points whose results are
 *     either indexes to USerializedSets or single code points for
 *     single-code point sets;
 *     each entry is a pair of { code point, result } with result=(binary) yy xxxxxx xxxxxxxx
 *     if yy==01 then there is a USerializedSet at canonStartSets+x
 *     else build a USerializedSet with result as the single code point
 *
 * suppTable[indexes[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]]
 *   - a sorted search table for supplementary code points whose results are
 *     either indexes to USerializedSets or single code points for
 *     single-code point sets;
 *     each entry is a triplet of { high16(cp), low16(cp), result }
 *     each code point's high-word may contain extra data in bits 15..5:
 *     if the high word has bit 15 set, then build a set with a single code point
 *     which is ((high16(cp)&0x1f00)<<8)|result;
 *     else there is a USerializedSet at canonStartSets+result
 *
 * FormatVersion 2.3 adds 2 serialized sets for normalization exclusions.
 * They are stored in the data file so that the runtime normalization code need
 * not depend on other properties and their data and implementation files.
 * The _NORM_SET_INDEX_NX_..._OFFSET offsets in the canonStartSets index table
 * give the location for each set.
 * There is no set stored for UNORM_NX_HANGUL because it's trivial to create
 * without using properties.
803 * 804 * Set contents: 805 * 806 * _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET (for UNORM_NX_CJK_COMPAT) 807 * [[:Ideographic:]&[:NFD_QC=No:]] 808 * =[CJK Ideographs]&[has canonical decomposition] 809 * 810 * _NORM_SET_INDEX_NX_UNICODE32_OFFSET (for UNORM_UNICODE_3_2) 811 * [:^Age=3.2:] 812 * =set with all code points that were not designated by the specified Unicode version 813 * 814 * _NORM_SET_INDEX_NX_RESERVED_OFFSET 815 * This is an offset that points to where the next, future set would start. 816 * Currently it indicates where the previous set ends, and thus its length. 817 * The name for this enum constant may in the future be applied to different 818 * index slots. In order to get the limit of a set, use its index slot and 819 * the immediately following one regardless of that one's enum name. 820 */ 821 822 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 823 824 #endif 825