1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2009-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: normalizer2impl.h 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2009nov22 14 * created by: Markus W. Scherer 15 */ 16 17 #ifndef __NORMALIZER2IMPL_H__ 18 #define __NORMALIZER2IMPL_H__ 19 20 #include "unicode/utypes.h" 21 22 #if !UCONFIG_NO_NORMALIZATION 23 24 #include "unicode/normalizer2.h" 25 #include "unicode/udata.h" 26 #include "unicode/unistr.h" 27 #include "unicode/unorm.h" 28 #include "mutex.h" 29 #include "uset_imp.h" 30 #include "utrie2.h" 31 32 U_NAMESPACE_BEGIN 33 34 class Hangul { 35 public: 36 /* Korean Hangul and Jamo constants */ 37 enum { 38 JAMO_L_BASE=0x1100, /* "lead" jamo */ 39 JAMO_V_BASE=0x1161, /* "vowel" jamo */ 40 JAMO_T_BASE=0x11a7, /* "trail" jamo */ 41 42 HANGUL_BASE=0xac00, 43 44 JAMO_L_COUNT=19, 45 JAMO_V_COUNT=21, 46 JAMO_T_COUNT=28, 47 48 JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT, 49 50 HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT, 51 HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT 52 }; 53 54 static inline UBool isHangul(UChar32 c) { 55 return HANGUL_BASE<=c && c<HANGUL_LIMIT; 56 } 57 static inline UBool 58 isHangulWithoutJamoT(UChar c) { 59 c-=HANGUL_BASE; 60 return c<HANGUL_COUNT && c%JAMO_T_COUNT==0; 61 } 62 static inline UBool isJamoL(UChar32 c) { 63 return (uint32_t)(c-JAMO_L_BASE)<JAMO_L_COUNT; 64 } 65 static inline UBool isJamoV(UChar32 c) { 66 return (uint32_t)(c-JAMO_V_BASE)<JAMO_V_COUNT; 67 } 68 69 /** 70 * Decomposes c, which must be a Hangul syllable, into buffer 71 * and returns the length of the decomposition (2 or 3). 72 */ 73 static inline int32_t decompose(UChar32 c, UChar buffer[3]) { 74 c-=HANGUL_BASE; 75 UChar32 c2=c%JAMO_T_COUNT; 76 c/=JAMO_T_COUNT; 77 buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT); 78 buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT); 79 if(c2==0) { 80 return 2; 81 } else { 82 buffer[2]=(UChar)(JAMO_T_BASE+c2); 83 return 3; 84 } 85 } 86 private: 87 Hangul(); // no instantiation 88 }; 89 90 class Normalizer2Impl; 91 92 class ReorderingBuffer : public UMemory { 93 public: 94 ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) : 95 impl(ni), str(dest), 96 start(NULL), reorderStart(NULL), limit(NULL), 97 remainingCapacity(0), lastCC(0) {} 98 ~ReorderingBuffer() { 99 if(start!=NULL) { 100 str.releaseBuffer((int32_t)(limit-start)); 101 } 102 } 103 UBool init(int32_t destCapacity, UErrorCode &errorCode); 104 105 UBool isEmpty() const { return start==limit; } 106 int32_t length() const { return (int32_t)(limit-start); } 107 UChar *getStart() { return start; } 108 UChar *getLimit() { return limit; } 109 uint8_t getLastCC() const { return lastCC; } 110 111 UBool equals(const UChar *start, const UChar *limit) const; 112 113 // For Hangul composition, replacing the Leading consonant Jamo with the syllable. 114 void setLastChar(UChar c) { 115 *(limit-1)=c; 116 } 117 118 UBool append(UChar32 c, uint8_t cc, UErrorCode &errorCode) { 119 return (c<=0xffff) ? 120 appendBMP((UChar)c, cc, errorCode) : 121 appendSupplementary(c, cc, errorCode); 122 } 123 // s must be in NFD, otherwise change the implementation. 124 UBool append(const UChar *s, int32_t length, 125 uint8_t leadCC, uint8_t trailCC, 126 UErrorCode &errorCode); 127 UBool appendBMP(UChar c, uint8_t cc, UErrorCode &errorCode) { 128 if(remainingCapacity==0 && !resize(1, errorCode)) { 129 return FALSE; 130 } 131 if(lastCC<=cc || cc==0) { 132 *limit++=c; 133 lastCC=cc; 134 if(cc<=1) { 135 reorderStart=limit; 136 } 137 } else { 138 insert(c, cc); 139 } 140 --remainingCapacity; 141 return TRUE; 142 } 143 UBool appendZeroCC(UChar32 c, UErrorCode &errorCode); 144 UBool appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode); 145 void remove(); 146 void removeSuffix(int32_t suffixLength); 147 void setReorderingLimit(UChar *newLimit) { 148 remainingCapacity+=(int32_t)(limit-newLimit); 149 reorderStart=limit=newLimit; 150 lastCC=0; 151 } 152 private: 153 /* 154 * TODO: Revisit whether it makes sense to track reorderStart. 155 * It is set to after the last known character with cc<=1, 156 * which stops previousCC() before it reads that character and looks up its cc. 157 * previousCC() is normally only called from insert(). 158 * In other words, reorderStart speeds up the insertion of a combining mark 159 * into a multi-combining mark sequence where it does not belong at the end. 160 * This might not be worth the trouble. 161 * On the other hand, it's not a huge amount of trouble. 162 * 163 * We probably need it for UNORM_SIMPLE_APPEND. 164 */ 165 166 UBool appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode); 167 void insert(UChar32 c, uint8_t cc); 168 static void writeCodePoint(UChar *p, UChar32 c) { 169 if(c<=0xffff) { 170 *p=(UChar)c; 171 } else { 172 p[0]=U16_LEAD(c); 173 p[1]=U16_TRAIL(c); 174 } 175 } 176 UBool resize(int32_t appendLength, UErrorCode &errorCode); 177 178 const Normalizer2Impl &impl; 179 UnicodeString &str; 180 UChar *start, *reorderStart, *limit; 181 int32_t remainingCapacity; 182 uint8_t lastCC; 183 184 // private backward iterator 185 void setIterator() { codePointStart=limit; } 186 void skipPrevious(); // Requires start<codePointStart. 187 uint8_t previousCC(); // Returns 0 if there is no previous character. 188 189 UChar *codePointStart, *codePointLimit; 190 }; 191 192 class U_COMMON_API Normalizer2Impl : public UMemory { 193 public: 194 Normalizer2Impl() : memory(NULL), normTrie(NULL) { 195 fcdTrieSingleton.fInstance=NULL; 196 } 197 ~Normalizer2Impl(); 198 199 void load(const char *packageName, const char *name, UErrorCode &errorCode); 200 201 void addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const; 202 203 // low-level properties ------------------------------------------------ *** 204 205 const UTrie2 *getNormTrie() const { return normTrie; } 206 const UTrie2 *getFCDTrie(UErrorCode &errorCode) const ; 207 208 uint16_t getNorm16(UChar32 c) const { return UTRIE2_GET16(normTrie, c); } 209 210 UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const { 211 if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) { 212 return UNORM_YES; 213 } else if(minMaybeYes<=norm16) { 214 return UNORM_MAYBE; 215 } else { 216 return UNORM_NO; 217 } 218 } 219 UBool isCompNo(uint16_t norm16) const { return minNoNo<=norm16 && norm16<minMaybeYes; } 220 UBool isDecompYes(uint16_t norm16) const { return norm16<minYesNo || minMaybeYes<=norm16; } 221 222 uint8_t getCC(uint16_t norm16) const { 223 if(norm16>=MIN_NORMAL_MAYBE_YES) { 224 return (uint8_t)norm16; 225 } 226 if(norm16<minNoNo || limitNoNo<=norm16) { 227 return 0; 228 } 229 return getCCFromNoNo(norm16); 230 } 231 static uint8_t getCCFromYesOrMaybe(uint16_t norm16) { 232 return norm16>=MIN_NORMAL_MAYBE_YES ? (uint8_t)norm16 : 0; 233 } 234 235 uint16_t getFCD16(UChar32 c) const { return UTRIE2_GET16(fcdTrie(), c); } 236 uint16_t getFCD16FromSingleLead(UChar c) const { 237 return UTRIE2_GET16_FROM_U16_SINGLE_LEAD(fcdTrie(), c); 238 } 239 uint16_t getFCD16FromSupplementary(UChar32 c) const { 240 return UTRIE2_GET16_FROM_SUPP(fcdTrie(), c); 241 } 242 uint16_t getFCD16FromSurrogatePair(UChar c, UChar c2) const { 243 return getFCD16FromSupplementary(U16_GET_SUPPLEMENTARY(c, c2)); 244 } 245 246 void setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16, 247 UTrie2 *newFCDTrie, UErrorCode &errorCode) const; 248 249 /** 250 * Get the decomposition for one code point. 251 * @param c code point 252 * @param buffer out-only buffer for algorithmic decompositions 253 * @param length out-only, takes the length of the decomposition, if any 254 * @return pointer to the decomposition, or NULL if none 255 */ 256 const UChar *getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const; 257 258 enum { 259 MIN_CCC_LCCC_CP=0x300 260 }; 261 262 enum { 263 MIN_YES_YES_WITH_CC=0xff01, 264 JAMO_VT=0xff00, 265 MIN_NORMAL_MAYBE_YES=0xfe00, 266 JAMO_L=1, 267 MAX_DELTA=0x40 268 }; 269 270 enum { 271 // Byte offsets from the start of the data, after the generic header. 272 IX_NORM_TRIE_OFFSET, 273 IX_EXTRA_DATA_OFFSET, 274 IX_RESERVED2_OFFSET, 275 IX_RESERVED3_OFFSET, 276 IX_RESERVED4_OFFSET, 277 IX_RESERVED5_OFFSET, 278 IX_RESERVED6_OFFSET, 279 IX_TOTAL_SIZE, 280 281 // Code point thresholds for quick check codes. 282 IX_MIN_DECOMP_NO_CP, 283 IX_MIN_COMP_NO_MAYBE_CP, 284 285 // Norm16 value thresholds for quick check combinations and types of extra data. 286 IX_MIN_YES_NO, 287 IX_MIN_NO_NO, 288 IX_LIMIT_NO_NO, 289 IX_MIN_MAYBE_YES, 290 291 IX_RESERVED14, 292 IX_RESERVED15, 293 IX_COUNT 294 }; 295 296 enum { 297 MAPPING_HAS_CCC_LCCC_WORD=0x80, 298 MAPPING_PLUS_COMPOSITION_LIST=0x40, 299 MAPPING_NO_COMP_BOUNDARY_AFTER=0x20, 300 MAPPING_LENGTH_MASK=0x1f 301 }; 302 303 enum { 304 COMP_1_LAST_TUPLE=0x8000, 305 COMP_1_TRIPLE=1, 306 COMP_1_TRAIL_LIMIT=0x3400, 307 COMP_1_TRAIL_MASK=0x7ffe, 308 COMP_1_TRAIL_SHIFT=9, // 10-1 for the "triple" bit 309 COMP_2_TRAIL_SHIFT=6, 310 COMP_2_TRAIL_MASK=0xffc0 311 }; 312 313 // higher-level functionality ------------------------------------------ *** 314 315 const UChar *decompose(const UChar *src, const UChar *limit, 316 ReorderingBuffer *buffer, UErrorCode &errorCode) const; 317 void decomposeAndAppend(const UChar *src, const UChar *limit, 318 UBool doDecompose, 319 ReorderingBuffer &buffer, 320 UErrorCode &errorCode) const; 321 UBool compose(const UChar *src, const UChar *limit, 322 UBool onlyContiguous, 323 UBool doCompose, 324 ReorderingBuffer &buffer, 325 UErrorCode &errorCode) const; 326 const UChar *composeQuickCheck(const UChar *src, const UChar *limit, 327 UBool onlyContiguous, 328 UNormalizationCheckResult *pQCResult) const; 329 void composeAndAppend(const UChar *src, const UChar *limit, 330 UBool doCompose, 331 UBool onlyContiguous, 332 ReorderingBuffer &buffer, 333 UErrorCode &errorCode) const; 334 const UChar *makeFCD(const UChar *src, const UChar *limit, 335 ReorderingBuffer *buffer, UErrorCode &errorCode) const; 336 void makeFCDAndAppend(const UChar *src, const UChar *limit, 337 UBool doMakeFCD, 338 ReorderingBuffer &buffer, 339 UErrorCode &errorCode) const; 340 341 UBool hasDecompBoundary(UChar32 c, UBool before) const; 342 UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); } 343 344 UBool hasCompBoundaryBefore(UChar32 c) const { 345 return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c)); 346 } 347 UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const; 348 349 UBool hasFCDBoundaryBefore(UChar32 c) const { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; } 350 UBool hasFCDBoundaryAfter(UChar32 c) const { 351 uint16_t fcd16=getFCD16(c); 352 return fcd16<=1 || (fcd16&0xff)==0; 353 } 354 UBool isFCDInert(UChar32 c) const { return getFCD16(c)<=1; } 355 private: 356 static UBool U_CALLCONV 357 isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo); 358 359 UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } 360 UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; } 361 static UBool isInert(uint16_t norm16) { return norm16==0; } 362 // static UBool isJamoL(uint16_t norm16) const { return norm16==1; } 363 static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; } 364 UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; } 365 UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; } 366 // UBool isCompYes(uint16_t norm16) const { 367 // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; 368 // } 369 // UBool isCompYesOrMaybe(uint16_t norm16) const { 370 // return norm16<minNoNo || minMaybeYes<=norm16; 371 // } 372 // UBool hasZeroCCFromDecompYes(uint16_t norm16) const { 373 // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 374 // } 375 UBool isDecompYesAndZeroCC(uint16_t norm16) const { 376 return norm16<minYesNo || 377 norm16==JAMO_VT || 378 (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); 379 } 380 /** 381 * A little faster and simpler than isDecompYesAndZeroCC() but does not include 382 * the MaybeYes which combine-forward and have ccc=0. 383 * (Standard Unicode 5.2 normalization does not have such characters.) 384 */ 385 UBool isMostDecompYesAndZeroCC(uint16_t norm16) const { 386 return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 387 } 388 UBool isDecompNoAlgorithmic(uint16_t norm16) const { return norm16>=limitNoNo; } 389 390 // For use with isCompYes(). 391 // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. 392 // static uint8_t getCCFromYes(uint16_t norm16) { 393 // return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0; 394 // } 395 uint8_t getCCFromNoNo(uint16_t norm16) const { 396 const uint16_t *mapping=getMapping(norm16); 397 if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) { 398 return (uint8_t)mapping[1]; 399 } else { 400 return 0; 401 } 402 } 403 // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() 404 uint8_t getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const; 405 406 // Requires algorithmic-NoNo. 407 UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const { 408 return c+norm16-(minMaybeYes-MAX_DELTA-1); 409 } 410 411 // Requires minYesNo<norm16<limitNoNo. 412 const uint16_t *getMapping(uint16_t norm16) const { return extraData+norm16; } 413 const uint16_t *getCompositionsListForDecompYes(uint16_t norm16) const { 414 if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) { 415 return NULL; 416 } else if(norm16<minMaybeYes) { 417 return extraData+norm16; // for yesYes; if Jamo L: harmless empty list 418 } else { 419 return maybeYesCompositions+norm16-minMaybeYes; 420 } 421 } 422 const uint16_t *getCompositionsListForComposite(uint16_t norm16) const { 423 const uint16_t *list=extraData+norm16; // composite has both mapping & compositions list 424 return list+ // mapping pointer 425 1+ // +1 to skip the first unit with the mapping lenth 426 (*list&MAPPING_LENGTH_MASK)+ // + mapping length 427 ((*list>>7)&1); // +1 if MAPPING_HAS_CCC_LCCC_WORD 428 } 429 430 const UChar *copyLowPrefixFromNulTerminated(const UChar *src, 431 UChar32 minNeedDataCP, 432 ReorderingBuffer *buffer, 433 UErrorCode &errorCode) const; 434 UBool decomposeShort(const UChar *src, const UChar *limit, 435 ReorderingBuffer &buffer, UErrorCode &errorCode) const; 436 UBool decompose(UChar32 c, uint16_t norm16, 437 ReorderingBuffer &buffer, UErrorCode &errorCode) const; 438 439 static int32_t combine(const uint16_t *list, UChar32 trail); 440 void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex, 441 UBool onlyContiguous) const; 442 443 UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const; 444 const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p) const; 445 const UChar *findNextCompBoundary(const UChar *p, const UChar *limit) const; 446 447 const UTrie2 *fcdTrie() const { return (const UTrie2 *)fcdTrieSingleton.fInstance; } 448 449 const UChar *findPreviousFCDBoundary(const UChar *start, const UChar *p) const; 450 const UChar *findNextFCDBoundary(const UChar *p, const UChar *limit) const; 451 452 UDataMemory *memory; 453 UVersionInfo dataVersion; 454 455 // Code point thresholds for quick check codes. 456 UChar32 minDecompNoCP; 457 UChar32 minCompNoMaybeCP; 458 459 // Norm16 value thresholds for quick check combinations and types of extra data. 460 uint16_t minYesNo; 461 uint16_t minNoNo; 462 uint16_t limitNoNo; 463 uint16_t minMaybeYes; 464 465 UTrie2 *normTrie; 466 const uint16_t *maybeYesCompositions; 467 const uint16_t *extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters 468 469 SimpleSingleton fcdTrieSingleton; 470 }; 471 472 /** 473 * ICU-internal shortcut for quick access to standard Unicode normalization. 474 */ 475 class U_COMMON_API Normalizer2Factory { 476 public: 477 static const Normalizer2 *getNFCInstance(UErrorCode &errorCode); 478 static const Normalizer2 *getNFDInstance(UErrorCode &errorCode); 479 static const Normalizer2 *getFCDInstance(UErrorCode &errorCode); 480 static const Normalizer2 *getFCCInstance(UErrorCode &errorCode); 481 static const Normalizer2 *getNFKCInstance(UErrorCode &errorCode); 482 static const Normalizer2 *getNFKDInstance(UErrorCode &errorCode); 483 static const Normalizer2 *getNFKC_CFInstance(UErrorCode &errorCode); 484 static const Normalizer2 *getNoopInstance(UErrorCode &errorCode); 485 486 static const Normalizer2 *getInstance(UNormalizationMode mode, UErrorCode &errorCode); 487 488 static const Normalizer2Impl *getNFCImpl(UErrorCode &errorCode); 489 static const Normalizer2Impl *getNFKCImpl(UErrorCode &errorCode); 490 static const Normalizer2Impl *getNFKC_CFImpl(UErrorCode &errorCode); 491 492 // Get the Impl instance of the Normalizer2. 493 // Must be used only when it is known that norm2 is a Normalizer2WithImpl instance. 494 static const Normalizer2Impl *getImpl(const Normalizer2 *norm2); 495 496 static const UTrie2 *getFCDTrie(UErrorCode &errorCode); 497 private: 498 Normalizer2Factory(); // No instantiation. 499 }; 500 501 U_NAMESPACE_END 502 503 U_CAPI int32_t U_EXPORT2 504 unorm2_swap(const UDataSwapper *ds, 505 const void *inData, int32_t length, void *outData, 506 UErrorCode *pErrorCode); 507 508 /** 509 * Get the NF*_QC property for a code point, for u_getIntPropertyValue(). 510 * @internal 511 */ 512 U_CFUNC UNormalizationCheckResult U_EXPORT2 513 unorm_getQuickCheck(UChar32 c, UNormalizationMode mode); 514 515 /** 516 * Internal API, used by collation code. 517 * Get access to the internal FCD trie table to be able to perform 518 * incremental, per-code unit, FCD checks in collation. 519 * One pointer is sufficient because the trie index values are offset 520 * by the index size, so that the same pointer is used to access the trie data. 521 * Code points at fcdHighStart and above have a zero FCD value. 522 * @internal 523 */ 524 U_CAPI const uint16_t * U_EXPORT2 525 unorm_getFCDTrieIndex(UChar32 &fcdHighStart, UErrorCode *pErrorCode); 526 527 /** 528 * Internal API, used by collation code. 529 * Get the FCD value for a code unit, with 530 * bits 15..8 lead combining class 531 * bits 7..0 trail combining class 532 * 533 * If c is a lead surrogate and the value is not 0, 534 * then some of c's associated supplementary code points have a non-zero FCD value. 535 * 536 * @internal 537 */ 538 static inline uint16_t 539 unorm_getFCD16(const uint16_t *fcdTrieIndex, UChar c) { 540 return fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)]; 541 } 542 543 /** 544 * Internal API, used by collation code. 545 * Get the FCD value of the next code point (post-increment), with 546 * bits 15..8 lead combining class 547 * bits 7..0 trail combining class 548 * 549 * @internal 550 */ 551 static inline uint16_t 552 unorm_nextFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart, 553 const UChar *&s, const UChar *limit) { 554 UChar32 c=*s++; 555 uint16_t fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)]; 556 if(fcd!=0 && U16_IS_LEAD(c)) { 557 UChar c2; 558 if(s!=limit && U16_IS_TRAIL(c2=*s)) { 559 ++s; 560 c=U16_GET_SUPPLEMENTARY(c, c2); 561 if(c<fcdHighStart) { 562 fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_SUPP(fcdTrieIndex, c)]; 563 } else { 564 fcd=0; 565 } 566 } else /* unpaired lead surrogate */ { 567 fcd=0; 568 } 569 } 570 return fcd; 571 } 572 573 /** 574 * Internal API, used by collation code. 575 * Get the FCD value of the previous code point (pre-decrement), with 576 * bits 15..8 lead combining class 577 * bits 7..0 trail combining class 578 * 579 * @internal 580 */ 581 static inline uint16_t 582 unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart, 583 const UChar *start, const UChar *&s) { 584 UChar32 c=*--s; 585 uint16_t fcd; 586 if(!U16_IS_SURROGATE(c)) { 587 fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)]; 588 } else { 589 UChar c2; 590 if(U16_IS_SURROGATE_TRAIL(c) && s!=start && U16_IS_LEAD(c2=*(s-1))) { 591 --s; 592 c=U16_GET_SUPPLEMENTARY(c2, c); 593 if(c<fcdHighStart) { 594 fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_SUPP(fcdTrieIndex, c)]; 595 } else { 596 fcd=0; 597 } 598 } else /* unpaired surrogate */ { 599 fcd=0; 600 } 601 } 602 return fcd; 603 } 604 605 /** 606 * Format of Normalizer2 .nrm data files. 607 * Format version 1.0. 608 * 609 * Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms. 610 * ICU ships with data files for standard Unicode Normalization Forms 611 * NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm) and NFKC_Casefold (nfkc_cf.nrm). 612 * Custom (application-specific) data can be built into additional .nrm files 613 * with the gennorm2 build tool. 614 * 615 * Normalizer2.getInstance() causes a .nrm file to be loaded, unless it has been 616 * cached already. Internally, Normalizer2Impl.load() reads the .nrm file. 617 * 618 * A .nrm file begins with a standard ICU data file header 619 * (DataHeader, see ucmndata.h and unicode/udata.h). 620 * The UDataInfo.dataVersion field usually contains the Unicode version 621 * for which the data was generated. 622 * 623 * After the header, the file contains the following parts. 624 * Constants are defined as enum values of the Normalizer2Impl class. 625 * 626 * Many details of the data structures are described in the design doc 627 * which is at http://site.icu-project.org/design/normalization/custom 628 * 629 * int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_NORM_TRIE_OFFSET]/4; 630 * 631 * The first eight indexes are byte offsets in ascending order. 632 * Each byte offset marks the start of the next part in the data file, 633 * and the end of the previous one. 634 * When two consecutive byte offsets are the same, then the corresponding part is empty. 635 * Byte offsets are offsets from after the header, 636 * that is, from the beginning of the indexes[]. 637 * Each part starts at an offset with proper alignment for its data. 638 * If necessary, the previous part may include padding bytes to achieve this alignment. 639 * 640 * minDecompNoCP=indexes[IX_MIN_DECOMP_NO_CP] is the lowest code point 641 * with a decomposition mapping, that is, with NF*D_QC=No. 642 * minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point 643 * with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward). 644 * 645 * The next four indexes are thresholds of 16-bit trie values for ranges of 646 * values indicating multiple normalization properties. 647 * minYesNo=indexes[IX_MIN_YES_NO]; 648 * minNoNo=indexes[IX_MIN_NO_NO]; 649 * limitNoNo=indexes[IX_LIMIT_NO_NO]; 650 * minMaybeYes=indexes[IX_MIN_MAYBE_YES]; 651 * See the normTrie description below and the design doc for details. 652 * 653 * UTrie2 normTrie; -- see utrie2_impl.h and utrie2.h 654 * 655 * The trie holds the main normalization data. Each code point is mapped to a 16-bit value. 656 * Rather than using independent bits in the value (which would require more than 16 bits), 657 * information is extracted primarily via range checks. 658 * For example, a 16-bit value norm16 in the range minYesNo<=norm16<minNoNo 659 * means that the character has NF*C_QC=Yes and NF*D_QC=No properties, 660 * which means it has a two-way (round-trip) decomposition mapping. 661 * Values in the range 2<=norm16<limitNoNo are also directly indexes into the extraData 662 * pointing to mappings, composition lists, or both. 663 * Value norm16==0 means that the character is normalization-inert, that is, 664 * it does not have a mapping, does not participate in composition, has a zero 665 * canonical combining class, and forms a boundary where text before it and after it 666 * can be normalized independently. 667 * For details about how multiple properties are encoded in 16-bit values 668 * see the design doc. 669 * Note that the encoding cannot express all combinations of the properties involved; 670 * it only supports those combinations that are allowed by 671 * the Unicode Normalization algorithms. Details are in the design doc as well. 672 * The gennorm2 tool only builds .nrm files for data that conforms to the limitations. 673 * 674 * The trie has a value for each lead surrogate code unit representing the "worst case" 675 * properties of the 1024 supplementary characters whose UTF-16 form starts with 676 * the lead surrogate. If all of the 1024 supplementary characters are normalization-inert, 677 * then their lead surrogate code unit has the trie value 0. 678 * When the lead surrogate unit's value exceeds the quick check minimum during processing, 679 * the properties for the full supplementary code point need to be looked up. 680 * 681 * uint16_t maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]; 682 * uint16_t extraData[]; 683 * 684 * There is only one byte offset for the end of these two arrays. 685 * The split between them is given by the constant and variable mentioned above. 686 * 687 * The maybeYesCompositions array contains composition lists for characters that 688 * combine both forward (as starters in composition pairs) 689 * and backward (as trailing characters in composition pairs). 690 * Such characters do not occur in Unicode 5.2 but are allowed by 691 * the Unicode Normalization algorithms. 692 * If there are no such characters, then minMaybeYes==MIN_NORMAL_MAYBE_YES 693 * and the maybeYesCompositions array is empty. 694 * If there are such characters, then minMaybeYes is subtracted from their norm16 values 695 * to get the index into this array. 696 * 697 * The extraData array contains composition lists for "YesYes" characters, 698 * followed by mappings and optional composition lists for "YesNo" characters, 699 * followed by only mappings for "NoNo" characters. 700 * (Referring to pairs of NFC/NFD quick check values.) 701 * The norm16 values of those characters are directly indexes into the extraData array. 702 * 703 * The data structures for composition lists and mappings are described in the design doc. 704 */ 705 706 #endif /* !UCONFIG_NO_NORMALIZATION */ 707 #endif /* __NORMALIZER2IMPL_H__ */ 708