1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2009-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: normalizer2impl.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2009nov22 14 * created by: Markus W. Scherer 15 */ 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_NORMALIZATION 20 21 #include "unicode/normalizer2.h" 22 #include "unicode/udata.h" 23 #include "unicode/ustring.h" 24 #include "cmemory.h" 25 #include "mutex.h" 26 #include "normalizer2impl.h" 27 #include "uassert.h" 28 #include "uhash.h" 29 #include "uset_imp.h" 30 #include "utrie2.h" 31 #include "uvector.h" 32 33 U_NAMESPACE_BEGIN 34 35 // ReorderingBuffer -------------------------------------------------------- *** 36 37 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) { 38 int32_t length=str.length(); 39 start=str.getBuffer(destCapacity); 40 if(start==NULL) { 41 // getBuffer() already did str.setToBogus() 42 errorCode=U_MEMORY_ALLOCATION_ERROR; 43 return FALSE; 44 } 45 limit=start+length; 46 remainingCapacity=str.getCapacity()-length; 47 reorderStart=start; 48 if(start==limit) { 49 lastCC=0; 50 } else { 51 setIterator(); 52 lastCC=previousCC(); 53 // Set reorderStart after the last code point with cc<=1 if there is one. 
54 if(lastCC>1) { 55 while(previousCC()>1) {} 56 } 57 reorderStart=codePointLimit; 58 } 59 return TRUE; 60 } 61 62 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const { 63 int32_t length=(int32_t)(limit-start); 64 return 65 length==(int32_t)(otherLimit-otherStart) && 66 0==u_memcmp(start, otherStart, length); 67 } 68 69 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) { 70 if(remainingCapacity<2 && !resize(2, errorCode)) { 71 return FALSE; 72 } 73 if(lastCC<=cc || cc==0) { 74 limit[0]=U16_LEAD(c); 75 limit[1]=U16_TRAIL(c); 76 limit+=2; 77 lastCC=cc; 78 if(cc<=1) { 79 reorderStart=limit; 80 } 81 } else { 82 insert(c, cc); 83 } 84 remainingCapacity-=2; 85 return TRUE; 86 } 87 88 UBool ReorderingBuffer::append(const UChar *s, int32_t length, 89 uint8_t leadCC, uint8_t trailCC, 90 UErrorCode &errorCode) { 91 if(length==0) { 92 return TRUE; 93 } 94 if(remainingCapacity<length && !resize(length, errorCode)) { 95 return FALSE; 96 } 97 remainingCapacity-=length; 98 if(lastCC<=leadCC || leadCC==0) { 99 if(trailCC<=1) { 100 reorderStart=limit+length; 101 } else if(leadCC<=1) { 102 reorderStart=limit+1; // Ok if not a code point boundary. 103 } 104 const UChar *sLimit=s+length; 105 do { *limit++=*s++; } while(s!=sLimit); 106 lastCC=trailCC; 107 } else { 108 int32_t i=0; 109 UChar32 c; 110 U16_NEXT(s, i, length, c); 111 insert(c, leadCC); // insert first code point 112 while(i<length) { 113 U16_NEXT(s, i, length, c); 114 if(i<length) { 115 // s must be in NFD, otherwise we need to use getCC(). 
116 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c)); 117 } else { 118 leadCC=trailCC; 119 } 120 append(c, leadCC, errorCode); 121 } 122 } 123 return TRUE; 124 } 125 126 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) { 127 int32_t cpLength=U16_LENGTH(c); 128 if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) { 129 return FALSE; 130 } 131 remainingCapacity-=cpLength; 132 if(cpLength==1) { 133 *limit++=(UChar)c; 134 } else { 135 limit[0]=U16_LEAD(c); 136 limit[1]=U16_TRAIL(c); 137 limit+=2; 138 } 139 lastCC=0; 140 reorderStart=limit; 141 return TRUE; 142 } 143 144 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) { 145 if(s==sLimit) { 146 return TRUE; 147 } 148 int32_t length=(int32_t)(sLimit-s); 149 if(remainingCapacity<length && !resize(length, errorCode)) { 150 return FALSE; 151 } 152 u_memcpy(limit, s, length); 153 limit+=length; 154 remainingCapacity-=length; 155 lastCC=0; 156 reorderStart=limit; 157 return TRUE; 158 } 159 160 void ReorderingBuffer::remove() { 161 reorderStart=limit=start; 162 remainingCapacity=str.getCapacity(); 163 lastCC=0; 164 } 165 166 void ReorderingBuffer::removeSuffix(int32_t suffixLength) { 167 if(suffixLength<(limit-start)) { 168 limit-=suffixLength; 169 remainingCapacity+=suffixLength; 170 } else { 171 limit=start; 172 remainingCapacity=str.getCapacity(); 173 } 174 lastCC=0; 175 reorderStart=limit; 176 } 177 178 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) { 179 int32_t reorderStartIndex=(int32_t)(reorderStart-start); 180 int32_t length=(int32_t)(limit-start); 181 str.releaseBuffer(length); 182 int32_t newCapacity=length+appendLength; 183 int32_t doubleCapacity=2*str.getCapacity(); 184 if(newCapacity<doubleCapacity) { 185 newCapacity=doubleCapacity; 186 } 187 if(newCapacity<256) { 188 newCapacity=256; 189 } 190 start=str.getBuffer(newCapacity); 191 if(start==NULL) { 192 // getBuffer() already did 
str.setToBogus() 193 errorCode=U_MEMORY_ALLOCATION_ERROR; 194 return FALSE; 195 } 196 reorderStart=start+reorderStartIndex; 197 limit=start+length; 198 remainingCapacity=str.getCapacity()-length; 199 return TRUE; 200 } 201 202 void ReorderingBuffer::skipPrevious() { 203 codePointLimit=codePointStart; 204 UChar c=*--codePointStart; 205 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) { 206 --codePointStart; 207 } 208 } 209 210 uint8_t ReorderingBuffer::previousCC() { 211 codePointLimit=codePointStart; 212 if(reorderStart>=codePointStart) { 213 return 0; 214 } 215 UChar32 c=*--codePointStart; 216 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) { 217 return 0; 218 } 219 220 UChar c2; 221 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) { 222 --codePointStart; 223 c=U16_GET_SUPPLEMENTARY(c2, c); 224 } 225 return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c)); 226 } 227 228 // Inserts c somewhere before the last character. 229 // Requires 0<cc<lastCC which implies reorderStart<limit. 
230 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) { 231 for(setIterator(), skipPrevious(); previousCC()>cc;) {} 232 // insert c at codePointLimit, after the character with prevCC<=cc 233 UChar *q=limit; 234 UChar *r=limit+=U16_LENGTH(c); 235 do { 236 *--r=*--q; 237 } while(codePointLimit!=q); 238 writeCodePoint(q, c); 239 if(cc<=1) { 240 reorderStart=r; 241 } 242 } 243 244 // Normalizer2Impl --------------------------------------------------------- *** 245 246 struct CanonIterData : public UMemory { 247 CanonIterData(UErrorCode &errorCode); 248 ~CanonIterData(); 249 void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode); 250 UTrie2 *trie; 251 UVector canonStartSets; // contains UnicodeSet * 252 }; 253 254 Normalizer2Impl::~Normalizer2Impl() { 255 udata_close(memory); 256 utrie2_close(normTrie); 257 UTrie2Singleton(fcdTrieSingleton).deleteInstance(); 258 delete (CanonIterData *)canonIterDataSingleton.fInstance; 259 } 260 261 UBool U_CALLCONV 262 Normalizer2Impl::isAcceptable(void *context, 263 const char * /* type */, const char * /*name*/, 264 const UDataInfo *pInfo) { 265 if( 266 pInfo->size>=20 && 267 pInfo->isBigEndian==U_IS_BIG_ENDIAN && 268 pInfo->charsetFamily==U_CHARSET_FAMILY && 269 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ 270 pInfo->dataFormat[1]==0x72 && 271 pInfo->dataFormat[2]==0x6d && 272 pInfo->dataFormat[3]==0x32 && 273 pInfo->formatVersion[0]==1 274 ) { 275 Normalizer2Impl *me=(Normalizer2Impl *)context; 276 uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4); 277 return TRUE; 278 } else { 279 return FALSE; 280 } 281 } 282 283 void 284 Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) { 285 if(U_FAILURE(errorCode)) { 286 return; 287 } 288 memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode); 289 if(U_FAILURE(errorCode)) { 290 return; 291 } 292 const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory); 293 const int32_t 
*inIndexes=(const int32_t *)inBytes; 294 int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4; 295 if(indexesLength<=IX_MIN_MAYBE_YES) { 296 errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes. 297 return; 298 } 299 300 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; 301 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; 302 303 minYesNo=inIndexes[IX_MIN_YES_NO]; 304 minNoNo=inIndexes[IX_MIN_NO_NO]; 305 limitNoNo=inIndexes[IX_LIMIT_NO_NO]; 306 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; 307 308 int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET]; 309 int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; 310 normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, 311 inBytes+offset, nextOffset-offset, NULL, 312 &errorCode); 313 if(U_FAILURE(errorCode)) { 314 return; 315 } 316 317 offset=nextOffset; 318 maybeYesCompositions=(const uint16_t *)(inBytes+offset); 319 extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes); 320 } 321 322 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const { 323 UChar32 c; 324 if(cpStart==(cpLimit-1)) { 325 c=*cpStart; 326 } else { 327 c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]); 328 } 329 uint16_t prevNorm16=getNorm16(c); 330 if(prevNorm16<=minYesNo) { 331 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0 332 } else { 333 return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo 334 } 335 } 336 337 U_CDECL_BEGIN 338 339 static UBool U_CALLCONV 340 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) { 341 /* add the start code point to the USet */ 342 const USetAdder *sa=(const USetAdder *)context; 343 sa->add(sa->set, start); 344 return TRUE; 345 } 346 347 static uint32_t U_CALLCONV 348 segmentStarterMapper(const void * /*context*/, uint32_t value) { 349 return value&CANON_NOT_SEGMENT_STARTER; 350 } 351 352 U_CDECL_END 353 354 void 355 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) 
const { 356 /* add the start code point of each same-value range of each trie */ 357 utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa); 358 359 /* add Hangul LV syllables and LV+1 because of skippables */ 360 for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) { 361 sa->add(sa->set, c); 362 sa->add(sa->set, c+1); 363 } 364 sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */ 365 } 366 367 void 368 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const { 369 /* add the start code point of each same-value range of the canonical iterator data trie */ 370 if(ensureCanonIterData(errorCode)) { 371 // currently only used for the SEGMENT_STARTER property 372 utrie2_enum(((CanonIterData *)canonIterDataSingleton.fInstance)->trie, 373 segmentStarterMapper, enumPropertyStartsRange, sa); 374 } 375 } 376 377 const UChar * 378 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src, 379 UChar32 minNeedDataCP, 380 ReorderingBuffer *buffer, 381 UErrorCode &errorCode) const { 382 // Make some effort to support NUL-terminated strings reasonably. 383 // Take the part of the fast quick check loop that does not look up 384 // data and check the first part of the string. 385 // After this prefix, determine the string length to simplify the rest 386 // of the code. 387 const UChar *prevSrc=src; 388 UChar c; 389 while((c=*src++)<minNeedDataCP && c!=0) {} 390 // Back out the last character for full processing. 391 // Copy this prefix. 
392 if(--src!=prevSrc) { 393 if(buffer!=NULL) { 394 buffer->appendZeroCC(prevSrc, src, errorCode); 395 } 396 } 397 return src; 398 } 399 400 // Dual functionality: 401 // buffer!=NULL: normalize 402 // buffer==NULL: isNormalized/spanQuickCheckYes 403 const UChar * 404 Normalizer2Impl::decompose(const UChar *src, const UChar *limit, 405 ReorderingBuffer *buffer, 406 UErrorCode &errorCode) const { 407 UChar32 minNoCP=minDecompNoCP; 408 if(limit==NULL) { 409 src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode); 410 if(U_FAILURE(errorCode)) { 411 return src; 412 } 413 limit=u_strchr(src, 0); 414 } 415 416 const UChar *prevSrc; 417 UChar32 c=0; 418 uint16_t norm16=0; 419 420 // only for quick check 421 const UChar *prevBoundary=src; 422 uint8_t prevCC=0; 423 424 for(;;) { 425 // count code units below the minimum or with irrelevant data for the quick check 426 for(prevSrc=src; src!=limit;) { 427 if( (c=*src)<minNoCP || 428 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c)) 429 ) { 430 ++src; 431 } else if(!U16_IS_SURROGATE(c)) { 432 break; 433 } else { 434 UChar c2; 435 if(U16_IS_SURROGATE_LEAD(c)) { 436 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 437 c=U16_GET_SUPPLEMENTARY(c, c2); 438 } 439 } else /* trail surrogate */ { 440 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { 441 --src; 442 c=U16_GET_SUPPLEMENTARY(c2, c); 443 } 444 } 445 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) { 446 src+=U16_LENGTH(c); 447 } else { 448 break; 449 } 450 } 451 } 452 // copy these code units all at once 453 if(src!=prevSrc) { 454 if(buffer!=NULL) { 455 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) { 456 break; 457 } 458 } else { 459 prevCC=0; 460 prevBoundary=src; 461 } 462 } 463 if(src==limit) { 464 break; 465 } 466 467 // Check one above-minimum, relevant code point. 
468 src+=U16_LENGTH(c); 469 if(buffer!=NULL) { 470 if(!decompose(c, norm16, *buffer, errorCode)) { 471 break; 472 } 473 } else { 474 if(isDecompYes(norm16)) { 475 uint8_t cc=getCCFromYesOrMaybe(norm16); 476 if(prevCC<=cc || cc==0) { 477 prevCC=cc; 478 if(cc<=1) { 479 prevBoundary=src; 480 } 481 continue; 482 } 483 } 484 return prevBoundary; // "no" or cc out of order 485 } 486 } 487 return src; 488 } 489 490 // Decompose a short piece of text which is likely to contain characters that 491 // fail the quick check loop and/or where the quick check loop's overhead 492 // is unlikely to be amortized. 493 // Called by the compose() and makeFCD() implementations. 494 UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit, 495 ReorderingBuffer &buffer, 496 UErrorCode &errorCode) const { 497 while(src<limit) { 498 UChar32 c; 499 uint16_t norm16; 500 UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16); 501 if(!decompose(c, norm16, buffer, errorCode)) { 502 return FALSE; 503 } 504 } 505 return TRUE; 506 } 507 508 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16, 509 ReorderingBuffer &buffer, 510 UErrorCode &errorCode) const { 511 // Only loops for 1:1 algorithmic mappings. 
512 for(;;) { 513 // get the decomposition and the lead and trail cc's 514 if(isDecompYes(norm16)) { 515 // c does not decompose 516 return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode); 517 } else if(isHangul(norm16)) { 518 // Hangul syllable: decompose algorithmically 519 UChar jamos[3]; 520 return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode); 521 } else if(isDecompNoAlgorithmic(norm16)) { 522 c=mapAlgorithmic(c, norm16); 523 norm16=getNorm16(c); 524 } else { 525 // c decomposes, get everything from the variable-length extra data 526 const uint16_t *mapping=getMapping(norm16); 527 uint16_t firstUnit=*mapping++; 528 int32_t length=firstUnit&MAPPING_LENGTH_MASK; 529 uint8_t leadCC, trailCC; 530 trailCC=(uint8_t)(firstUnit>>8); 531 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { 532 leadCC=(uint8_t)(*mapping++>>8); 533 } else { 534 leadCC=0; 535 } 536 return buffer.append((const UChar *)mapping, length, leadCC, trailCC, errorCode); 537 } 538 } 539 } 540 541 const UChar * 542 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const { 543 const UChar *decomp=NULL; 544 uint16_t norm16; 545 for(;;) { 546 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 547 // c does not decompose 548 return decomp; 549 } else if(isHangul(norm16)) { 550 // Hangul syllable: decompose algorithmically 551 length=Hangul::decompose(c, buffer); 552 return buffer; 553 } else if(isDecompNoAlgorithmic(norm16)) { 554 c=mapAlgorithmic(c, norm16); 555 decomp=buffer; 556 length=0; 557 U16_APPEND_UNSAFE(buffer, length, c); 558 } else { 559 // c decomposes, get everything from the variable-length extra data 560 const uint16_t *mapping=getMapping(norm16); 561 uint16_t firstUnit=*mapping++; 562 length=firstUnit&MAPPING_LENGTH_MASK; 563 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { 564 ++mapping; 565 } 566 return (const UChar *)mapping; 567 } 568 } 569 } 570 571 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit, 
572 UBool doDecompose, 573 ReorderingBuffer &buffer, 574 UErrorCode &errorCode) const { 575 if(doDecompose) { 576 decompose(src, limit, &buffer, errorCode); 577 return; 578 } 579 // Just merge the strings at the boundary. 580 ForwardUTrie2StringIterator iter(normTrie, src, limit); 581 uint8_t firstCC, prevCC, cc; 582 firstCC=prevCC=cc=getCC(iter.next16()); 583 while(cc!=0) { 584 prevCC=cc; 585 cc=getCC(iter.next16()); 586 }; 587 buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode) && 588 buffer.appendZeroCC(iter.codePointStart, limit, errorCode); 589 } 590 591 // Note: hasDecompBoundary() could be implemented as aliases to 592 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter() 593 // at the cost of building the FCD trie for a decomposition normalizer. 594 UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const { 595 for(;;) { 596 if(c<minDecompNoCP) { 597 return TRUE; 598 } 599 uint16_t norm16=getNorm16(c); 600 if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) { 601 return TRUE; 602 } else if(norm16>MIN_NORMAL_MAYBE_YES) { 603 return FALSE; // ccc!=0 604 } else if(isDecompNoAlgorithmic(norm16)) { 605 c=mapAlgorithmic(c, norm16); 606 } else { 607 // c decomposes, get everything from the variable-length extra data 608 const uint16_t *mapping=getMapping(norm16); 609 uint16_t firstUnit=*mapping++; 610 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 611 return FALSE; 612 } 613 if(!before) { 614 // decomp after-boundary: same as hasFCDBoundaryAfter(), 615 // fcd16<=1 || trailCC==0 616 if(firstUnit>0x1ff) { 617 return FALSE; // trailCC>1 618 } 619 if(firstUnit<=0xff) { 620 return TRUE; // trailCC==0 621 } 622 // if(trailCC==1) test leadCC==0, same as checking for before-boundary 623 } 624 // TRUE if leadCC==0 (hasFCDBoundaryBefore()) 625 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*mapping&0xff00)==0; 626 } 627 } 628 } 629 630 /* 631 * Finds the recomposition result for 632 * a forward-combining "lead" character, 633 * 
specified with a pointer to its compositions list, 634 * and a backward-combining "trail" character. 635 * 636 * If the lead and trail characters combine, then this function returns 637 * the following "compositeAndFwd" value: 638 * Bits 21..1 composite character 639 * Bit 0 set if the composite is a forward-combining starter 640 * otherwise it returns -1. 641 * 642 * The compositions list has (trail, compositeAndFwd) pair entries, 643 * encoded as either pairs or triples of 16-bit units. 644 * The last entry has the high bit of its first unit set. 645 * 646 * The list is sorted by ascending trail characters (there are no duplicates). 647 * A linear search is used. 648 * 649 * See normalizer2impl.h for a more detailed description 650 * of the compositions list format. 651 */ 652 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) { 653 uint16_t key1, firstUnit; 654 if(trail<COMP_1_TRAIL_LIMIT) { 655 // trail character is 0..33FF 656 // result entry may have 2 or 3 units 657 key1=(uint16_t)(trail<<1); 658 while(key1>(firstUnit=*list)) { 659 list+=2+(firstUnit&COMP_1_TRIPLE); 660 } 661 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 662 if(firstUnit&COMP_1_TRIPLE) { 663 return ((int32_t)list[1]<<16)|list[2]; 664 } else { 665 return list[1]; 666 } 667 } 668 } else { 669 // trail character is 3400..10FFFF 670 // result entry has 3 units 671 key1=(uint16_t)(COMP_1_TRAIL_LIMIT+ 672 (((trail>>COMP_1_TRAIL_SHIFT))& 673 ~COMP_1_TRIPLE)); 674 uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT); 675 uint16_t secondUnit; 676 for(;;) { 677 if(key1>(firstUnit=*list)) { 678 list+=2+(firstUnit&COMP_1_TRIPLE); 679 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 680 if(key2>(secondUnit=list[1])) { 681 if(firstUnit&COMP_1_LAST_TUPLE) { 682 break; 683 } else { 684 list+=3; 685 } 686 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { 687 return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2]; 688 } else { 689 break; 690 } 691 } else { 692 break; 693 } 694 } 695 } 696 
return -1; 697 } 698 699 /** 700 * @param list some character's compositions list 701 * @param set recursively receives the composites from these compositions 702 */ 703 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const { 704 uint16_t firstUnit; 705 int32_t compositeAndFwd; 706 do { 707 firstUnit=*list; 708 if((firstUnit&COMP_1_TRIPLE)==0) { 709 compositeAndFwd=list[1]; 710 list+=2; 711 } else { 712 compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2]; 713 list+=3; 714 } 715 UChar32 composite=compositeAndFwd>>1; 716 if((compositeAndFwd&1)!=0) { 717 addComposites(getCompositionsListForComposite(getNorm16(composite)), set); 718 } 719 set.add(composite); 720 } while((firstUnit&COMP_1_LAST_TUPLE)==0); 721 } 722 723 /* 724 * Recomposes the buffer text starting at recomposeStartIndex 725 * (which is in NFD - decomposed and canonically ordered), 726 * and truncates the buffer contents. 727 * 728 * Note that recomposition never lengthens the text: 729 * Any character consists of either one or two code units; 730 * a composition may contain at most one more code unit than the original starter, 731 * while the combining mark that is removed has at least one code unit. 732 */ 733 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex, 734 UBool onlyContiguous) const { 735 UChar *p=buffer.getStart()+recomposeStartIndex; 736 UChar *limit=buffer.getLimit(); 737 if(p==limit) { 738 return; 739 } 740 741 UChar *starter, *pRemove, *q, *r; 742 const uint16_t *compositionsList; 743 UChar32 c, compositeAndFwd; 744 uint16_t norm16; 745 uint8_t cc, prevCC; 746 UBool starterIsSupplementary; 747 748 // Some of the following variables are not used until we have a forward-combining starter 749 // and are only initialized now to avoid compiler warnings. 
750 compositionsList=NULL; // used as indicator for whether we have a forward-combining starter 751 starter=NULL; 752 starterIsSupplementary=FALSE; 753 prevCC=0; 754 755 for(;;) { 756 UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16); 757 cc=getCCFromYesOrMaybe(norm16); 758 if( // this character combines backward and 759 isMaybe(norm16) && 760 // we have seen a starter that combines forward and 761 compositionsList!=NULL && 762 // the backward-combining character is not blocked 763 (prevCC<cc || prevCC==0) 764 ) { 765 if(isJamoVT(norm16)) { 766 // c is a Jamo V/T, see if we can compose it with the previous character. 767 if(c<Hangul::JAMO_T_BASE) { 768 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 769 UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE); 770 if(prev<Hangul::JAMO_L_COUNT) { 771 pRemove=p-1; 772 UChar syllable=(UChar) 773 (Hangul::HANGUL_BASE+ 774 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))* 775 Hangul::JAMO_T_COUNT); 776 UChar t; 777 if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) { 778 ++p; 779 syllable+=t; // The next character was a Jamo T. 780 } 781 *starter=syllable; 782 // remove the Jamo V/T 783 q=pRemove; 784 r=p; 785 while(r<limit) { 786 *q++=*r++; 787 } 788 limit=q; 789 p=pRemove; 790 } 791 } 792 /* 793 * No "else" for Jamo T: 794 * Since the input is in NFD, there are no Hangul LV syllables that 795 * a Jamo T could combine with. 796 * All Jamo Ts are combined above when handling Jamo Vs. 797 */ 798 if(p==limit) { 799 break; 800 } 801 compositionsList=NULL; 802 continue; 803 } else if((compositeAndFwd=combine(compositionsList, c))>=0) { 804 // The starter and the combining mark (c) do combine. 805 UChar32 composite=compositeAndFwd>>1; 806 807 // Replace the starter with the composite, remove the combining mark. 
808 pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the combining mark 809 if(starterIsSupplementary) { 810 if(U_IS_SUPPLEMENTARY(composite)) { 811 // both are supplementary 812 starter[0]=U16_LEAD(composite); 813 starter[1]=U16_TRAIL(composite); 814 } else { 815 *starter=(UChar)composite; 816 // The composite is shorter than the starter, 817 // move the intermediate characters forward one. 818 starterIsSupplementary=FALSE; 819 q=starter+1; 820 r=q+1; 821 while(r<pRemove) { 822 *q++=*r++; 823 } 824 --pRemove; 825 } 826 } else if(U_IS_SUPPLEMENTARY(composite)) { 827 // The composite is longer than the starter, 828 // move the intermediate characters back one. 829 starterIsSupplementary=TRUE; 830 ++starter; // temporarily increment for the loop boundary 831 q=pRemove; 832 r=++pRemove; 833 while(starter<q) { 834 *--r=*--q; 835 } 836 *starter=U16_TRAIL(composite); 837 *--starter=U16_LEAD(composite); // undo the temporary increment 838 } else { 839 // both are on the BMP 840 *starter=(UChar)composite; 841 } 842 843 /* remove the combining mark by moving the following text over it */ 844 if(pRemove<p) { 845 q=pRemove; 846 r=p; 847 while(r<limit) { 848 *q++=*r++; 849 } 850 limit=q; 851 p=pRemove; 852 } 853 // Keep prevCC because we removed the combining mark. 854 855 if(p==limit) { 856 break; 857 } 858 // Is the composite a starter that combines forward? 859 if(compositeAndFwd&1) { 860 compositionsList= 861 getCompositionsListForComposite(getNorm16(composite)); 862 } else { 863 compositionsList=NULL; 864 } 865 866 // We combined; continue with looking for compositions. 867 continue; 868 } 869 } 870 871 // no combination this time 872 prevCC=cc; 873 if(p==limit) { 874 break; 875 } 876 877 // If c did not combine, then check if it is a starter. 878 if(cc==0) { 879 // Found a new starter. 880 if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) { 881 // It may combine with something, prepare for it. 
882 if(U_IS_BMP(c)) { 883 starterIsSupplementary=FALSE; 884 starter=p-1; 885 } else { 886 starterIsSupplementary=TRUE; 887 starter=p-2; 888 } 889 } 890 } else if(onlyContiguous) { 891 // FCC: no discontiguous compositions; any intervening character blocks. 892 compositionsList=NULL; 893 } 894 } 895 buffer.setReorderingLimit(limit); 896 } 897 898 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. 899 // doCompose: normalize 900 // !doCompose: isNormalized (buffer must be empty and initialized) 901 UBool 902 Normalizer2Impl::compose(const UChar *src, const UChar *limit, 903 UBool onlyContiguous, 904 UBool doCompose, 905 ReorderingBuffer &buffer, 906 UErrorCode &errorCode) const { 907 /* 908 * prevBoundary points to the last character before the current one 909 * that has a composition boundary before it with ccc==0 and quick check "yes". 910 * Keeping track of prevBoundary saves us looking for a composition boundary 911 * when we find a "no" or "maybe". 912 * 913 * When we back out from prevSrc back to prevBoundary, 914 * then we also remove those same characters (which had been simply copied 915 * or canonically-order-inserted) from the ReorderingBuffer. 916 * Therefore, at all times, the [prevBoundary..prevSrc[ source units 917 * must correspond 1:1 to destination units at the end of the destination buffer. 918 */ 919 const UChar *prevBoundary=src; 920 UChar32 minNoMaybeCP=minCompNoMaybeCP; 921 if(limit==NULL) { 922 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, 923 doCompose ? &buffer : NULL, 924 errorCode); 925 if(U_FAILURE(errorCode)) { 926 return FALSE; 927 } 928 if(prevBoundary<src) { 929 // Set prevBoundary to the last character in the prefix. 
930 prevBoundary=src-1; 931 } 932 limit=u_strchr(src, 0); 933 } 934 935 const UChar *prevSrc; 936 UChar32 c=0; 937 uint16_t norm16=0; 938 939 // only for isNormalized 940 uint8_t prevCC=0; 941 942 for(;;) { 943 // count code units below the minimum or with irrelevant data for the quick check 944 for(prevSrc=src; src!=limit;) { 945 if( (c=*src)<minNoMaybeCP || 946 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c)) 947 ) { 948 ++src; 949 } else if(!U16_IS_SURROGATE(c)) { 950 break; 951 } else { 952 UChar c2; 953 if(U16_IS_SURROGATE_LEAD(c)) { 954 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 955 c=U16_GET_SUPPLEMENTARY(c, c2); 956 } 957 } else /* trail surrogate */ { 958 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { 959 --src; 960 c=U16_GET_SUPPLEMENTARY(c2, c); 961 } 962 } 963 if(isCompYesAndZeroCC(norm16=getNorm16(c))) { 964 src+=U16_LENGTH(c); 965 } else { 966 break; 967 } 968 } 969 } 970 // copy these code units all at once 971 if(src!=prevSrc) { 972 if(doCompose) { 973 if(!buffer.appendZeroCC(prevSrc, src, errorCode)) { 974 break; 975 } 976 } else { 977 prevCC=0; 978 } 979 if(src==limit) { 980 break; 981 } 982 // Set prevBoundary to the last character in the quick check loop. 983 prevBoundary=src-1; 984 if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary && 985 U16_IS_LEAD(*(prevBoundary-1)) 986 ) { 987 --prevBoundary; 988 } 989 // The start of the current character (c). 990 prevSrc=src; 991 } else if(src==limit) { 992 break; 993 } 994 995 src+=U16_LENGTH(c); 996 /* 997 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 998 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) 999 * or has ccc!=0. 1000 * Check for Jamo V/T, then for regular characters. 1001 * c is not a Hangul syllable or Jamo L because those have "yes" properties. 
1002 */ 1003 if(isJamoVT(norm16) && prevBoundary!=prevSrc) { 1004 UChar prev=*(prevSrc-1); 1005 UBool needToDecompose=FALSE; 1006 if(c<Hangul::JAMO_T_BASE) { 1007 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 1008 prev=(UChar)(prev-Hangul::JAMO_L_BASE); 1009 if(prev<Hangul::JAMO_L_COUNT) { 1010 if(!doCompose) { 1011 return FALSE; 1012 } 1013 UChar syllable=(UChar) 1014 (Hangul::HANGUL_BASE+ 1015 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))* 1016 Hangul::JAMO_T_COUNT); 1017 UChar t; 1018 if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) { 1019 ++src; 1020 syllable+=t; // The next character was a Jamo T. 1021 prevBoundary=src; 1022 buffer.setLastChar(syllable); 1023 continue; 1024 } 1025 // If we see L+V+x where x!=T then we drop to the slow path, 1026 // decompose and recompose. 1027 // This is to deal with NFKC finding normal L and V but a 1028 // compatibility variant of a T. We need to either fully compose that 1029 // combination here (which would complicate the code and may not work 1030 // with strange custom data) or use the slow path -- or else our replacing 1031 // two input characters (L+V) with one output character (LV syllable) 1032 // would violate the invariant that [prevBoundary..prevSrc[ has the same 1033 // length as what we appended to the buffer since prevBoundary. 1034 needToDecompose=TRUE; 1035 } 1036 } else if(Hangul::isHangulWithoutJamoT(prev)) { 1037 // c is a Jamo Trailing consonant, 1038 // compose with previous Hangul LV that does not contain a Jamo T. 1039 if(!doCompose) { 1040 return FALSE; 1041 } 1042 buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE)); 1043 prevBoundary=src; 1044 continue; 1045 } 1046 if(!needToDecompose) { 1047 // The Jamo V/T did not compose into a Hangul syllable. 
1048 if(doCompose) { 1049 if(!buffer.appendBMP((UChar)c, 0, errorCode)) { 1050 break; 1051 } 1052 } else { 1053 prevCC=0; 1054 } 1055 continue; 1056 } 1057 } 1058 /* 1059 * Source buffer pointers: 1060 * 1061 * all done quick check current char not yet 1062 * "yes" but (c) processed 1063 * may combine 1064 * forward 1065 * [-------------[-------------[-------------[-------------[ 1066 * | | | | | 1067 * orig. src prevBoundary prevSrc src limit 1068 * 1069 * 1070 * Destination buffer pointers inside the ReorderingBuffer: 1071 * 1072 * all done might take not filled yet 1073 * characters for 1074 * reordering 1075 * [-------------[-------------[-------------[ 1076 * | | | | 1077 * start reorderStart limit | 1078 * +remainingCap.+ 1079 */ 1080 if(norm16>=MIN_YES_YES_WITH_CC) { 1081 uint8_t cc=(uint8_t)norm16; // cc!=0 1082 if( onlyContiguous && // FCC 1083 (doCompose ? buffer.getLastCC() : prevCC)==0 && 1084 prevBoundary<prevSrc && 1085 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that 1086 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) 1087 // passed the quick check "yes && ccc==0" test. 1088 // Check whether the last character was a "yesYes" or a "yesNo". 1089 // If a "yesNo", then we get its trailing ccc from its 1090 // mapping and check for canonical order. 1091 // All other cases are ok. 1092 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc 1093 ) { 1094 // Fails FCD test, need to decompose and contiguously recompose. 1095 if(!doCompose) { 1096 return FALSE; 1097 } 1098 } else if(doCompose) { 1099 if(!buffer.append(c, cc, errorCode)) { 1100 break; 1101 } 1102 continue; 1103 } else if(prevCC<=cc) { 1104 prevCC=cc; 1105 continue; 1106 } else { 1107 return FALSE; 1108 } 1109 } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) { 1110 return FALSE; 1111 } 1112 1113 /* 1114 * Find appropriate boundaries around this character, 1115 * decompose the source text from between the boundaries, 1116 * and recompose it. 
*
         * We may need to remove the last few characters from the ReorderingBuffer
         * to account for source text that was copied or appended
         * but needs to take part in the recomposition.
         */

        /*
         * Find the last composition boundary in [prevBoundary..src[.
         * It is either the decomposition of the current character (at prevSrc),
         * or prevBoundary.
         */
        if(hasCompBoundaryBefore(c, norm16)) {
            prevBoundary=prevSrc;
        } else if(doCompose) {
            buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
        }

        // Find the next composition boundary in [src..limit[ -
        // modifies src to point to the next starter.
        src=(UChar *)findNextCompBoundary(src, limit);

        // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
        int32_t recomposeStartIndex=buffer.length();
        if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
            break;
        }
        recompose(buffer, recomposeStartIndex, onlyContiguous);
        if(!doCompose) {
            // Test-only mode: the recomposed text must match the source exactly.
            if(!buffer.equals(prevBoundary, src)) {
                return FALSE;
            }
            buffer.remove();
            prevCC=0;
        }

        // Move to the next starter. We never need to look back before this point again.
        prevBoundary=src;
    }
    return TRUE;
}

// Very similar to compose(): Make the same changes in both places if relevant.
// pQCResult==NULL: spanQuickCheckYes
// pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
//
// Scans [src..limit[ without writing any output.
// If limit==NULL, src is treated as NUL-terminated and the limit is located with u_strchr().
// Return value:
// - the end of the text if every character passed the checks in this function
// - otherwise prevBoundary, the start of the last character seen that passed the
//   "yes && ccc==0" test (or the original src if there was none)
// When pQCResult!=NULL, a "maybe" character sets *pQCResult=UNORM_MAYBE and scanning continues;
// a failing character sets *pQCResult=UNORM_NO and returns.
// When pQCResult==NULL, the function returns at the first "maybe" or failing character.
const UChar *
Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
                                   UBool onlyContiguous,
                                   UNormalizationCheckResult *pQCResult) const {
    /*
     * prevBoundary points to the last character before the current one
     * that has a composition boundary before it with ccc==0 and quick check "yes".
     */
    const UChar *prevBoundary=src;
    UChar32 minNoMaybeCP=minCompNoMaybeCP;
    if(limit==NULL) {
        // NUL-terminated input: skip the leading code units below minNoMaybeCP.
        // No buffer is passed (NULL), so nothing is copied here.
        UErrorCode errorCode=U_ZERO_ERROR;
        src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
        if(prevBoundary<src) {
            // Set prevBoundary to the last character in the prefix.
            prevBoundary=src-1;
        }
        limit=u_strchr(src, 0);
    }

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t norm16=0;
    uint8_t prevCC=0;

    for(;;) {
        // count code units below the minimum or with irrelevant data for the quick check
        for(prevSrc=src;;) {
            if(src==limit) {
                // Reached the end without finding a problem.
                return src;
            }
            if( (c=*src)<minNoMaybeCP ||
                isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
            ) {
                ++src;
            } else if(!U16_IS_SURROGATE(c)) {
                break;
            } else {
                // Surrogate code unit: assemble the supplementary code point if it is
                // part of a pair, then look up the data for the whole code point.
                UChar c2;
                if(U16_IS_SURROGATE_LEAD(c)) {
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                    }
                } else /* trail surrogate */ {
                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                        // Back up to the lead surrogate so that src points to the
                        // start of the code point.
                        --src;
                        c=U16_GET_SUPPLEMENTARY(c2, c);
                    }
                }
                if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        if(src!=prevSrc) {
            // Set prevBoundary to the last character in the quick check loop.
            prevBoundary=src-1;
            if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
                U16_IS_LEAD(*(prevBoundary-1))
            ) {
                // The last character is a surrogate pair; point to its lead surrogate.
                --prevBoundary;
            }
            prevCC=0;
            // The start of the current character (c).
            prevSrc=src;
        }

        src+=U16_LENGTH(c);
        /*
         * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
         * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
         * or has ccc!=0.
         */
        if(isMaybeOrNonZeroCC(norm16)) {
            uint8_t cc=getCCFromYesOrMaybe(norm16);
            if( onlyContiguous &&  // FCC
                cc!=0 &&
                prevCC==0 &&
                prevBoundary<prevSrc &&
                // prevCC==0 && prevBoundary<prevSrc tell us that
                // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
                // passed the quick check "yes && ccc==0" test.
                // Check whether the last character was a "yesYes" or a "yesNo".
                // If a "yesNo", then we get its trailing ccc from its
                // mapping and check for canonical order.
                // All other cases are ok.
                getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
            ) {
                // Fails FCD test.
                // Falls through to the "no" result after this if-statement.
            } else if(prevCC<=cc || cc==0) {
                // Combining classes are in order (or c has cc==0).
                prevCC=cc;
                if(norm16<MIN_YES_YES_WITH_CC) {
                    // norm16 is in the "maybe" range below MIN_YES_YES_WITH_CC.
                    if(pQCResult!=NULL) {
                        *pQCResult=UNORM_MAYBE;
                    } else {
                        return prevBoundary;
                    }
                }
                continue;
            }
        }
        // Quick check "no": either a mapping, or out-of-order combining classes.
        if(pQCResult!=NULL) {
            *pQCResult=UNORM_NO;
        }
        return prevBoundary;
    }
}

// Appends [src..limit[ to the buffer.
// If the buffer is not empty and src does not start at a composition boundary
// (as reported by findNextCompBoundary()), then the text from the last composition
// boundary in the buffer is removed from the buffer, joined with the part of src
// before its first boundary, and composed into the buffer (always with doCompose=TRUE).
// The remainder of src is then composed if doCompose, or else appended as is
// via buffer.appendZeroCC().
void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
                                       UBool doCompose,
                                       UBool onlyContiguous,
                                       ReorderingBuffer &buffer,
                                       UErrorCode &errorCode) const {
    if(!buffer.isEmpty()) {
        const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
        if(src!=firstStarterInSrc) {
            const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
                                                                    buffer.getLimit());
            // Copy the buffer's tail before removing it from the buffer.
            UnicodeString middle(lastStarterInDest,
                                 (int32_t)(buffer.getLimit()-lastStarterInDest));
            buffer.removeSuffix((int32_t)(buffer.getLimit()-lastStarterInDest));
            middle.append(src, (int32_t)(firstStarterInSrc-src));
            const UChar *middleStart=middle.getBuffer();
            compose(middleStart, middleStart+middle.length(), onlyContiguous,
                    TRUE, buffer, errorCode);
            if(U_FAILURE(errorCode)) {
                return;
            }
            src=firstStarterInSrc;
        }
    }
    if(doCompose) {
        compose(src, limit, onlyContiguous,
TRUE, buffer, errorCode); 1296 } else { 1297 buffer.appendZeroCC(src, limit, errorCode); 1298 } 1299 } 1300 1301 /** 1302 * Does c have a composition boundary before it? 1303 * True if its decomposition begins with a character that has 1304 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). 1305 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes 1306 * (isCompYesAndZeroCC()) so we need not decompose. 1307 */ 1308 UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const { 1309 for(;;) { 1310 if(isCompYesAndZeroCC(norm16)) { 1311 return TRUE; 1312 } else if(isMaybeOrNonZeroCC(norm16)) { 1313 return FALSE; 1314 } else if(isDecompNoAlgorithmic(norm16)) { 1315 c=mapAlgorithmic(c, norm16); 1316 norm16=getNorm16(c); 1317 } else { 1318 // c decomposes, get everything from the variable-length extra data 1319 const uint16_t *mapping=getMapping(norm16); 1320 uint16_t firstUnit=*mapping++; 1321 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 1322 return FALSE; 1323 } 1324 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*mapping++&0xff00)) { 1325 return FALSE; // non-zero leadCC 1326 } 1327 int32_t i=0; 1328 UChar32 c; 1329 U16_NEXT_UNSAFE(mapping, i, c); 1330 return isCompYesAndZeroCC(getNorm16(c)); 1331 } 1332 } 1333 } 1334 1335 UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const { 1336 for(;;) { 1337 uint16_t norm16=getNorm16(c); 1338 if(isInert(norm16)) { 1339 return TRUE; 1340 } else if(norm16<=minYesNo) { 1341 // Hangul LVT (==minYesNo) has a boundary after it. 1342 // Hangul LV and non-inert yesYes characters combine forward. 1343 return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c); 1344 } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) { 1345 return FALSE; 1346 } else if(isDecompNoAlgorithmic(norm16)) { 1347 c=mapAlgorithmic(c, norm16); 1348 } else { 1349 // c decomposes, get everything from the variable-length extra data. 
1350 // If testInert, then c must be a yesNo character which has lccc=0, 1351 // otherwise it could be a noNo. 1352 const uint16_t *mapping=getMapping(norm16); 1353 uint16_t firstUnit=*mapping; 1354 // TRUE if 1355 // c is not deleted, and 1356 // it and its decomposition do not combine forward, and it has a starter, and 1357 // if FCC then trailCC<=1 1358 return 1359 (firstUnit&MAPPING_LENGTH_MASK)!=0 && 1360 (firstUnit&(MAPPING_PLUS_COMPOSITION_LIST|MAPPING_NO_COMP_BOUNDARY_AFTER))==0 && 1361 (!onlyContiguous || firstUnit<=0x1ff); 1362 } 1363 } 1364 } 1365 1366 const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const { 1367 BackwardUTrie2StringIterator iter(normTrie, start, p); 1368 uint16_t norm16; 1369 do { 1370 norm16=iter.previous16(); 1371 } while(!hasCompBoundaryBefore(iter.codePoint, norm16)); 1372 // We could also test hasCompBoundaryAfter() and return iter.codePointLimit, 1373 // but that's probably not worth the extra cost. 1374 return iter.codePointStart; 1375 } 1376 1377 const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const { 1378 ForwardUTrie2StringIterator iter(normTrie, p, limit); 1379 uint16_t norm16; 1380 do { 1381 norm16=iter.next16(); 1382 } while(!hasCompBoundaryBefore(iter.codePoint, norm16)); 1383 return iter.codePointStart; 1384 } 1385 1386 class FCDTrieSingleton : public UTrie2Singleton { 1387 public: 1388 FCDTrieSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) : 1389 UTrie2Singleton(s), impl(ni), errorCode(ec) {} 1390 UTrie2 *getInstance(UErrorCode &errorCode) { 1391 return UTrie2Singleton::getInstance(createInstance, this, errorCode); 1392 } 1393 static void *createInstance(const void *context, UErrorCode &errorCode); 1394 UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 1395 if(value!=0) { 1396 impl.setFCD16FromNorm16(start, end, (uint16_t)value, newFCDTrie, errorCode); 1397 } 1398 return U_SUCCESS(errorCode); 1399 } 1400 
1401 Normalizer2Impl &impl; 1402 UTrie2 *newFCDTrie; 1403 UErrorCode &errorCode; 1404 }; 1405 1406 U_CDECL_BEGIN 1407 1408 // Set the FCD value for a range of same-norm16 characters. 1409 static UBool U_CALLCONV 1410 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { 1411 return ((FCDTrieSingleton *)context)->rangeHandler(start, end, value); 1412 } 1413 1414 // Collect (OR together) the FCD values for a range of supplementary characters, 1415 // for their lead surrogate code unit. 1416 static UBool U_CALLCONV 1417 enumRangeOrValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) { 1418 *((uint32_t *)context)|=value; 1419 return TRUE; 1420 } 1421 1422 U_CDECL_END 1423 1424 void *FCDTrieSingleton::createInstance(const void *context, UErrorCode &errorCode) { 1425 FCDTrieSingleton *me=(FCDTrieSingleton *)context; 1426 me->newFCDTrie=utrie2_open(0, 0, &errorCode); 1427 if(U_SUCCESS(errorCode)) { 1428 utrie2_enum(me->impl.getNormTrie(), NULL, enumRangeHandler, me); 1429 for(UChar lead=0xd800; lead<0xdc00; ++lead) { 1430 uint32_t oredValue=utrie2_get32(me->newFCDTrie, lead); 1431 utrie2_enumForLeadSurrogate(me->newFCDTrie, lead, NULL, enumRangeOrValue, &oredValue); 1432 if(oredValue!=0) { 1433 // Set a "bad" value for makeFCD() to break the quick check loop 1434 // and look up the value for the supplementary code point. 1435 // If there is any lccc, then set the worst-case lccc of 1. 1436 // The ORed-together value's tccc is already the worst case. 
1437 if(oredValue>0xff) { 1438 oredValue=0x100|(oredValue&0xff); 1439 } 1440 utrie2_set32ForLeadSurrogateCodeUnit(me->newFCDTrie, lead, oredValue, &errorCode); 1441 } 1442 } 1443 utrie2_freeze(me->newFCDTrie, UTRIE2_16_VALUE_BITS, &errorCode); 1444 if(U_SUCCESS(errorCode)) { 1445 return me->newFCDTrie; 1446 } 1447 } 1448 utrie2_close(me->newFCDTrie); 1449 return NULL; 1450 } 1451 1452 void Normalizer2Impl::setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16, 1453 UTrie2 *newFCDTrie, UErrorCode &errorCode) const { 1454 // Only loops for 1:1 algorithmic mappings. 1455 for(;;) { 1456 if(norm16>=MIN_NORMAL_MAYBE_YES) { 1457 norm16&=0xff; 1458 norm16|=norm16<<8; 1459 } else if(norm16<=minYesNo || minMaybeYes<=norm16) { 1460 // no decomposition or Hangul syllable, all zeros 1461 break; 1462 } else if(limitNoNo<=norm16) { 1463 int32_t delta=norm16-(minMaybeYes-MAX_DELTA-1); 1464 if(start==end) { 1465 start+=delta; 1466 norm16=getNorm16(start); 1467 } else { 1468 // the same delta leads from different original characters to different mappings 1469 do { 1470 UChar32 c=start+delta; 1471 setFCD16FromNorm16(c, c, getNorm16(c), newFCDTrie, errorCode); 1472 } while(++start<=end); 1473 break; 1474 } 1475 } else { 1476 // c decomposes, get everything from the variable-length extra data 1477 const uint16_t *mapping=getMapping(norm16); 1478 uint16_t firstUnit=*mapping; 1479 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 1480 // A character that is deleted (maps to an empty string) must 1481 // get the worst-case lccc and tccc values because arbitrary 1482 // characters on both sides will become adjacent. 
1483 norm16=0x1ff; 1484 } else { 1485 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { 1486 norm16=mapping[1]&0xff00; // lccc 1487 } else { 1488 norm16=0; 1489 } 1490 norm16|=firstUnit>>8; // tccc 1491 } 1492 } 1493 utrie2_setRange32(newFCDTrie, start, end, norm16, TRUE, &errorCode); 1494 break; 1495 } 1496 } 1497 1498 const UTrie2 *Normalizer2Impl::getFCDTrie(UErrorCode &errorCode) const { 1499 // Logically const: Synchronized instantiation. 1500 Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this); 1501 return FCDTrieSingleton(me->fcdTrieSingleton, *me, errorCode).getInstance(errorCode); 1502 } 1503 1504 // Dual functionality: 1505 // buffer!=NULL: normalize 1506 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 1507 const UChar * 1508 Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit, 1509 ReorderingBuffer *buffer, 1510 UErrorCode &errorCode) const { 1511 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. 1512 // Similar to the prevBoundary in the compose() implementation. 1513 const UChar *prevBoundary=src; 1514 int32_t prevFCD16=0; 1515 if(limit==NULL) { 1516 src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode); 1517 if(U_FAILURE(errorCode)) { 1518 return src; 1519 } 1520 if(prevBoundary<src) { 1521 prevBoundary=src; 1522 // We know that the previous character's lccc==0. 1523 // Fetching the fcd16 value was deferred for this below-U+0300 code point. 1524 prevFCD16=getFCD16FromSingleLead(*(src-1)); 1525 if(prevFCD16>1) { 1526 --prevBoundary; 1527 } 1528 } 1529 limit=u_strchr(src, 0); 1530 } 1531 1532 // Note: In this function we use buffer->appendZeroCC() because we track 1533 // the lead and trail combining classes here, rather than leaving it to 1534 // the ReorderingBuffer. 1535 // The exception is the call to decomposeShort() which uses the buffer 1536 // in the normal way. 
const UTrie2 *trie=fcdTrie();

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t fcd16=0;

    for(;;) {
        // count code units with lccc==0
        for(prevSrc=src; src!=limit;) {
            if((c=*src)<MIN_CCC_LCCC_CP) {
                // Defer the trie lookup: remember the code unit as a negative value (~c).
                prevFCD16=~c;
                ++src;
            } else if((fcd16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, c))<=0xff) {
                // fcd16<=0xff means that the upper byte (lccc) is 0.
                prevFCD16=fcd16;
                ++src;
            } else if(!U16_IS_SURROGATE(c)) {
                break;
            } else {
                // Surrogate code unit: assemble the supplementary code point if it is
                // part of a pair, then look up the value for the whole code point.
                UChar c2;
                if(U16_IS_SURROGATE_LEAD(c)) {
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                    }
                } else /* trail surrogate */ {
                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                        --src;
                        c=U16_GET_SUPPLEMENTARY(c2, c);
                    }
                }
                if((fcd16=getFCD16(c))<=0xff) {
                    prevFCD16=fcd16;
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        // copy these code units all at once
        if(src!=prevSrc) {
            if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
                break;
            }
            if(src==limit) {
                break;
            }
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            if(prevFCD16<0) {
                // Fetching the fcd16 value was deferred for this below-U+0300 code point.
                prevFCD16=getFCD16FromSingleLead((UChar)~prevFCD16);
                if(prevFCD16>1) {
                    // tccc>1: the boundary is before the previous code unit, not after it.
                    --prevBoundary;
                }
            } else {
                const UChar *p=src-1;
                if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
                    --p;
                    // Need to fetch the previous character's FCD value because
                    // prevFCD16 was just for the trail surrogate code point.
                    prevFCD16=getFCD16FromSurrogatePair(p[0], p[1]);
                    // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
                }
                if(prevFCD16>1) {
                    // tccc>1: the boundary is before the previous character, not after it.
                    prevBoundary=p;
                }
            }
            // The start of the current character (c).
            prevSrc=src;
        } else if(src==limit) {
            break;
        }

        src+=U16_LENGTH(c);
        // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
        // Check for proper order, and decompose locally if necessary.
        if((prevFCD16&0xff)<=(fcd16>>8)) {
            // proper order: prev tccc <= current lccc
            if((fcd16&0xff)<=1) {
                prevBoundary=src;
            }
            if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
                break;
            }
            prevFCD16=fcd16;
            continue;
        } else if(buffer==NULL) {
            return prevBoundary;  // quick check "no"
        } else {
            /*
             * Back out the part of the source that we copied or appended
             * already but is now going to be decomposed.
             * prevSrc is set to after what was copied/appended.
             */
            buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
            /*
             * Find the part of the source that needs to be decomposed,
             * up to the next safe boundary.
             */
            src=findNextFCDBoundary(src, limit);
            /*
             * The source text does not fulfill the conditions for FCD.
             * Decompose and reorder a limited piece of the text.
             */
            if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
                break;
            }
            prevBoundary=src;
            prevFCD16=0;
        }
    }
    return src;
}

// Appends [src..limit[ to the buffer.
// If the buffer is not empty and src does not start at an FCD boundary
// (as reported by findNextFCDBoundary()), then the text from the last FCD boundary
// in the buffer is removed from the buffer, joined with the part of src before its
// first boundary, and passed through makeFCD() into the buffer.
// The remainder of src is then passed through makeFCD() if doMakeFCD,
// or else appended as is via buffer.appendZeroCC().
void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
                                       UBool doMakeFCD,
                                       ReorderingBuffer &buffer,
                                       UErrorCode &errorCode) const {
    if(!buffer.isEmpty()) {
        const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
        if(src!=firstBoundaryInSrc) {
            const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
                                                                    buffer.getLimit());
            // Copy the buffer's tail before removing it from the buffer.
            UnicodeString middle(lastBoundaryInDest,
                                 (int32_t)(buffer.getLimit()-lastBoundaryInDest));
            buffer.removeSuffix((int32_t)(buffer.getLimit()-lastBoundaryInDest));
            middle.append(src, (int32_t)(firstBoundaryInSrc-src));
            const UChar *middleStart=middle.getBuffer();
            makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
            if(U_FAILURE(errorCode)) {
                return;
            }
            src=firstBoundaryInSrc;
        }
    }
    if(doMakeFCD) {
        makeFCD(src, limit, &buffer, errorCode);
    } else {
        buffer.appendZeroCC(src, limit, errorCode);
    }
}

// Iterates backward from p (not before start) and returns the start of the first
// code point found whose FCD value is <=0xff (upper byte, the lccc, is 0).
const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
    BackwardUTrie2StringIterator iter(fcdTrie(), start, p);
    uint16_t fcd16;
    do {
        fcd16=iter.previous16();
    } while(fcd16>0xff);
    return iter.codePointStart;
}

// Iterates forward from p (not beyond limit) and returns the start of the first
// code point found whose FCD value is <=0xff (upper byte, the lccc, is 0).
const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
    ForwardUTrie2StringIterator iter(fcdTrie(), p, limit);
    uint16_t fcd16;
    do {
        fcd16=iter.next16();
    } while(fcd16>0xff);
    return iter.codePointStart;
}

// CanonicalIterator data -------------------------------------------------- ***

CanonIterData::CanonIterData(UErrorCode &errorCode) :
        trie(utrie2_open(0, 0, &errorCode)),
canonStartSets(uhash_deleteUObject, NULL, errorCode) {} 1702 1703 CanonIterData::~CanonIterData() { 1704 utrie2_close(trie); 1705 } 1706 1707 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) { 1708 uint32_t canonValue=utrie2_get32(trie, decompLead); 1709 if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) { 1710 // origin is the first character whose decomposition starts with 1711 // the character for which we are setting the value. 1712 utrie2_set32(trie, decompLead, canonValue|origin, &errorCode); 1713 } else { 1714 // origin is not the first character, or it is U+0000. 1715 UnicodeSet *set; 1716 if((canonValue&CANON_HAS_SET)==0) { 1717 set=new UnicodeSet; 1718 if(set==NULL) { 1719 errorCode=U_MEMORY_ALLOCATION_ERROR; 1720 return; 1721 } 1722 UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK); 1723 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size(); 1724 utrie2_set32(trie, decompLead, canonValue, &errorCode); 1725 canonStartSets.addElement(set, errorCode); 1726 if(firstOrigin!=0) { 1727 set->add(firstOrigin); 1728 } 1729 } else { 1730 set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)]; 1731 } 1732 set->add(origin); 1733 } 1734 } 1735 1736 class CanonIterDataSingleton { 1737 public: 1738 CanonIterDataSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) : 1739 singleton(s), impl(ni), errorCode(ec) {} 1740 CanonIterData *getInstance(UErrorCode &errorCode) { 1741 void *duplicate; 1742 CanonIterData *instance= 1743 (CanonIterData *)singleton.getInstance(createInstance, this, duplicate, errorCode); 1744 delete (CanonIterData *)duplicate; 1745 return instance; 1746 } 1747 static void *createInstance(const void *context, UErrorCode &errorCode); 1748 UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 1749 if(value!=0) { 1750 impl.makeCanonIterDataFromNorm16(start, end, (uint16_t)value, *newData, errorCode); 1751 } 1752 
return U_SUCCESS(errorCode); 1753 } 1754 1755 private: 1756 SimpleSingleton &singleton; 1757 Normalizer2Impl &impl; 1758 CanonIterData *newData; 1759 UErrorCode &errorCode; 1760 }; 1761 1762 U_CDECL_BEGIN 1763 1764 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters. 1765 static UBool U_CALLCONV 1766 enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { 1767 return ((CanonIterDataSingleton *)context)->rangeHandler(start, end, value); 1768 } 1769 1770 U_CDECL_END 1771 1772 void *CanonIterDataSingleton::createInstance(const void *context, UErrorCode &errorCode) { 1773 CanonIterDataSingleton *me=(CanonIterDataSingleton *)context; 1774 me->newData=new CanonIterData(errorCode); 1775 if(me->newData==NULL) { 1776 errorCode=U_MEMORY_ALLOCATION_ERROR; 1777 return NULL; 1778 } 1779 if(U_SUCCESS(errorCode)) { 1780 utrie2_enum(me->impl.getNormTrie(), NULL, enumCIDRangeHandler, me); 1781 utrie2_freeze(me->newData->trie, UTRIE2_32_VALUE_BITS, &errorCode); 1782 if(U_SUCCESS(errorCode)) { 1783 return me->newData; 1784 } 1785 } 1786 delete me->newData; 1787 return NULL; 1788 } 1789 1790 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16, 1791 CanonIterData &newData, 1792 UErrorCode &errorCode) const { 1793 if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) { 1794 // Inert, or 2-way mapping (including Hangul syllable). 1795 // We do not write a canonStartSet for any yesNo character. 1796 // Composites from 2-way mappings are added at runtime from the 1797 // starter's compositions list, and the other characters in 1798 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are 1799 // "maybe" characters. 
return;
    }
    // All code points in start..end share the same norm16.
    for(UChar32 c=start; c<=end; ++c) {
        uint32_t oldValue=utrie2_get32(newData.trie, c);
        uint32_t newValue=oldValue;
        if(norm16>=minMaybeYes) {
            // not a segment starter if it occurs in a decomposition or has cc!=0
            newValue|=CANON_NOT_SEGMENT_STARTER;
            if(norm16<MIN_NORMAL_MAYBE_YES) {
                newValue|=CANON_HAS_COMPOSITIONS;
            }
        } else if(norm16<minYesNo) {
            newValue|=CANON_HAS_COMPOSITIONS;
        } else {
            // c has a one-way decomposition
            UChar32 c2=c;
            uint16_t norm16_2=norm16;
            // Follow 1:1 algorithmic mappings to the character (c2) that they lead to.
            while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
                c2=mapAlgorithmic(c2, norm16_2);
                norm16_2=getNorm16(c2);
            }
            if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
                // c decomposes, get everything from the variable-length extra data
                const uint16_t *mapping=getMapping(norm16_2);
                uint16_t firstUnit=*mapping++;
                int32_t length=firstUnit&MAPPING_LENGTH_MASK;
                if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
                    // c==c2 means that no algorithmic mapping was followed,
                    // so this mapping data belongs to c itself.
                    if(c==c2 && (*mapping&0xff)!=0) {
                        newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
                    }
                    ++mapping;
                }
                // Skip empty mappings (no characters in the decomposition).
                if(length!=0) {
                    // add c to first code point's start set
                    int32_t i=0;
                    U16_NEXT_UNSAFE(mapping, i, c2);
                    newData.addToStartSet(c, c2, errorCode);
                    // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
                    // one-way mapping. A 2-way mapping is possible here after
                    // intermediate algorithmic mapping.
                    if(norm16_2>=minNoNo) {
                        while(i<length) {
                            U16_NEXT_UNSAFE(mapping, i, c2);
                            uint32_t c2Value=utrie2_get32(newData.trie, c2);
                            if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
                                utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,
                                             &errorCode);
                            }
                        }
                    }
                }
            } else {
                // c decomposed to c2 algorithmically; c has cc==0
                newData.addToStartSet(c, c2, errorCode);
            }
        }
        // Write the value for c only if it changed.
        if(newValue!=oldValue) {
            utrie2_set32(newData.trie, c, newValue, &errorCode);
        }
    }
}

// Requests the CanonIterData instance from canonIterDataSingleton
// (see CanonIterDataSingleton::getInstance()).
// Returns TRUE if errorCode indicates success afterwards.
UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
    // Logically const: Synchronized instantiation.
    Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
    CanonIterDataSingleton(me->canonIterDataSingleton, *me, errorCode).getInstance(errorCode);
    return U_SUCCESS(errorCode);
}

// Returns the CanonIterData trie value for c.
// Dereferences canonIterDataSingleton.fInstance without a NULL check;
// presumably ensureCanonIterData() must have succeeded before - verify against callers.
int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
    return (int32_t)utrie2_get32(((CanonIterData *)canonIterDataSingleton.fInstance)->trie, c);
}

// Returns the n-th set from the CanonIterData's canonStartSets.
// Dereferences canonIterDataSingleton.fInstance without a NULL check, and n is not range-checked here.
const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
    return *(const UnicodeSet *)(
        ((CanonIterData *)canonIterDataSingleton.fInstance)->canonStartSets[n]);
}

// TRUE if the canon value for c is not negative as an int32_t.
// NOTE(review): this looks like it relies on CANON_NOT_SEGMENT_STARTER being the sign bit - confirm.
UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
    return getCanonValue(c)>=0;
}

// Fills set with the characters recorded for c in the CanonIterData:
// the stored start set or single stored character, and, if c has CANON_HAS_COMPOSITIONS,
// either a range of Hangul syllables (for norm16==JAMO_L) or the composites
// from c's compositions list.
// Returns FALSE, without modifying set, if nothing is recorded for c
// (ignoring CANON_NOT_SEGMENT_STARTER); otherwise returns TRUE.
UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
    int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
    if(canonValue==0) {
        return FALSE;
    }
    set.clear();
    int32_t value=canonValue&CANON_VALUE_MASK;
    if((canonValue&CANON_HAS_SET)!=0) {
        // value is the index of a set in canonStartSets.
        set.addAll(getCanonStartSet(value));
    } else if(value!=0) {
        // value is a single code point.
        set.add(value);
    }
    if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
        uint16_t norm16=getNorm16(c);
        if(norm16==JAMO_L) {
            // First syllable of the block of JAMO_VT_COUNT syllables for this c.
            UChar32 syllable=
                (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
1900 set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1); 1901 } else { 1902 addComposites(getCompositionsList(norm16), set); 1903 } 1904 } 1905 return TRUE; 1906 } 1907 1908 U_NAMESPACE_END 1909 1910 // Normalizer2 data swapping ----------------------------------------------- *** 1911 1912 U_NAMESPACE_USE 1913 1914 U_CAPI int32_t U_EXPORT2 1915 unorm2_swap(const UDataSwapper *ds, 1916 const void *inData, int32_t length, void *outData, 1917 UErrorCode *pErrorCode) { 1918 const UDataInfo *pInfo; 1919 int32_t headerSize; 1920 1921 const uint8_t *inBytes; 1922 uint8_t *outBytes; 1923 1924 const int32_t *inIndexes; 1925 int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1]; 1926 1927 int32_t i, offset, nextOffset, size; 1928 1929 /* udata_swapDataHeader checks the arguments */ 1930 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 1931 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 1932 return 0; 1933 } 1934 1935 /* check data format and format version */ 1936 pInfo=(const UDataInfo *)((const char *)inData+4); 1937 if(!( 1938 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ 1939 pInfo->dataFormat[1]==0x72 && 1940 pInfo->dataFormat[2]==0x6d && 1941 pInfo->dataFormat[3]==0x32 && 1942 pInfo->formatVersion[0]==1 1943 )) { 1944 udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n", 1945 pInfo->dataFormat[0], pInfo->dataFormat[1], 1946 pInfo->dataFormat[2], pInfo->dataFormat[3], 1947 pInfo->formatVersion[0]); 1948 *pErrorCode=U_UNSUPPORTED_ERROR; 1949 return 0; 1950 } 1951 1952 inBytes=(const uint8_t *)inData+headerSize; 1953 outBytes=(uint8_t *)outData+headerSize; 1954 1955 inIndexes=(const int32_t *)inBytes; 1956 1957 if(length>=0) { 1958 length-=headerSize; 1959 if(length<(int32_t)sizeof(indexes)) { 1960 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n", 1961 length); 1962 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1963 return 
0; 1964 } 1965 } 1966 1967 /* read the first few indexes */ 1968 for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) { 1969 indexes[i]=udata_readInt32(ds, inIndexes[i]); 1970 } 1971 1972 /* get the total length of the data */ 1973 size=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; 1974 1975 if(length>=0) { 1976 if(length<size) { 1977 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n", 1978 length); 1979 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1980 return 0; 1981 } 1982 1983 /* copy the data for inaccessible bytes */ 1984 if(inBytes!=outBytes) { 1985 uprv_memcpy(outBytes, inBytes, size); 1986 } 1987 1988 offset=0; 1989 1990 /* swap the int32_t indexes[] */ 1991 nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]; 1992 ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode); 1993 offset=nextOffset; 1994 1995 /* swap the UTrie2 */ 1996 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]; 1997 utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); 1998 offset=nextOffset; 1999 2000 /* swap the uint16_t extraData[] */ 2001 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET+1]; 2002 ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); 2003 offset=nextOffset; 2004 2005 U_ASSERT(offset==size); 2006 } 2007 2008 return headerSize+size; 2009 } 2010 2011 #endif // !UCONFIG_NO_NORMALIZATION 2012