1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2009-2011, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: normalizer2impl.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2009nov22 14 * created by: Markus W. Scherer 15 */ 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_NORMALIZATION 20 21 #include "unicode/normalizer2.h" 22 #include "unicode/udata.h" 23 #include "unicode/ustring.h" 24 #include "cmemory.h" 25 #include "mutex.h" 26 #include "normalizer2impl.h" 27 #include "uassert.h" 28 #include "uhash.h" 29 #include "uset_imp.h" 30 #include "utrie2.h" 31 #include "uvector.h" 32 33 U_NAMESPACE_BEGIN 34 35 // ReorderingBuffer -------------------------------------------------------- *** 36 37 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) { 38 int32_t length=str.length(); 39 start=str.getBuffer(destCapacity); 40 if(start==NULL) { 41 // getBuffer() already did str.setToBogus() 42 errorCode=U_MEMORY_ALLOCATION_ERROR; 43 return FALSE; 44 } 45 limit=start+length; 46 remainingCapacity=str.getCapacity()-length; 47 reorderStart=start; 48 if(start==limit) { 49 lastCC=0; 50 } else { 51 setIterator(); 52 lastCC=previousCC(); 53 // Set reorderStart after the last code point with cc<=1 if there is one. 
54 if(lastCC>1) { 55 while(previousCC()>1) {} 56 } 57 reorderStart=codePointLimit; 58 } 59 return TRUE; 60 } 61 62 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const { 63 int32_t length=(int32_t)(limit-start); 64 return 65 length==(int32_t)(otherLimit-otherStart) && 66 0==u_memcmp(start, otherStart, length); 67 } 68 69 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) { 70 if(remainingCapacity<2 && !resize(2, errorCode)) { 71 return FALSE; 72 } 73 if(lastCC<=cc || cc==0) { 74 limit[0]=U16_LEAD(c); 75 limit[1]=U16_TRAIL(c); 76 limit+=2; 77 lastCC=cc; 78 if(cc<=1) { 79 reorderStart=limit; 80 } 81 } else { 82 insert(c, cc); 83 } 84 remainingCapacity-=2; 85 return TRUE; 86 } 87 88 UBool ReorderingBuffer::append(const UChar *s, int32_t length, 89 uint8_t leadCC, uint8_t trailCC, 90 UErrorCode &errorCode) { 91 if(length==0) { 92 return TRUE; 93 } 94 if(remainingCapacity<length && !resize(length, errorCode)) { 95 return FALSE; 96 } 97 remainingCapacity-=length; 98 if(lastCC<=leadCC || leadCC==0) { 99 if(trailCC<=1) { 100 reorderStart=limit+length; 101 } else if(leadCC<=1) { 102 reorderStart=limit+1; // Ok if not a code point boundary. 103 } 104 const UChar *sLimit=s+length; 105 do { *limit++=*s++; } while(s!=sLimit); 106 lastCC=trailCC; 107 } else { 108 int32_t i=0; 109 UChar32 c; 110 U16_NEXT(s, i, length, c); 111 insert(c, leadCC); // insert first code point 112 while(i<length) { 113 U16_NEXT(s, i, length, c); 114 if(i<length) { 115 // s must be in NFD, otherwise we need to use getCC(). 
116 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c)); 117 } else { 118 leadCC=trailCC; 119 } 120 append(c, leadCC, errorCode); 121 } 122 } 123 return TRUE; 124 } 125 126 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) { 127 int32_t cpLength=U16_LENGTH(c); 128 if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) { 129 return FALSE; 130 } 131 remainingCapacity-=cpLength; 132 if(cpLength==1) { 133 *limit++=(UChar)c; 134 } else { 135 limit[0]=U16_LEAD(c); 136 limit[1]=U16_TRAIL(c); 137 limit+=2; 138 } 139 lastCC=0; 140 reorderStart=limit; 141 return TRUE; 142 } 143 144 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) { 145 if(s==sLimit) { 146 return TRUE; 147 } 148 int32_t length=(int32_t)(sLimit-s); 149 if(remainingCapacity<length && !resize(length, errorCode)) { 150 return FALSE; 151 } 152 u_memcpy(limit, s, length); 153 limit+=length; 154 remainingCapacity-=length; 155 lastCC=0; 156 reorderStart=limit; 157 return TRUE; 158 } 159 160 void ReorderingBuffer::remove() { 161 reorderStart=limit=start; 162 remainingCapacity=str.getCapacity(); 163 lastCC=0; 164 } 165 166 void ReorderingBuffer::removeSuffix(int32_t suffixLength) { 167 if(suffixLength<(limit-start)) { 168 limit-=suffixLength; 169 remainingCapacity+=suffixLength; 170 } else { 171 limit=start; 172 remainingCapacity=str.getCapacity(); 173 } 174 lastCC=0; 175 reorderStart=limit; 176 } 177 178 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) { 179 int32_t reorderStartIndex=(int32_t)(reorderStart-start); 180 int32_t length=(int32_t)(limit-start); 181 str.releaseBuffer(length); 182 int32_t newCapacity=length+appendLength; 183 int32_t doubleCapacity=2*str.getCapacity(); 184 if(newCapacity<doubleCapacity) { 185 newCapacity=doubleCapacity; 186 } 187 if(newCapacity<256) { 188 newCapacity=256; 189 } 190 start=str.getBuffer(newCapacity); 191 if(start==NULL) { 192 // getBuffer() already did 
str.setToBogus() 193 errorCode=U_MEMORY_ALLOCATION_ERROR; 194 return FALSE; 195 } 196 reorderStart=start+reorderStartIndex; 197 limit=start+length; 198 remainingCapacity=str.getCapacity()-length; 199 return TRUE; 200 } 201 202 void ReorderingBuffer::skipPrevious() { 203 codePointLimit=codePointStart; 204 UChar c=*--codePointStart; 205 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) { 206 --codePointStart; 207 } 208 } 209 210 uint8_t ReorderingBuffer::previousCC() { 211 codePointLimit=codePointStart; 212 if(reorderStart>=codePointStart) { 213 return 0; 214 } 215 UChar32 c=*--codePointStart; 216 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) { 217 return 0; 218 } 219 220 UChar c2; 221 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) { 222 --codePointStart; 223 c=U16_GET_SUPPLEMENTARY(c2, c); 224 } 225 return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c)); 226 } 227 228 // Inserts c somewhere before the last character. 229 // Requires 0<cc<lastCC which implies reorderStart<limit. 
230 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) { 231 for(setIterator(), skipPrevious(); previousCC()>cc;) {} 232 // insert c at codePointLimit, after the character with prevCC<=cc 233 UChar *q=limit; 234 UChar *r=limit+=U16_LENGTH(c); 235 do { 236 *--r=*--q; 237 } while(codePointLimit!=q); 238 writeCodePoint(q, c); 239 if(cc<=1) { 240 reorderStart=r; 241 } 242 } 243 244 // Normalizer2Impl --------------------------------------------------------- *** 245 246 struct CanonIterData : public UMemory { 247 CanonIterData(UErrorCode &errorCode); 248 ~CanonIterData(); 249 void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode); 250 UTrie2 *trie; 251 UVector canonStartSets; // contains UnicodeSet * 252 }; 253 254 Normalizer2Impl::~Normalizer2Impl() { 255 udata_close(memory); 256 utrie2_close(normTrie); 257 UTrie2Singleton(fcdTrieSingleton).deleteInstance(); 258 delete (CanonIterData *)canonIterDataSingleton.fInstance; 259 } 260 261 UBool U_CALLCONV 262 Normalizer2Impl::isAcceptable(void *context, 263 const char * /* type */, const char * /*name*/, 264 const UDataInfo *pInfo) { 265 if( 266 pInfo->size>=20 && 267 pInfo->isBigEndian==U_IS_BIG_ENDIAN && 268 pInfo->charsetFamily==U_CHARSET_FAMILY && 269 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ 270 pInfo->dataFormat[1]==0x72 && 271 pInfo->dataFormat[2]==0x6d && 272 pInfo->dataFormat[3]==0x32 && 273 pInfo->formatVersion[0]==1 274 ) { 275 Normalizer2Impl *me=(Normalizer2Impl *)context; 276 uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4); 277 return TRUE; 278 } else { 279 return FALSE; 280 } 281 } 282 283 void 284 Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) { 285 if(U_FAILURE(errorCode)) { 286 return; 287 } 288 memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode); 289 if(U_FAILURE(errorCode)) { 290 return; 291 } 292 const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory); 293 const int32_t 
*inIndexes=(const int32_t *)inBytes; 294 int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4; 295 if(indexesLength<=IX_MIN_MAYBE_YES) { 296 errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes. 297 return; 298 } 299 300 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; 301 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; 302 303 minYesNo=inIndexes[IX_MIN_YES_NO]; 304 minNoNo=inIndexes[IX_MIN_NO_NO]; 305 limitNoNo=inIndexes[IX_LIMIT_NO_NO]; 306 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; 307 308 int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET]; 309 int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; 310 normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, 311 inBytes+offset, nextOffset-offset, NULL, 312 &errorCode); 313 if(U_FAILURE(errorCode)) { 314 return; 315 } 316 317 offset=nextOffset; 318 maybeYesCompositions=(const uint16_t *)(inBytes+offset); 319 extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes); 320 } 321 322 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const { 323 UChar32 c; 324 if(cpStart==(cpLimit-1)) { 325 c=*cpStart; 326 } else { 327 c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]); 328 } 329 uint16_t prevNorm16=getNorm16(c); 330 if(prevNorm16<=minYesNo) { 331 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0 332 } else { 333 return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo 334 } 335 } 336 337 U_CDECL_BEGIN 338 339 static UBool U_CALLCONV 340 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) { 341 /* add the start code point to the USet */ 342 const USetAdder *sa=(const USetAdder *)context; 343 sa->add(sa->set, start); 344 return TRUE; 345 } 346 347 static uint32_t U_CALLCONV 348 segmentStarterMapper(const void * /*context*/, uint32_t value) { 349 return value&CANON_NOT_SEGMENT_STARTER; 350 } 351 352 U_CDECL_END 353 354 void 355 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) 
const { 356 /* add the start code point of each same-value range of each trie */ 357 utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa); 358 359 /* add Hangul LV syllables and LV+1 because of skippables */ 360 for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) { 361 sa->add(sa->set, c); 362 sa->add(sa->set, c+1); 363 } 364 sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */ 365 } 366 367 void 368 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const { 369 /* add the start code point of each same-value range of the canonical iterator data trie */ 370 if(ensureCanonIterData(errorCode)) { 371 // currently only used for the SEGMENT_STARTER property 372 utrie2_enum(((CanonIterData *)canonIterDataSingleton.fInstance)->trie, 373 segmentStarterMapper, enumPropertyStartsRange, sa); 374 } 375 } 376 377 const UChar * 378 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src, 379 UChar32 minNeedDataCP, 380 ReorderingBuffer *buffer, 381 UErrorCode &errorCode) const { 382 // Make some effort to support NUL-terminated strings reasonably. 383 // Take the part of the fast quick check loop that does not look up 384 // data and check the first part of the string. 385 // After this prefix, determine the string length to simplify the rest 386 // of the code. 387 const UChar *prevSrc=src; 388 UChar c; 389 while((c=*src++)<minNeedDataCP && c!=0) {} 390 // Back out the last character for full processing. 391 // Copy this prefix. 
392 if(--src!=prevSrc) { 393 if(buffer!=NULL) { 394 buffer->appendZeroCC(prevSrc, src, errorCode); 395 } 396 } 397 return src; 398 } 399 400 // Dual functionality: 401 // buffer!=NULL: normalize 402 // buffer==NULL: isNormalized/spanQuickCheckYes 403 const UChar * 404 Normalizer2Impl::decompose(const UChar *src, const UChar *limit, 405 ReorderingBuffer *buffer, 406 UErrorCode &errorCode) const { 407 UChar32 minNoCP=minDecompNoCP; 408 if(limit==NULL) { 409 src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode); 410 if(U_FAILURE(errorCode)) { 411 return src; 412 } 413 limit=u_strchr(src, 0); 414 } 415 416 const UChar *prevSrc; 417 UChar32 c=0; 418 uint16_t norm16=0; 419 420 // only for quick check 421 const UChar *prevBoundary=src; 422 uint8_t prevCC=0; 423 424 for(;;) { 425 // count code units below the minimum or with irrelevant data for the quick check 426 for(prevSrc=src; src!=limit;) { 427 if( (c=*src)<minNoCP || 428 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c)) 429 ) { 430 ++src; 431 } else if(!U16_IS_SURROGATE(c)) { 432 break; 433 } else { 434 UChar c2; 435 if(U16_IS_SURROGATE_LEAD(c)) { 436 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 437 c=U16_GET_SUPPLEMENTARY(c, c2); 438 } 439 } else /* trail surrogate */ { 440 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { 441 --src; 442 c=U16_GET_SUPPLEMENTARY(c2, c); 443 } 444 } 445 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) { 446 src+=U16_LENGTH(c); 447 } else { 448 break; 449 } 450 } 451 } 452 // copy these code units all at once 453 if(src!=prevSrc) { 454 if(buffer!=NULL) { 455 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) { 456 break; 457 } 458 } else { 459 prevCC=0; 460 prevBoundary=src; 461 } 462 } 463 if(src==limit) { 464 break; 465 } 466 467 // Check one above-minimum, relevant code point. 
468 src+=U16_LENGTH(c); 469 if(buffer!=NULL) { 470 if(!decompose(c, norm16, *buffer, errorCode)) { 471 break; 472 } 473 } else { 474 if(isDecompYes(norm16)) { 475 uint8_t cc=getCCFromYesOrMaybe(norm16); 476 if(prevCC<=cc || cc==0) { 477 prevCC=cc; 478 if(cc<=1) { 479 prevBoundary=src; 480 } 481 continue; 482 } 483 } 484 return prevBoundary; // "no" or cc out of order 485 } 486 } 487 return src; 488 } 489 490 // Decompose a short piece of text which is likely to contain characters that 491 // fail the quick check loop and/or where the quick check loop's overhead 492 // is unlikely to be amortized. 493 // Called by the compose() and makeFCD() implementations. 494 UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit, 495 ReorderingBuffer &buffer, 496 UErrorCode &errorCode) const { 497 while(src<limit) { 498 UChar32 c; 499 uint16_t norm16; 500 UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16); 501 if(!decompose(c, norm16, buffer, errorCode)) { 502 return FALSE; 503 } 504 } 505 return TRUE; 506 } 507 508 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16, 509 ReorderingBuffer &buffer, 510 UErrorCode &errorCode) const { 511 // Only loops for 1:1 algorithmic mappings. 
512 for(;;) { 513 // get the decomposition and the lead and trail cc's 514 if(isDecompYes(norm16)) { 515 // c does not decompose 516 return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode); 517 } else if(isHangul(norm16)) { 518 // Hangul syllable: decompose algorithmically 519 UChar jamos[3]; 520 return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode); 521 } else if(isDecompNoAlgorithmic(norm16)) { 522 c=mapAlgorithmic(c, norm16); 523 norm16=getNorm16(c); 524 } else { 525 // c decomposes, get everything from the variable-length extra data 526 const uint16_t *mapping=getMapping(norm16); 527 uint16_t firstUnit=*mapping++; 528 int32_t length=firstUnit&MAPPING_LENGTH_MASK; 529 uint8_t leadCC, trailCC; 530 trailCC=(uint8_t)(firstUnit>>8); 531 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { 532 leadCC=(uint8_t)(*mapping++>>8); 533 } else { 534 leadCC=0; 535 } 536 return buffer.append((const UChar *)mapping, length, leadCC, trailCC, errorCode); 537 } 538 } 539 } 540 541 const UChar * 542 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const { 543 const UChar *decomp=NULL; 544 uint16_t norm16; 545 for(;;) { 546 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 547 // c does not decompose 548 return decomp; 549 } else if(isHangul(norm16)) { 550 // Hangul syllable: decompose algorithmically 551 length=Hangul::decompose(c, buffer); 552 return buffer; 553 } else if(isDecompNoAlgorithmic(norm16)) { 554 c=mapAlgorithmic(c, norm16); 555 decomp=buffer; 556 length=0; 557 U16_APPEND_UNSAFE(buffer, length, c); 558 } else { 559 // c decomposes, get everything from the variable-length extra data 560 const uint16_t *mapping=getMapping(norm16); 561 uint16_t firstUnit=*mapping++; 562 length=firstUnit&MAPPING_LENGTH_MASK; 563 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { 564 ++mapping; 565 } 566 return (const UChar *)mapping; 567 } 568 } 569 } 570 571 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit, 
572 UBool doDecompose, 573 UnicodeString &safeMiddle, 574 ReorderingBuffer &buffer, 575 UErrorCode &errorCode) const { 576 buffer.copyReorderableSuffixTo(safeMiddle); 577 if(doDecompose) { 578 decompose(src, limit, &buffer, errorCode); 579 return; 580 } 581 // Just merge the strings at the boundary. 582 ForwardUTrie2StringIterator iter(normTrie, src, limit); 583 uint8_t firstCC, prevCC, cc; 584 firstCC=prevCC=cc=getCC(iter.next16()); 585 while(cc!=0) { 586 prevCC=cc; 587 cc=getCC(iter.next16()); 588 }; 589 if(limit==NULL) { // appendZeroCC() needs limit!=NULL 590 limit=u_strchr(iter.codePointStart, 0); 591 } 592 buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode) && 593 buffer.appendZeroCC(iter.codePointStart, limit, errorCode); 594 } 595 596 // Note: hasDecompBoundary() could be implemented as aliases to 597 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter() 598 // at the cost of building the FCD trie for a decomposition normalizer. 599 UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const { 600 for(;;) { 601 if(c<minDecompNoCP) { 602 return TRUE; 603 } 604 uint16_t norm16=getNorm16(c); 605 if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) { 606 return TRUE; 607 } else if(norm16>MIN_NORMAL_MAYBE_YES) { 608 return FALSE; // ccc!=0 609 } else if(isDecompNoAlgorithmic(norm16)) { 610 c=mapAlgorithmic(c, norm16); 611 } else { 612 // c decomposes, get everything from the variable-length extra data 613 const uint16_t *mapping=getMapping(norm16); 614 uint16_t firstUnit=*mapping++; 615 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 616 return FALSE; 617 } 618 if(!before) { 619 // decomp after-boundary: same as hasFCDBoundaryAfter(), 620 // fcd16<=1 || trailCC==0 621 if(firstUnit>0x1ff) { 622 return FALSE; // trailCC>1 623 } 624 if(firstUnit<=0xff) { 625 return TRUE; // trailCC==0 626 } 627 // if(trailCC==1) test leadCC==0, same as checking for before-boundary 628 } 629 // TRUE if leadCC==0 (hasFCDBoundaryBefore()) 630 return 
(firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*mapping&0xff00)==0; 631 } 632 } 633 } 634 635 /* 636 * Finds the recomposition result for 637 * a forward-combining "lead" character, 638 * specified with a pointer to its compositions list, 639 * and a backward-combining "trail" character. 640 * 641 * If the lead and trail characters combine, then this function returns 642 * the following "compositeAndFwd" value: 643 * Bits 21..1 composite character 644 * Bit 0 set if the composite is a forward-combining starter 645 * otherwise it returns -1. 646 * 647 * The compositions list has (trail, compositeAndFwd) pair entries, 648 * encoded as either pairs or triples of 16-bit units. 649 * The last entry has the high bit of its first unit set. 650 * 651 * The list is sorted by ascending trail characters (there are no duplicates). 652 * A linear search is used. 653 * 654 * See normalizer2impl.h for a more detailed description 655 * of the compositions list format. 656 */ 657 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) { 658 uint16_t key1, firstUnit; 659 if(trail<COMP_1_TRAIL_LIMIT) { 660 // trail character is 0..33FF 661 // result entry may have 2 or 3 units 662 key1=(uint16_t)(trail<<1); 663 while(key1>(firstUnit=*list)) { 664 list+=2+(firstUnit&COMP_1_TRIPLE); 665 } 666 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 667 if(firstUnit&COMP_1_TRIPLE) { 668 return ((int32_t)list[1]<<16)|list[2]; 669 } else { 670 return list[1]; 671 } 672 } 673 } else { 674 // trail character is 3400..10FFFF 675 // result entry has 3 units 676 key1=(uint16_t)(COMP_1_TRAIL_LIMIT+ 677 (((trail>>COMP_1_TRAIL_SHIFT))& 678 ~COMP_1_TRIPLE)); 679 uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT); 680 uint16_t secondUnit; 681 for(;;) { 682 if(key1>(firstUnit=*list)) { 683 list+=2+(firstUnit&COMP_1_TRIPLE); 684 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 685 if(key2>(secondUnit=list[1])) { 686 if(firstUnit&COMP_1_LAST_TUPLE) { 687 break; 688 } else { 689 list+=3; 690 } 691 } else 
if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { 692 return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2]; 693 } else { 694 break; 695 } 696 } else { 697 break; 698 } 699 } 700 } 701 return -1; 702 } 703 704 /** 705 * @param list some character's compositions list 706 * @param set recursively receives the composites from these compositions 707 */ 708 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const { 709 uint16_t firstUnit; 710 int32_t compositeAndFwd; 711 do { 712 firstUnit=*list; 713 if((firstUnit&COMP_1_TRIPLE)==0) { 714 compositeAndFwd=list[1]; 715 list+=2; 716 } else { 717 compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2]; 718 list+=3; 719 } 720 UChar32 composite=compositeAndFwd>>1; 721 if((compositeAndFwd&1)!=0) { 722 addComposites(getCompositionsListForComposite(getNorm16(composite)), set); 723 } 724 set.add(composite); 725 } while((firstUnit&COMP_1_LAST_TUPLE)==0); 726 } 727 728 /* 729 * Recomposes the buffer text starting at recomposeStartIndex 730 * (which is in NFD - decomposed and canonically ordered), 731 * and truncates the buffer contents. 732 * 733 * Note that recomposition never lengthens the text: 734 * Any character consists of either one or two code units; 735 * a composition may contain at most one more code unit than the original starter, 736 * while the combining mark that is removed has at least one code unit. 
*/
void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
                                UBool onlyContiguous) const {
    // p is the read position; limit shrinks as combining marks are removed.
    UChar *p=buffer.getStart()+recomposeStartIndex;
    UChar *limit=buffer.getLimit();
    if(p==limit) {
        return;
    }

    UChar *starter, *pRemove, *q, *r;
    const uint16_t *compositionsList;
    UChar32 c, compositeAndFwd;
    uint16_t norm16;
    uint8_t cc, prevCC;
    UBool starterIsSupplementary;

    // Some of the following variables are not used until we have a forward-combining starter
    // and are only initialized now to avoid compiler warnings.
    compositionsList=NULL;  // used as indicator for whether we have a forward-combining starter
    starter=NULL;
    starterIsSupplementary=FALSE;
    prevCC=0;

    for(;;) {
        UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
        cc=getCCFromYesOrMaybe(norm16);
        if( // this character combines backward and
            isMaybe(norm16) &&
            // we have seen a starter that combines forward and
            compositionsList!=NULL &&
            // the backward-combining character is not blocked
            (prevCC<cc || prevCC==0)
        ) {
            if(isJamoVT(norm16)) {
                // c is a Jamo V/T, see if we can compose it with the previous character.
                if(c<Hangul::JAMO_T_BASE) {
                    // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
                    UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
                    if(prev<Hangul::JAMO_L_COUNT) {
                        pRemove=p-1;
                        UChar syllable=(UChar)
                            (Hangul::HANGUL_BASE+
                             (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
                             Hangul::JAMO_T_COUNT);
                        UChar t;
                        if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
                            ++p;
                            syllable+=t;  // The next character was a Jamo T.
                        }
                        *starter=syllable;
                        // remove the Jamo V/T
                        q=pRemove;
                        r=p;
                        while(r<limit) {
                            *q++=*r++;
                        }
                        limit=q;
                        p=pRemove;
                    }
                }
                /*
                 * No "else" for Jamo T:
                 * Since the input is in NFD, there are no Hangul LV syllables that
                 * a Jamo T could combine with.
                 * All Jamo Ts are combined above when handling Jamo Vs.
                 */
                if(p==limit) {
                    break;
                }
                compositionsList=NULL;
                continue;
            } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
                // The starter and the combining mark (c) do combine.
                UChar32 composite=compositeAndFwd>>1;

                // Replace the starter with the composite, remove the combining mark.
                pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark
                if(starterIsSupplementary) {
                    if(U_IS_SUPPLEMENTARY(composite)) {
                        // both are supplementary
                        starter[0]=U16_LEAD(composite);
                        starter[1]=U16_TRAIL(composite);
                    } else {
                        *starter=(UChar)composite;
                        // The composite is shorter than the starter,
                        // move the intermediate characters forward one.
                        starterIsSupplementary=FALSE;
                        q=starter+1;
                        r=q+1;
                        while(r<pRemove) {
                            *q++=*r++;
                        }
                        --pRemove;
                    }
                } else if(U_IS_SUPPLEMENTARY(composite)) {
                    // The composite is longer than the starter,
                    // move the intermediate characters back one.
                    starterIsSupplementary=TRUE;
                    ++starter;  // temporarily increment for the loop boundary
                    q=pRemove;
                    r=++pRemove;
                    while(starter<q) {
                        *--r=*--q;
                    }
                    *starter=U16_TRAIL(composite);
                    *--starter=U16_LEAD(composite);  // undo the temporary increment
                } else {
                    // both are on the BMP
                    *starter=(UChar)composite;
                }

                /* remove the combining mark by moving the following text over it */
                if(pRemove<p) {
                    q=pRemove;
                    r=p;
                    while(r<limit) {
                        *q++=*r++;
                    }
                    limit=q;
                    p=pRemove;
                }
                // Keep prevCC because we removed the combining mark.

                if(p==limit) {
                    break;
                }
                // Is the composite a starter that combines forward?
                if(compositeAndFwd&1) {
                    compositionsList=
                        getCompositionsListForComposite(getNorm16(composite));
                } else {
                    compositionsList=NULL;
                }

                // We combined; continue with looking for compositions.
                continue;
            }
        }

        // no combination this time
        prevCC=cc;
        if(p==limit) {
            break;
        }

        // If c did not combine, then check if it is a starter.
        if(cc==0) {
            // Found a new starter.
            if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
                // It may combine with something, prepare for it.
                // p is already past c, so step back by the length of c to find its start.
                if(U_IS_BMP(c)) {
                    starterIsSupplementary=FALSE;
                    starter=p-1;
                } else {
                    starterIsSupplementary=TRUE;
                    starter=p-2;
                }
            }
        } else if(onlyContiguous) {
            // FCC: no discontiguous compositions; any intervening character blocks.
            compositionsList=NULL;
        }
    }
    // Tell the buffer where the (possibly shortened) text now ends.
    buffer.setReorderingLimit(limit);
}

// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
// doCompose: normalize
// !doCompose: isNormalized (buffer must be empty and initialized)
//
// limit==NULL means that src is NUL-terminated.
// onlyContiguous is passed to recompose() and enables the extra FCC
// canonical-order check below.
// With !doCompose, returns FALSE as soon as the text is found not to be normalized.
// A failed buffer operation ends the loop and the function then returns TRUE,
// so callers presumably have to check errorCode as well
// -- NOTE(review): confirm against callers.
UBool
Normalizer2Impl::compose(const UChar *src, const UChar *limit,
                         UBool onlyContiguous,
                         UBool doCompose,
                         ReorderingBuffer &buffer,
                         UErrorCode &errorCode) const {
    /*
     * prevBoundary points to the last character before the current one
     * that has a composition boundary before it with ccc==0 and quick check "yes".
     * Keeping track of prevBoundary saves us looking for a composition boundary
     * when we find a "no" or "maybe".
     *
     * When we back out from prevSrc back to prevBoundary,
     * then we also remove those same characters (which had been simply copied
     * or canonically-order-inserted) from the ReorderingBuffer.
     * Therefore, at all times, the [prevBoundary..prevSrc[ source units
     * must correspond 1:1 to destination units at the end of the destination buffer.
     */
    const UChar *prevBoundary=src;
    UChar32 minNoMaybeCP=minCompNoMaybeCP;
    if(limit==NULL) {
        src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
                                           doCompose ? &buffer : NULL,
                                           errorCode);
        if(U_FAILURE(errorCode)) {
            return FALSE;
        }
        if(prevBoundary<src) {
            // Set prevBoundary to the last character in the prefix.
            prevBoundary=src-1;
        }
        limit=u_strchr(src, 0);
    }

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t norm16=0;

    // only for isNormalized
    uint8_t prevCC=0;

    for(;;) {
        // count code units below the minimum or with irrelevant data for the quick check
        for(prevSrc=src; src!=limit;) {
            if( (c=*src)<minNoMaybeCP ||
                isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
            ) {
                ++src;
            } else if(!U16_IS_SURROGATE(c)) {
                break;
            } else {
                // Assemble the supplementary code point if the surrogate is paired.
                UChar c2;
                if(U16_IS_SURROGATE_LEAD(c)) {
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                    }
                } else /* trail surrogate */ {
                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                        --src;
                        c=U16_GET_SUPPLEMENTARY(c2, c);
                    }
                }
                if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        // copy these code units all at once
        if(src!=prevSrc) {
            if(doCompose) {
                if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
                    break;
                }
            } else {
                prevCC=0;
            }
            if(src==limit) {
                break;
            }
            // Set prevBoundary to the last character in the quick check loop.
            prevBoundary=src-1;
            if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
                U16_IS_LEAD(*(prevBoundary-1))
            ) {
                --prevBoundary;
            }
            // The start of the current character (c).
            prevSrc=src;
        } else if(src==limit) {
            break;
        }

        src+=U16_LENGTH(c);
        /*
         * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
         * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
         * or has ccc!=0.
         * Check for Jamo V/T, then for regular characters.
         * c is not a Hangul syllable or Jamo L because those have "yes" properties.
         */
        if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
            UChar prev=*(prevSrc-1);
            UBool needToDecompose=FALSE;
            if(c<Hangul::JAMO_T_BASE) {
                // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
                prev=(UChar)(prev-Hangul::JAMO_L_BASE);
                if(prev<Hangul::JAMO_L_COUNT) {
                    if(!doCompose) {
                        return FALSE;
                    }
                    UChar syllable=(UChar)
                        (Hangul::HANGUL_BASE+
                         (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
                         Hangul::JAMO_T_COUNT);
                    UChar t;
                    if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
                        ++src;
                        syllable+=t;  // The next character was a Jamo T.
                        prevBoundary=src;
                        buffer.setLastChar(syllable);
                        continue;
                    }
                    // If we see L+V+x where x!=T then we drop to the slow path,
                    // decompose and recompose.
                    // This is to deal with NFKC finding normal L and V but a
                    // compatibility variant of a T. We need to either fully compose that
                    // combination here (which would complicate the code and may not work
                    // with strange custom data) or use the slow path -- or else our replacing
                    // two input characters (L+V) with one output character (LV syllable)
                    // would violate the invariant that [prevBoundary..prevSrc[ has the same
                    // length as what we appended to the buffer since prevBoundary.
                    needToDecompose=TRUE;
                }
            } else if(Hangul::isHangulWithoutJamoT(prev)) {
                // c is a Jamo Trailing consonant,
                // compose with previous Hangul LV that does not contain a Jamo T.
                if(!doCompose) {
                    return FALSE;
                }
                buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
                prevBoundary=src;
                continue;
            }
            if(!needToDecompose) {
                // The Jamo V/T did not compose into a Hangul syllable.
                if(doCompose) {
                    if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
                        break;
                    }
                } else {
                    prevCC=0;
                }
                continue;
            }
        }
        /*
         * Source buffer pointers:
         *
         *  all done      quick check   current char  not yet
         *                "yes" but     (c)           processed
         *                may combine
         *                forward
         * [-------------[-------------[-------------[-------------[
         * |             |             |             |             |
         * orig. src     prevBoundary  prevSrc       src           limit
         *
         *
         * Destination buffer pointers inside the ReorderingBuffer:
         *
         *  all done      might take    not filled yet
         *                characters for
         *                reordering
         * [-------------[-------------[-------------[
         * |             |             |             |
         * start         reorderStart  limit         |
         *                             +remainingCap.+
         */
        if(norm16>=MIN_YES_YES_WITH_CC) {
            uint8_t cc=(uint8_t)norm16;  // cc!=0
            if( onlyContiguous &&  // FCC
                (doCompose ? buffer.getLastCC() : prevCC)==0 &&
                prevBoundary<prevSrc &&
                // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
                // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
                // passed the quick check "yes && ccc==0" test.
                // Check whether the last character was a "yesYes" or a "yesNo".
                // If a "yesNo", then we get its trailing ccc from its
                // mapping and check for canonical order.
                // All other cases are ok.
                getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
            ) {
                // Fails FCD test, need to decompose and contiguously recompose.
                if(!doCompose) {
                    return FALSE;
                }
            } else if(doCompose) {
                if(!buffer.append(c, cc, errorCode)) {
                    break;
                }
                continue;
            } else if(prevCC<=cc) {
                prevCC=cc;
                continue;
            } else {
                return FALSE;
            }
        } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
            return FALSE;
        }

        /*
         * Find appropriate boundaries around this character,
         * decompose the source text from between the boundaries,
         * and recompose it.
         *
         * We may need to remove the last few characters from the ReorderingBuffer
         * to account for source text that was copied or appended
         * but needs to take part in the recomposition.
         */

        /*
         * Find the last composition boundary in [prevBoundary..src[.
         * It is either the decomposition of the current character (at prevSrc),
         * or prevBoundary.
         */
        if(hasCompBoundaryBefore(c, norm16)) {
            prevBoundary=prevSrc;
        } else if(doCompose) {
            buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
        }

        // Find the next composition boundary in [src..limit[ -
        // modifies src to point to the next starter.
        src=(UChar *)findNextCompBoundary(src, limit);

        // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
        int32_t recomposeStartIndex=buffer.length();
        if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
            break;
        }
        recompose(buffer, recomposeStartIndex, onlyContiguous);
        if(!doCompose) {
            // isNormalized: the recomposed text must be identical to the source segment.
            if(!buffer.equals(prevBoundary, src)) {
                return FALSE;
            }
            buffer.remove();
            prevCC=0;
        }

        // Move to the next starter. We never need to look back before this point again.
        prevBoundary=src;
    }
    return TRUE;
}

// Very similar to compose(): Make the same changes in both places if relevant.
1164 // pQCResult==NULL: spanQuickCheckYes 1165 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES) 1166 const UChar * 1167 Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit, 1168 UBool onlyContiguous, 1169 UNormalizationCheckResult *pQCResult) const { 1170 /* 1171 * prevBoundary points to the last character before the current one 1172 * that has a composition boundary before it with ccc==0 and quick check "yes". 1173 */ 1174 const UChar *prevBoundary=src; 1175 UChar32 minNoMaybeCP=minCompNoMaybeCP; 1176 if(limit==NULL) { 1177 UErrorCode errorCode=U_ZERO_ERROR; 1178 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode); 1179 if(prevBoundary<src) { 1180 // Set prevBoundary to the last character in the prefix. 1181 prevBoundary=src-1; 1182 } 1183 limit=u_strchr(src, 0); 1184 } 1185 1186 const UChar *prevSrc; 1187 UChar32 c=0; 1188 uint16_t norm16=0; 1189 uint8_t prevCC=0; 1190 1191 for(;;) { 1192 // count code units below the minimum or with irrelevant data for the quick check 1193 for(prevSrc=src;;) { 1194 if(src==limit) { 1195 return src; 1196 } 1197 if( (c=*src)<minNoMaybeCP || 1198 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c)) 1199 ) { 1200 ++src; 1201 } else if(!U16_IS_SURROGATE(c)) { 1202 break; 1203 } else { 1204 UChar c2; 1205 if(U16_IS_SURROGATE_LEAD(c)) { 1206 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 1207 c=U16_GET_SUPPLEMENTARY(c, c2); 1208 } 1209 } else /* trail surrogate */ { 1210 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { 1211 --src; 1212 c=U16_GET_SUPPLEMENTARY(c2, c); 1213 } 1214 } 1215 if(isCompYesAndZeroCC(norm16=getNorm16(c))) { 1216 src+=U16_LENGTH(c); 1217 } else { 1218 break; 1219 } 1220 } 1221 } 1222 if(src!=prevSrc) { 1223 // Set prevBoundary to the last character in the quick check loop. 
1224 prevBoundary=src-1; 1225 if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary && 1226 U16_IS_LEAD(*(prevBoundary-1)) 1227 ) { 1228 --prevBoundary; 1229 } 1230 prevCC=0; 1231 // The start of the current character (c). 1232 prevSrc=src; 1233 } 1234 1235 src+=U16_LENGTH(c); 1236 /* 1237 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1238 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) 1239 * or has ccc!=0. 1240 */ 1241 if(isMaybeOrNonZeroCC(norm16)) { 1242 uint8_t cc=getCCFromYesOrMaybe(norm16); 1243 if( onlyContiguous && // FCC 1244 cc!=0 && 1245 prevCC==0 && 1246 prevBoundary<prevSrc && 1247 // prevCC==0 && prevBoundary<prevSrc tell us that 1248 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) 1249 // passed the quick check "yes && ccc==0" test. 1250 // Check whether the last character was a "yesYes" or a "yesNo". 1251 // If a "yesNo", then we get its trailing ccc from its 1252 // mapping and check for canonical order. 1253 // All other cases are ok. 1254 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc 1255 ) { 1256 // Fails FCD test. 
1257 } else if(prevCC<=cc || cc==0) { 1258 prevCC=cc; 1259 if(norm16<MIN_YES_YES_WITH_CC) { 1260 if(pQCResult!=NULL) { 1261 *pQCResult=UNORM_MAYBE; 1262 } else { 1263 return prevBoundary; 1264 } 1265 } 1266 continue; 1267 } 1268 } 1269 if(pQCResult!=NULL) { 1270 *pQCResult=UNORM_NO; 1271 } 1272 return prevBoundary; 1273 } 1274 } 1275 1276 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit, 1277 UBool doCompose, 1278 UBool onlyContiguous, 1279 UnicodeString &safeMiddle, 1280 ReorderingBuffer &buffer, 1281 UErrorCode &errorCode) const { 1282 if(!buffer.isEmpty()) { 1283 const UChar *firstStarterInSrc=findNextCompBoundary(src, limit); 1284 if(src!=firstStarterInSrc) { 1285 const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(), 1286 buffer.getLimit()); 1287 int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest); 1288 UnicodeString middle(lastStarterInDest, destSuffixLength); 1289 buffer.removeSuffix(destSuffixLength); 1290 safeMiddle=middle; 1291 middle.append(src, (int32_t)(firstStarterInSrc-src)); 1292 const UChar *middleStart=middle.getBuffer(); 1293 compose(middleStart, middleStart+middle.length(), onlyContiguous, 1294 TRUE, buffer, errorCode); 1295 if(U_FAILURE(errorCode)) { 1296 return; 1297 } 1298 src=firstStarterInSrc; 1299 } 1300 } 1301 if(doCompose) { 1302 compose(src, limit, onlyContiguous, TRUE, buffer, errorCode); 1303 } else { 1304 if(limit==NULL) { // appendZeroCC() needs limit!=NULL 1305 limit=u_strchr(src, 0); 1306 } 1307 buffer.appendZeroCC(src, limit, errorCode); 1308 } 1309 } 1310 1311 /** 1312 * Does c have a composition boundary before it? 1313 * True if its decomposition begins with a character that has 1314 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). 1315 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes 1316 * (isCompYesAndZeroCC()) so we need not decompose. 
1317 */ 1318 UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const { 1319 for(;;) { 1320 if(isCompYesAndZeroCC(norm16)) { 1321 return TRUE; 1322 } else if(isMaybeOrNonZeroCC(norm16)) { 1323 return FALSE; 1324 } else if(isDecompNoAlgorithmic(norm16)) { 1325 c=mapAlgorithmic(c, norm16); 1326 norm16=getNorm16(c); 1327 } else { 1328 // c decomposes, get everything from the variable-length extra data 1329 const uint16_t *mapping=getMapping(norm16); 1330 uint16_t firstUnit=*mapping++; 1331 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 1332 return FALSE; 1333 } 1334 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*mapping++&0xff00)) { 1335 return FALSE; // non-zero leadCC 1336 } 1337 int32_t i=0; 1338 UChar32 c; 1339 U16_NEXT_UNSAFE(mapping, i, c); 1340 return isCompYesAndZeroCC(getNorm16(c)); 1341 } 1342 } 1343 } 1344 1345 UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const { 1346 for(;;) { 1347 uint16_t norm16=getNorm16(c); 1348 if(isInert(norm16)) { 1349 return TRUE; 1350 } else if(norm16<=minYesNo) { 1351 // Hangul LVT (==minYesNo) has a boundary after it. 1352 // Hangul LV and non-inert yesYes characters combine forward. 1353 return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c); 1354 } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) { 1355 return FALSE; 1356 } else if(isDecompNoAlgorithmic(norm16)) { 1357 c=mapAlgorithmic(c, norm16); 1358 } else { 1359 // c decomposes, get everything from the variable-length extra data. 1360 // If testInert, then c must be a yesNo character which has lccc=0, 1361 // otherwise it could be a noNo. 
1362 const uint16_t *mapping=getMapping(norm16); 1363 uint16_t firstUnit=*mapping; 1364 // TRUE if 1365 // c is not deleted, and 1366 // it and its decomposition do not combine forward, and it has a starter, and 1367 // if FCC then trailCC<=1 1368 return 1369 (firstUnit&MAPPING_LENGTH_MASK)!=0 && 1370 (firstUnit&(MAPPING_PLUS_COMPOSITION_LIST|MAPPING_NO_COMP_BOUNDARY_AFTER))==0 && 1371 (!onlyContiguous || firstUnit<=0x1ff); 1372 } 1373 } 1374 } 1375 1376 const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const { 1377 BackwardUTrie2StringIterator iter(normTrie, start, p); 1378 uint16_t norm16; 1379 do { 1380 norm16=iter.previous16(); 1381 } while(!hasCompBoundaryBefore(iter.codePoint, norm16)); 1382 // We could also test hasCompBoundaryAfter() and return iter.codePointLimit, 1383 // but that's probably not worth the extra cost. 1384 return iter.codePointStart; 1385 } 1386 1387 const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const { 1388 ForwardUTrie2StringIterator iter(normTrie, p, limit); 1389 uint16_t norm16; 1390 do { 1391 norm16=iter.next16(); 1392 } while(!hasCompBoundaryBefore(iter.codePoint, norm16)); 1393 return iter.codePointStart; 1394 } 1395 1396 class FCDTrieSingleton : public UTrie2Singleton { 1397 public: 1398 FCDTrieSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) : 1399 UTrie2Singleton(s), impl(ni), errorCode(ec) {} 1400 UTrie2 *getInstance(UErrorCode &errorCode) { 1401 return UTrie2Singleton::getInstance(createInstance, this, errorCode); 1402 } 1403 static void *createInstance(const void *context, UErrorCode &errorCode); 1404 UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 1405 if(value!=0) { 1406 impl.setFCD16FromNorm16(start, end, (uint16_t)value, newFCDTrie, errorCode); 1407 } 1408 return U_SUCCESS(errorCode); 1409 } 1410 1411 Normalizer2Impl &impl; 1412 UTrie2 *newFCDTrie; 1413 UErrorCode &errorCode; 1414 }; 1415 1416 U_CDECL_BEGIN 
1417 1418 // Set the FCD value for a range of same-norm16 characters. 1419 static UBool U_CALLCONV 1420 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { 1421 return ((FCDTrieSingleton *)context)->rangeHandler(start, end, value); 1422 } 1423 1424 // Collect (OR together) the FCD values for a range of supplementary characters, 1425 // for their lead surrogate code unit. 1426 static UBool U_CALLCONV 1427 enumRangeOrValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) { 1428 *((uint32_t *)context)|=value; 1429 return TRUE; 1430 } 1431 1432 U_CDECL_END 1433 1434 void *FCDTrieSingleton::createInstance(const void *context, UErrorCode &errorCode) { 1435 FCDTrieSingleton *me=(FCDTrieSingleton *)context; 1436 me->newFCDTrie=utrie2_open(0, 0, &errorCode); 1437 if(U_SUCCESS(errorCode)) { 1438 utrie2_enum(me->impl.getNormTrie(), NULL, enumRangeHandler, me); 1439 for(UChar lead=0xd800; lead<0xdc00; ++lead) { 1440 uint32_t oredValue=utrie2_get32(me->newFCDTrie, lead); 1441 utrie2_enumForLeadSurrogate(me->newFCDTrie, lead, NULL, enumRangeOrValue, &oredValue); 1442 if(oredValue!=0) { 1443 // Set a "bad" value for makeFCD() to break the quick check loop 1444 // and look up the value for the supplementary code point. 1445 // If there is any lccc, then set the worst-case lccc of 1. 1446 // The ORed-together value's tccc is already the worst case. 1447 if(oredValue>0xff) { 1448 oredValue=0x100|(oredValue&0xff); 1449 } 1450 utrie2_set32ForLeadSurrogateCodeUnit(me->newFCDTrie, lead, oredValue, &errorCode); 1451 } 1452 } 1453 utrie2_freeze(me->newFCDTrie, UTRIE2_16_VALUE_BITS, &errorCode); 1454 if(U_SUCCESS(errorCode)) { 1455 return me->newFCDTrie; 1456 } 1457 } 1458 utrie2_close(me->newFCDTrie); 1459 return NULL; 1460 } 1461 1462 void Normalizer2Impl::setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16, 1463 UTrie2 *newFCDTrie, UErrorCode &errorCode) const { 1464 // Only loops for 1:1 algorithmic mappings. 
1465 for(;;) { 1466 if(norm16>=MIN_NORMAL_MAYBE_YES) { 1467 norm16&=0xff; 1468 norm16|=norm16<<8; 1469 } else if(norm16<=minYesNo || minMaybeYes<=norm16) { 1470 // no decomposition or Hangul syllable, all zeros 1471 break; 1472 } else if(limitNoNo<=norm16) { 1473 int32_t delta=norm16-(minMaybeYes-MAX_DELTA-1); 1474 if(start==end) { 1475 start+=delta; 1476 norm16=getNorm16(start); 1477 } else { 1478 // the same delta leads from different original characters to different mappings 1479 do { 1480 UChar32 c=start+delta; 1481 setFCD16FromNorm16(c, c, getNorm16(c), newFCDTrie, errorCode); 1482 } while(++start<=end); 1483 break; 1484 } 1485 } else { 1486 // c decomposes, get everything from the variable-length extra data 1487 const uint16_t *mapping=getMapping(norm16); 1488 uint16_t firstUnit=*mapping; 1489 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 1490 // A character that is deleted (maps to an empty string) must 1491 // get the worst-case lccc and tccc values because arbitrary 1492 // characters on both sides will become adjacent. 1493 norm16=0x1ff; 1494 } else { 1495 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { 1496 norm16=mapping[1]&0xff00; // lccc 1497 } else { 1498 norm16=0; 1499 } 1500 norm16|=firstUnit>>8; // tccc 1501 } 1502 } 1503 utrie2_setRange32(newFCDTrie, start, end, norm16, TRUE, &errorCode); 1504 break; 1505 } 1506 } 1507 1508 const UTrie2 *Normalizer2Impl::getFCDTrie(UErrorCode &errorCode) const { 1509 // Logically const: Synchronized instantiation. 
1510 Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this); 1511 return FCDTrieSingleton(me->fcdTrieSingleton, *me, errorCode).getInstance(errorCode); 1512 } 1513 1514 // Dual functionality: 1515 // buffer!=NULL: normalize 1516 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 1517 const UChar * 1518 Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit, 1519 ReorderingBuffer *buffer, 1520 UErrorCode &errorCode) const { 1521 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. 1522 // Similar to the prevBoundary in the compose() implementation. 1523 const UChar *prevBoundary=src; 1524 int32_t prevFCD16=0; 1525 if(limit==NULL) { 1526 src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode); 1527 if(U_FAILURE(errorCode)) { 1528 return src; 1529 } 1530 if(prevBoundary<src) { 1531 prevBoundary=src; 1532 // We know that the previous character's lccc==0. 1533 // Fetching the fcd16 value was deferred for this below-U+0300 code point. 1534 prevFCD16=getFCD16FromSingleLead(*(src-1)); 1535 if(prevFCD16>1) { 1536 --prevBoundary; 1537 } 1538 } 1539 limit=u_strchr(src, 0); 1540 } 1541 1542 // Note: In this function we use buffer->appendZeroCC() because we track 1543 // the lead and trail combining classes here, rather than leaving it to 1544 // the ReorderingBuffer. 1545 // The exception is the call to decomposeShort() which uses the buffer 1546 // in the normal way. 
1547 1548 const UTrie2 *trie=fcdTrie(); 1549 1550 const UChar *prevSrc; 1551 UChar32 c=0; 1552 uint16_t fcd16=0; 1553 1554 for(;;) { 1555 // count code units with lccc==0 1556 for(prevSrc=src; src!=limit;) { 1557 if((c=*src)<MIN_CCC_LCCC_CP) { 1558 prevFCD16=~c; 1559 ++src; 1560 } else if((fcd16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, c))<=0xff) { 1561 prevFCD16=fcd16; 1562 ++src; 1563 } else if(!U16_IS_SURROGATE(c)) { 1564 break; 1565 } else { 1566 UChar c2; 1567 if(U16_IS_SURROGATE_LEAD(c)) { 1568 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 1569 c=U16_GET_SUPPLEMENTARY(c, c2); 1570 } 1571 } else /* trail surrogate */ { 1572 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { 1573 --src; 1574 c=U16_GET_SUPPLEMENTARY(c2, c); 1575 } 1576 } 1577 if((fcd16=getFCD16(c))<=0xff) { 1578 prevFCD16=fcd16; 1579 src+=U16_LENGTH(c); 1580 } else { 1581 break; 1582 } 1583 } 1584 } 1585 // copy these code units all at once 1586 if(src!=prevSrc) { 1587 if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) { 1588 break; 1589 } 1590 if(src==limit) { 1591 break; 1592 } 1593 prevBoundary=src; 1594 // We know that the previous character's lccc==0. 1595 if(prevFCD16<0) { 1596 // Fetching the fcd16 value was deferred for this below-U+0300 code point. 1597 prevFCD16=getFCD16FromSingleLead((UChar)~prevFCD16); 1598 if(prevFCD16>1) { 1599 --prevBoundary; 1600 } 1601 } else { 1602 const UChar *p=src-1; 1603 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) { 1604 --p; 1605 // Need to fetch the previous character's FCD value because 1606 // prevFCD16 was just for the trail surrogate code point. 1607 prevFCD16=getFCD16FromSurrogatePair(p[0], p[1]); 1608 // Still known to have lccc==0 because its lead surrogate unit had lccc==0. 1609 } 1610 if(prevFCD16>1) { 1611 prevBoundary=p; 1612 } 1613 } 1614 // The start of the current character (c). 
1615 prevSrc=src; 1616 } else if(src==limit) { 1617 break; 1618 } 1619 1620 src+=U16_LENGTH(c); 1621 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. 1622 // Check for proper order, and decompose locally if necessary. 1623 if((prevFCD16&0xff)<=(fcd16>>8)) { 1624 // proper order: prev tccc <= current lccc 1625 if((fcd16&0xff)<=1) { 1626 prevBoundary=src; 1627 } 1628 if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) { 1629 break; 1630 } 1631 prevFCD16=fcd16; 1632 continue; 1633 } else if(buffer==NULL) { 1634 return prevBoundary; // quick check "no" 1635 } else { 1636 /* 1637 * Back out the part of the source that we copied or appended 1638 * already but is now going to be decomposed. 1639 * prevSrc is set to after what was copied/appended. 1640 */ 1641 buffer->removeSuffix((int32_t)(prevSrc-prevBoundary)); 1642 /* 1643 * Find the part of the source that needs to be decomposed, 1644 * up to the next safe boundary. 1645 */ 1646 src=findNextFCDBoundary(src, limit); 1647 /* 1648 * The source text does not fulfill the conditions for FCD. 1649 * Decompose and reorder a limited piece of the text. 
1650 */ 1651 if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) { 1652 break; 1653 } 1654 prevBoundary=src; 1655 prevFCD16=0; 1656 } 1657 } 1658 return src; 1659 } 1660 1661 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit, 1662 UBool doMakeFCD, 1663 UnicodeString &safeMiddle, 1664 ReorderingBuffer &buffer, 1665 UErrorCode &errorCode) const { 1666 if(!buffer.isEmpty()) { 1667 const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit); 1668 if(src!=firstBoundaryInSrc) { 1669 const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(), 1670 buffer.getLimit()); 1671 int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest); 1672 UnicodeString middle(lastBoundaryInDest, destSuffixLength); 1673 buffer.removeSuffix(destSuffixLength); 1674 safeMiddle=middle; 1675 middle.append(src, (int32_t)(firstBoundaryInSrc-src)); 1676 const UChar *middleStart=middle.getBuffer(); 1677 makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode); 1678 if(U_FAILURE(errorCode)) { 1679 return; 1680 } 1681 src=firstBoundaryInSrc; 1682 } 1683 } 1684 if(doMakeFCD) { 1685 makeFCD(src, limit, &buffer, errorCode); 1686 } else { 1687 if(limit==NULL) { // appendZeroCC() needs limit!=NULL 1688 limit=u_strchr(src, 0); 1689 } 1690 buffer.appendZeroCC(src, limit, errorCode); 1691 } 1692 } 1693 1694 const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const { 1695 BackwardUTrie2StringIterator iter(fcdTrie(), start, p); 1696 uint16_t fcd16; 1697 do { 1698 fcd16=iter.previous16(); 1699 } while(fcd16>0xff); 1700 return iter.codePointStart; 1701 } 1702 1703 const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const { 1704 ForwardUTrie2StringIterator iter(fcdTrie(), p, limit); 1705 uint16_t fcd16; 1706 do { 1707 fcd16=iter.next16(); 1708 } while(fcd16>0xff); 1709 return iter.codePointStart; 1710 } 1711 1712 // CanonicalIterator data 
-------------------------------------------------- *** 1713 1714 CanonIterData::CanonIterData(UErrorCode &errorCode) : 1715 trie(utrie2_open(0, 0, &errorCode)), 1716 canonStartSets(uhash_deleteUObject, NULL, errorCode) {} 1717 1718 CanonIterData::~CanonIterData() { 1719 utrie2_close(trie); 1720 } 1721 1722 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) { 1723 uint32_t canonValue=utrie2_get32(trie, decompLead); 1724 if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) { 1725 // origin is the first character whose decomposition starts with 1726 // the character for which we are setting the value. 1727 utrie2_set32(trie, decompLead, canonValue|origin, &errorCode); 1728 } else { 1729 // origin is not the first character, or it is U+0000. 1730 UnicodeSet *set; 1731 if((canonValue&CANON_HAS_SET)==0) { 1732 set=new UnicodeSet; 1733 if(set==NULL) { 1734 errorCode=U_MEMORY_ALLOCATION_ERROR; 1735 return; 1736 } 1737 UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK); 1738 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size(); 1739 utrie2_set32(trie, decompLead, canonValue, &errorCode); 1740 canonStartSets.addElement(set, errorCode); 1741 if(firstOrigin!=0) { 1742 set->add(firstOrigin); 1743 } 1744 } else { 1745 set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)]; 1746 } 1747 set->add(origin); 1748 } 1749 } 1750 1751 class CanonIterDataSingleton { 1752 public: 1753 CanonIterDataSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) : 1754 singleton(s), impl(ni), errorCode(ec) {} 1755 CanonIterData *getInstance(UErrorCode &errorCode) { 1756 void *duplicate; 1757 CanonIterData *instance= 1758 (CanonIterData *)singleton.getInstance(createInstance, this, duplicate, errorCode); 1759 delete (CanonIterData *)duplicate; 1760 return instance; 1761 } 1762 static void *createInstance(const void *context, UErrorCode &errorCode); 1763 UBool 
rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 1764 if(value!=0) { 1765 impl.makeCanonIterDataFromNorm16(start, end, (uint16_t)value, *newData, errorCode); 1766 } 1767 return U_SUCCESS(errorCode); 1768 } 1769 1770 private: 1771 SimpleSingleton &singleton; 1772 Normalizer2Impl &impl; 1773 CanonIterData *newData; 1774 UErrorCode &errorCode; 1775 }; 1776 1777 U_CDECL_BEGIN 1778 1779 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters. 1780 static UBool U_CALLCONV 1781 enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { 1782 return ((CanonIterDataSingleton *)context)->rangeHandler(start, end, value); 1783 } 1784 1785 U_CDECL_END 1786 1787 void *CanonIterDataSingleton::createInstance(const void *context, UErrorCode &errorCode) { 1788 CanonIterDataSingleton *me=(CanonIterDataSingleton *)context; 1789 me->newData=new CanonIterData(errorCode); 1790 if(me->newData==NULL) { 1791 errorCode=U_MEMORY_ALLOCATION_ERROR; 1792 return NULL; 1793 } 1794 if(U_SUCCESS(errorCode)) { 1795 utrie2_enum(me->impl.getNormTrie(), NULL, enumCIDRangeHandler, me); 1796 utrie2_freeze(me->newData->trie, UTRIE2_32_VALUE_BITS, &errorCode); 1797 if(U_SUCCESS(errorCode)) { 1798 return me->newData; 1799 } 1800 } 1801 delete me->newData; 1802 return NULL; 1803 } 1804 1805 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16, 1806 CanonIterData &newData, 1807 UErrorCode &errorCode) const { 1808 if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) { 1809 // Inert, or 2-way mapping (including Hangul syllable). 1810 // We do not write a canonStartSet for any yesNo character. 1811 // Composites from 2-way mappings are added at runtime from the 1812 // starter's compositions list, and the other characters in 1813 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are 1814 // "maybe" characters. 
1815 return; 1816 } 1817 for(UChar32 c=start; c<=end; ++c) { 1818 uint32_t oldValue=utrie2_get32(newData.trie, c); 1819 uint32_t newValue=oldValue; 1820 if(norm16>=minMaybeYes) { 1821 // not a segment starter if it occurs in a decomposition or has cc!=0 1822 newValue|=CANON_NOT_SEGMENT_STARTER; 1823 if(norm16<MIN_NORMAL_MAYBE_YES) { 1824 newValue|=CANON_HAS_COMPOSITIONS; 1825 } 1826 } else if(norm16<minYesNo) { 1827 newValue|=CANON_HAS_COMPOSITIONS; 1828 } else { 1829 // c has a one-way decomposition 1830 UChar32 c2=c; 1831 uint16_t norm16_2=norm16; 1832 while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) { 1833 c2=mapAlgorithmic(c2, norm16_2); 1834 norm16_2=getNorm16(c2); 1835 } 1836 if(minYesNo<=norm16_2 && norm16_2<limitNoNo) { 1837 // c decomposes, get everything from the variable-length extra data 1838 const uint16_t *mapping=getMapping(norm16_2); 1839 uint16_t firstUnit=*mapping++; 1840 int32_t length=firstUnit&MAPPING_LENGTH_MASK; 1841 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1842 if(c==c2 && (*mapping&0xff)!=0) { 1843 newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0 1844 } 1845 ++mapping; 1846 } 1847 // Skip empty mappings (no characters in the decomposition). 1848 if(length!=0) { 1849 // add c to first code point's start set 1850 int32_t i=0; 1851 U16_NEXT_UNSAFE(mapping, i, c2); 1852 newData.addToStartSet(c, c2, errorCode); 1853 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a 1854 // one-way mapping. A 2-way mapping is possible here after 1855 // intermediate algorithmic mapping. 
1856 if(norm16_2>=minNoNo) { 1857 while(i<length) { 1858 U16_NEXT_UNSAFE(mapping, i, c2); 1859 uint32_t c2Value=utrie2_get32(newData.trie, c2); 1860 if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) { 1861 utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER, 1862 &errorCode); 1863 } 1864 } 1865 } 1866 } 1867 } else { 1868 // c decomposed to c2 algorithmically; c has cc==0 1869 newData.addToStartSet(c, c2, errorCode); 1870 } 1871 } 1872 if(newValue!=oldValue) { 1873 utrie2_set32(newData.trie, c, newValue, &errorCode); 1874 } 1875 } 1876 } 1877 1878 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const { 1879 // Logically const: Synchronized instantiation. 1880 Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this); 1881 CanonIterDataSingleton(me->canonIterDataSingleton, *me, errorCode).getInstance(errorCode); 1882 return U_SUCCESS(errorCode); 1883 } 1884 1885 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const { 1886 return (int32_t)utrie2_get32(((CanonIterData *)canonIterDataSingleton.fInstance)->trie, c); 1887 } 1888 1889 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const { 1890 return *(const UnicodeSet *)( 1891 ((CanonIterData *)canonIterDataSingleton.fInstance)->canonStartSets[n]); 1892 } 1893 1894 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const { 1895 return getCanonValue(c)>=0; 1896 } 1897 1898 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const { 1899 int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER; 1900 if(canonValue==0) { 1901 return FALSE; 1902 } 1903 set.clear(); 1904 int32_t value=canonValue&CANON_VALUE_MASK; 1905 if((canonValue&CANON_HAS_SET)!=0) { 1906 set.addAll(getCanonStartSet(value)); 1907 } else if(value!=0) { 1908 set.add(value); 1909 } 1910 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) { 1911 uint16_t norm16=getNorm16(c); 1912 if(norm16==JAMO_L) { 1913 UChar32 syllable= 1914 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT); 
1915 set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1); 1916 } else { 1917 addComposites(getCompositionsList(norm16), set); 1918 } 1919 } 1920 return TRUE; 1921 } 1922 1923 U_NAMESPACE_END 1924 1925 // Normalizer2 data swapping ----------------------------------------------- *** 1926 1927 U_NAMESPACE_USE 1928 1929 U_CAPI int32_t U_EXPORT2 1930 unorm2_swap(const UDataSwapper *ds, 1931 const void *inData, int32_t length, void *outData, 1932 UErrorCode *pErrorCode) { 1933 const UDataInfo *pInfo; 1934 int32_t headerSize; 1935 1936 const uint8_t *inBytes; 1937 uint8_t *outBytes; 1938 1939 const int32_t *inIndexes; 1940 int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1]; 1941 1942 int32_t i, offset, nextOffset, size; 1943 1944 /* udata_swapDataHeader checks the arguments */ 1945 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 1946 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 1947 return 0; 1948 } 1949 1950 /* check data format and format version */ 1951 pInfo=(const UDataInfo *)((const char *)inData+4); 1952 if(!( 1953 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ 1954 pInfo->dataFormat[1]==0x72 && 1955 pInfo->dataFormat[2]==0x6d && 1956 pInfo->dataFormat[3]==0x32 && 1957 pInfo->formatVersion[0]==1 1958 )) { 1959 udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n", 1960 pInfo->dataFormat[0], pInfo->dataFormat[1], 1961 pInfo->dataFormat[2], pInfo->dataFormat[3], 1962 pInfo->formatVersion[0]); 1963 *pErrorCode=U_UNSUPPORTED_ERROR; 1964 return 0; 1965 } 1966 1967 inBytes=(const uint8_t *)inData+headerSize; 1968 outBytes=(uint8_t *)outData+headerSize; 1969 1970 inIndexes=(const int32_t *)inBytes; 1971 1972 if(length>=0) { 1973 length-=headerSize; 1974 if(length<(int32_t)sizeof(indexes)) { 1975 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n", 1976 length); 1977 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1978 return 
0; 1979 } 1980 } 1981 1982 /* read the first few indexes */ 1983 for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) { 1984 indexes[i]=udata_readInt32(ds, inIndexes[i]); 1985 } 1986 1987 /* get the total length of the data */ 1988 size=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; 1989 1990 if(length>=0) { 1991 if(length<size) { 1992 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n", 1993 length); 1994 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1995 return 0; 1996 } 1997 1998 /* copy the data for inaccessible bytes */ 1999 if(inBytes!=outBytes) { 2000 uprv_memcpy(outBytes, inBytes, size); 2001 } 2002 2003 offset=0; 2004 2005 /* swap the int32_t indexes[] */ 2006 nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]; 2007 ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode); 2008 offset=nextOffset; 2009 2010 /* swap the UTrie2 */ 2011 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]; 2012 utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); 2013 offset=nextOffset; 2014 2015 /* swap the uint16_t extraData[] */ 2016 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET+1]; 2017 ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); 2018 offset=nextOffset; 2019 2020 U_ASSERT(offset==size); 2021 } 2022 2023 return headerSize+size; 2024 } 2025 2026 #endif // !UCONFIG_NO_NORMALIZATION 2027