1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2009-2014, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: normalizer2impl.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2009nov22 14 * created by: Markus W. Scherer 15 */ 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_NORMALIZATION 20 21 #include "unicode/normalizer2.h" 22 #include "unicode/udata.h" 23 #include "unicode/ustring.h" 24 #include "unicode/utf16.h" 25 #include "cmemory.h" 26 #include "mutex.h" 27 #include "normalizer2impl.h" 28 #include "putilimp.h" 29 #include "uassert.h" 30 #include "uset_imp.h" 31 #include "utrie2.h" 32 #include "uvector.h" 33 34 U_NAMESPACE_BEGIN 35 36 // ReorderingBuffer -------------------------------------------------------- *** 37 38 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) { 39 int32_t length=str.length(); 40 start=str.getBuffer(destCapacity); 41 if(start==NULL) { 42 // getBuffer() already did str.setToBogus() 43 errorCode=U_MEMORY_ALLOCATION_ERROR; 44 return FALSE; 45 } 46 limit=start+length; 47 remainingCapacity=str.getCapacity()-length; 48 reorderStart=start; 49 if(start==limit) { 50 lastCC=0; 51 } else { 52 setIterator(); 53 lastCC=previousCC(); 54 // Set reorderStart after the last code point with cc<=1 if there is one. 
55 if(lastCC>1) { 56 while(previousCC()>1) {} 57 } 58 reorderStart=codePointLimit; 59 } 60 return TRUE; 61 } 62 63 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const { 64 int32_t length=(int32_t)(limit-start); 65 return 66 length==(int32_t)(otherLimit-otherStart) && 67 0==u_memcmp(start, otherStart, length); 68 } 69 70 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) { 71 if(remainingCapacity<2 && !resize(2, errorCode)) { 72 return FALSE; 73 } 74 if(lastCC<=cc || cc==0) { 75 limit[0]=U16_LEAD(c); 76 limit[1]=U16_TRAIL(c); 77 limit+=2; 78 lastCC=cc; 79 if(cc<=1) { 80 reorderStart=limit; 81 } 82 } else { 83 insert(c, cc); 84 } 85 remainingCapacity-=2; 86 return TRUE; 87 } 88 89 UBool ReorderingBuffer::append(const UChar *s, int32_t length, 90 uint8_t leadCC, uint8_t trailCC, 91 UErrorCode &errorCode) { 92 if(length==0) { 93 return TRUE; 94 } 95 if(remainingCapacity<length && !resize(length, errorCode)) { 96 return FALSE; 97 } 98 remainingCapacity-=length; 99 if(lastCC<=leadCC || leadCC==0) { 100 if(trailCC<=1) { 101 reorderStart=limit+length; 102 } else if(leadCC<=1) { 103 reorderStart=limit+1; // Ok if not a code point boundary. 104 } 105 const UChar *sLimit=s+length; 106 do { *limit++=*s++; } while(s!=sLimit); 107 lastCC=trailCC; 108 } else { 109 int32_t i=0; 110 UChar32 c; 111 U16_NEXT(s, i, length, c); 112 insert(c, leadCC); // insert first code point 113 while(i<length) { 114 U16_NEXT(s, i, length, c); 115 if(i<length) { 116 // s must be in NFD, otherwise we need to use getCC(). 
117 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c)); 118 } else { 119 leadCC=trailCC; 120 } 121 append(c, leadCC, errorCode); 122 } 123 } 124 return TRUE; 125 } 126 127 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) { 128 int32_t cpLength=U16_LENGTH(c); 129 if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) { 130 return FALSE; 131 } 132 remainingCapacity-=cpLength; 133 if(cpLength==1) { 134 *limit++=(UChar)c; 135 } else { 136 limit[0]=U16_LEAD(c); 137 limit[1]=U16_TRAIL(c); 138 limit+=2; 139 } 140 lastCC=0; 141 reorderStart=limit; 142 return TRUE; 143 } 144 145 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) { 146 if(s==sLimit) { 147 return TRUE; 148 } 149 int32_t length=(int32_t)(sLimit-s); 150 if(remainingCapacity<length && !resize(length, errorCode)) { 151 return FALSE; 152 } 153 u_memcpy(limit, s, length); 154 limit+=length; 155 remainingCapacity-=length; 156 lastCC=0; 157 reorderStart=limit; 158 return TRUE; 159 } 160 161 void ReorderingBuffer::remove() { 162 reorderStart=limit=start; 163 remainingCapacity=str.getCapacity(); 164 lastCC=0; 165 } 166 167 void ReorderingBuffer::removeSuffix(int32_t suffixLength) { 168 if(suffixLength<(limit-start)) { 169 limit-=suffixLength; 170 remainingCapacity+=suffixLength; 171 } else { 172 limit=start; 173 remainingCapacity=str.getCapacity(); 174 } 175 lastCC=0; 176 reorderStart=limit; 177 } 178 179 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) { 180 int32_t reorderStartIndex=(int32_t)(reorderStart-start); 181 int32_t length=(int32_t)(limit-start); 182 str.releaseBuffer(length); 183 int32_t newCapacity=length+appendLength; 184 int32_t doubleCapacity=2*str.getCapacity(); 185 if(newCapacity<doubleCapacity) { 186 newCapacity=doubleCapacity; 187 } 188 if(newCapacity<256) { 189 newCapacity=256; 190 } 191 start=str.getBuffer(newCapacity); 192 if(start==NULL) { 193 // getBuffer() already did 
str.setToBogus() 194 errorCode=U_MEMORY_ALLOCATION_ERROR; 195 return FALSE; 196 } 197 reorderStart=start+reorderStartIndex; 198 limit=start+length; 199 remainingCapacity=str.getCapacity()-length; 200 return TRUE; 201 } 202 203 void ReorderingBuffer::skipPrevious() { 204 codePointLimit=codePointStart; 205 UChar c=*--codePointStart; 206 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) { 207 --codePointStart; 208 } 209 } 210 211 uint8_t ReorderingBuffer::previousCC() { 212 codePointLimit=codePointStart; 213 if(reorderStart>=codePointStart) { 214 return 0; 215 } 216 UChar32 c=*--codePointStart; 217 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) { 218 return 0; 219 } 220 221 UChar c2; 222 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) { 223 --codePointStart; 224 c=U16_GET_SUPPLEMENTARY(c2, c); 225 } 226 return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c)); 227 } 228 229 // Inserts c somewhere before the last character. 230 // Requires 0<cc<lastCC which implies reorderStart<limit. 
231 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) { 232 for(setIterator(), skipPrevious(); previousCC()>cc;) {} 233 // insert c at codePointLimit, after the character with prevCC<=cc 234 UChar *q=limit; 235 UChar *r=limit+=U16_LENGTH(c); 236 do { 237 *--r=*--q; 238 } while(codePointLimit!=q); 239 writeCodePoint(q, c); 240 if(cc<=1) { 241 reorderStart=r; 242 } 243 } 244 245 // Normalizer2Impl --------------------------------------------------------- *** 246 247 struct CanonIterData : public UMemory { 248 CanonIterData(UErrorCode &errorCode); 249 ~CanonIterData(); 250 void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode); 251 UTrie2 *trie; 252 UVector canonStartSets; // contains UnicodeSet * 253 }; 254 255 Normalizer2Impl::~Normalizer2Impl() { 256 udata_close(memory); 257 utrie2_close(normTrie); 258 delete fCanonIterData; 259 } 260 261 UBool U_CALLCONV 262 Normalizer2Impl::isAcceptable(void *context, 263 const char * /* type */, const char * /*name*/, 264 const UDataInfo *pInfo) { 265 if( 266 pInfo->size>=20 && 267 pInfo->isBigEndian==U_IS_BIG_ENDIAN && 268 pInfo->charsetFamily==U_CHARSET_FAMILY && 269 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ 270 pInfo->dataFormat[1]==0x72 && 271 pInfo->dataFormat[2]==0x6d && 272 pInfo->dataFormat[3]==0x32 && 273 pInfo->formatVersion[0]==2 274 ) { 275 Normalizer2Impl *me=(Normalizer2Impl *)context; 276 uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4); 277 return TRUE; 278 } else { 279 return FALSE; 280 } 281 } 282 283 void 284 Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) { 285 if(U_FAILURE(errorCode)) { 286 return; 287 } 288 memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode); 289 if(U_FAILURE(errorCode)) { 290 return; 291 } 292 const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory); 293 const int32_t *inIndexes=(const int32_t *)inBytes; 294 int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4; 295 
if(indexesLength<=IX_MIN_MAYBE_YES) { 296 errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes. 297 return; 298 } 299 300 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; 301 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; 302 303 minYesNo=inIndexes[IX_MIN_YES_NO]; 304 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; 305 minNoNo=inIndexes[IX_MIN_NO_NO]; 306 limitNoNo=inIndexes[IX_LIMIT_NO_NO]; 307 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; 308 309 int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET]; 310 int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; 311 normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, 312 inBytes+offset, nextOffset-offset, NULL, 313 &errorCode); 314 if(U_FAILURE(errorCode)) { 315 return; 316 } 317 318 offset=nextOffset; 319 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; 320 maybeYesCompositions=(const uint16_t *)(inBytes+offset); 321 extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes); 322 323 // smallFCD: new in formatVersion 2 324 offset=nextOffset; 325 smallFCD=inBytes+offset; 326 327 // Build tccc180[]. 328 // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300. 
329 uint8_t bits=0; 330 for(UChar c=0; c<0x180; bits>>=1) { 331 if((c&0xff)==0) { 332 bits=smallFCD[c>>8]; // one byte per 0x100 code points 333 } 334 if(bits&1) { 335 for(int i=0; i<0x20; ++i, ++c) { 336 tccc180[c]=(uint8_t)getFCD16FromNormData(c); 337 } 338 } else { 339 uprv_memset(tccc180+c, 0, 0x20); 340 c+=0x20; 341 } 342 } 343 } 344 345 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const { 346 UChar32 c; 347 if(cpStart==(cpLimit-1)) { 348 c=*cpStart; 349 } else { 350 c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]); 351 } 352 uint16_t prevNorm16=getNorm16(c); 353 if(prevNorm16<=minYesNo) { 354 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0 355 } else { 356 return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo 357 } 358 } 359 360 namespace { 361 362 class LcccContext { 363 public: 364 LcccContext(const Normalizer2Impl &ni, UnicodeSet &s) : impl(ni), set(s) {} 365 366 void handleRange(UChar32 start, UChar32 end, uint16_t norm16) { 367 if(impl.isAlgorithmicNoNo(norm16)) { 368 // Range of code points with same-norm16-value algorithmic decompositions. 369 // They might have different non-zero FCD16 values. 
370 do { 371 uint16_t fcd16=impl.getFCD16(start); 372 if(fcd16>0xff) { set.add(start); } 373 } while(++start<=end); 374 } else { 375 uint16_t fcd16=impl.getFCD16(start); 376 if(fcd16>0xff) { set.add(start, end); } 377 } 378 } 379 380 private: 381 const Normalizer2Impl &impl; 382 UnicodeSet &set; 383 }; 384 385 struct PropertyStartsContext { 386 PropertyStartsContext(const Normalizer2Impl &ni, const USetAdder *adder) 387 : impl(ni), sa(adder) {} 388 389 const Normalizer2Impl &impl; 390 const USetAdder *sa; 391 }; 392 393 } // namespace 394 395 U_CDECL_BEGIN 396 397 static UBool U_CALLCONV 398 enumLcccRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { 399 ((LcccContext *)context)->handleRange(start, end, (uint16_t)value); 400 return TRUE; 401 } 402 403 static UBool U_CALLCONV 404 enumNorm16PropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { 405 /* add the start code point to the USet */ 406 const PropertyStartsContext *ctx=(const PropertyStartsContext *)context; 407 const USetAdder *sa=ctx->sa; 408 sa->add(sa->set, start); 409 if(start!=end && ctx->impl.isAlgorithmicNoNo((uint16_t)value)) { 410 // Range of code points with same-norm16-value algorithmic decompositions. 411 // They might have different non-zero FCD16 values. 
412 uint16_t prevFCD16=ctx->impl.getFCD16(start); 413 while(++start<=end) { 414 uint16_t fcd16=ctx->impl.getFCD16(start); 415 if(fcd16!=prevFCD16) { 416 sa->add(sa->set, start); 417 prevFCD16=fcd16; 418 } 419 } 420 } 421 return TRUE; 422 } 423 424 static UBool U_CALLCONV 425 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) { 426 /* add the start code point to the USet */ 427 const USetAdder *sa=(const USetAdder *)context; 428 sa->add(sa->set, start); 429 return TRUE; 430 } 431 432 static uint32_t U_CALLCONV 433 segmentStarterMapper(const void * /*context*/, uint32_t value) { 434 return value&CANON_NOT_SEGMENT_STARTER; 435 } 436 437 U_CDECL_END 438 439 void 440 Normalizer2Impl::addLcccChars(UnicodeSet &set) const { 441 /* add the start code point of each same-value range of each trie */ 442 LcccContext context(*this, set); 443 utrie2_enum(normTrie, NULL, enumLcccRange, &context); 444 } 445 446 void 447 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const { 448 /* add the start code point of each same-value range of each trie */ 449 PropertyStartsContext context(*this, sa); 450 utrie2_enum(normTrie, NULL, enumNorm16PropertyStartsRange, &context); 451 452 /* add Hangul LV syllables and LV+1 because of skippables */ 453 for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) { 454 sa->add(sa->set, c); 455 sa->add(sa->set, c+1); 456 } 457 sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */ 458 } 459 460 void 461 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const { 462 /* add the start code point of each same-value range of the canonical iterator data trie */ 463 if(ensureCanonIterData(errorCode)) { 464 // currently only used for the SEGMENT_STARTER property 465 utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa); 466 } 467 } 468 469 const UChar * 
470 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src, 471 UChar32 minNeedDataCP, 472 ReorderingBuffer *buffer, 473 UErrorCode &errorCode) const { 474 // Make some effort to support NUL-terminated strings reasonably. 475 // Take the part of the fast quick check loop that does not look up 476 // data and check the first part of the string. 477 // After this prefix, determine the string length to simplify the rest 478 // of the code. 479 const UChar *prevSrc=src; 480 UChar c; 481 while((c=*src++)<minNeedDataCP && c!=0) {} 482 // Back out the last character for full processing. 483 // Copy this prefix. 484 if(--src!=prevSrc) { 485 if(buffer!=NULL) { 486 buffer->appendZeroCC(prevSrc, src, errorCode); 487 } 488 } 489 return src; 490 } 491 492 UnicodeString & 493 Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest, 494 UErrorCode &errorCode) const { 495 if(U_FAILURE(errorCode)) { 496 dest.setToBogus(); 497 return dest; 498 } 499 const UChar *sArray=src.getBuffer(); 500 if(&dest==&src || sArray==NULL) { 501 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 502 dest.setToBogus(); 503 return dest; 504 } 505 decompose(sArray, sArray+src.length(), dest, src.length(), errorCode); 506 return dest; 507 } 508 509 void 510 Normalizer2Impl::decompose(const UChar *src, const UChar *limit, 511 UnicodeString &dest, 512 int32_t destLengthEstimate, 513 UErrorCode &errorCode) const { 514 if(destLengthEstimate<0 && limit!=NULL) { 515 destLengthEstimate=(int32_t)(limit-src); 516 } 517 dest.remove(); 518 ReorderingBuffer buffer(*this, dest); 519 if(buffer.init(destLengthEstimate, errorCode)) { 520 decompose(src, limit, &buffer, errorCode); 521 } 522 } 523 524 // Dual functionality: 525 // buffer!=NULL: normalize 526 // buffer==NULL: isNormalized/spanQuickCheckYes 527 const UChar * 528 Normalizer2Impl::decompose(const UChar *src, const UChar *limit, 529 ReorderingBuffer *buffer, 530 UErrorCode &errorCode) const { 531 UChar32 minNoCP=minDecompNoCP; 532 if(limit==NULL) { 
533 src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode); 534 if(U_FAILURE(errorCode)) { 535 return src; 536 } 537 limit=u_strchr(src, 0); 538 } 539 540 const UChar *prevSrc; 541 UChar32 c=0; 542 uint16_t norm16=0; 543 544 // only for quick check 545 const UChar *prevBoundary=src; 546 uint8_t prevCC=0; 547 548 for(;;) { 549 // count code units below the minimum or with irrelevant data for the quick check 550 for(prevSrc=src; src!=limit;) { 551 if( (c=*src)<minNoCP || 552 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c)) 553 ) { 554 ++src; 555 } else if(!U16_IS_SURROGATE(c)) { 556 break; 557 } else { 558 UChar c2; 559 if(U16_IS_SURROGATE_LEAD(c)) { 560 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 561 c=U16_GET_SUPPLEMENTARY(c, c2); 562 } 563 } else /* trail surrogate */ { 564 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { 565 --src; 566 c=U16_GET_SUPPLEMENTARY(c2, c); 567 } 568 } 569 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) { 570 src+=U16_LENGTH(c); 571 } else { 572 break; 573 } 574 } 575 } 576 // copy these code units all at once 577 if(src!=prevSrc) { 578 if(buffer!=NULL) { 579 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) { 580 break; 581 } 582 } else { 583 prevCC=0; 584 prevBoundary=src; 585 } 586 } 587 if(src==limit) { 588 break; 589 } 590 591 // Check one above-minimum, relevant code point. 592 src+=U16_LENGTH(c); 593 if(buffer!=NULL) { 594 if(!decompose(c, norm16, *buffer, errorCode)) { 595 break; 596 } 597 } else { 598 if(isDecompYes(norm16)) { 599 uint8_t cc=getCCFromYesOrMaybe(norm16); 600 if(prevCC<=cc || cc==0) { 601 prevCC=cc; 602 if(cc<=1) { 603 prevBoundary=src; 604 } 605 continue; 606 } 607 } 608 return prevBoundary; // "no" or cc out of order 609 } 610 } 611 return src; 612 } 613 614 // Decompose a short piece of text which is likely to contain characters that 615 // fail the quick check loop and/or where the quick check loop's overhead 616 // is unlikely to be amortized. 
617 // Called by the compose() and makeFCD() implementations. 618 UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit, 619 ReorderingBuffer &buffer, 620 UErrorCode &errorCode) const { 621 while(src<limit) { 622 UChar32 c; 623 uint16_t norm16; 624 UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16); 625 if(!decompose(c, norm16, buffer, errorCode)) { 626 return FALSE; 627 } 628 } 629 return TRUE; 630 } 631 632 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16, 633 ReorderingBuffer &buffer, 634 UErrorCode &errorCode) const { 635 // Only loops for 1:1 algorithmic mappings. 636 for(;;) { 637 // get the decomposition and the lead and trail cc's 638 if(isDecompYes(norm16)) { 639 // c does not decompose 640 return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode); 641 } else if(isHangul(norm16)) { 642 // Hangul syllable: decompose algorithmically 643 UChar jamos[3]; 644 return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode); 645 } else if(isDecompNoAlgorithmic(norm16)) { 646 c=mapAlgorithmic(c, norm16); 647 norm16=getNorm16(c); 648 } else { 649 // c decomposes, get everything from the variable-length extra data 650 const uint16_t *mapping=getMapping(norm16); 651 uint16_t firstUnit=*mapping; 652 int32_t length=firstUnit&MAPPING_LENGTH_MASK; 653 uint8_t leadCC, trailCC; 654 trailCC=(uint8_t)(firstUnit>>8); 655 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { 656 leadCC=(uint8_t)(*(mapping-1)>>8); 657 } else { 658 leadCC=0; 659 } 660 return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode); 661 } 662 } 663 } 664 665 const UChar * 666 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const { 667 const UChar *decomp=NULL; 668 uint16_t norm16; 669 for(;;) { 670 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 671 // c does not decompose 672 return decomp; 673 } else if(isHangul(norm16)) { 674 // Hangul syllable: decompose algorithmically 675 
length=Hangul::decompose(c, buffer); 676 return buffer; 677 } else if(isDecompNoAlgorithmic(norm16)) { 678 c=mapAlgorithmic(c, norm16); 679 decomp=buffer; 680 length=0; 681 U16_APPEND_UNSAFE(buffer, length, c); 682 } else { 683 // c decomposes, get everything from the variable-length extra data 684 const uint16_t *mapping=getMapping(norm16); 685 length=*mapping&MAPPING_LENGTH_MASK; 686 return (const UChar *)mapping+1; 687 } 688 } 689 } 690 691 // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1 692 // so that a raw mapping fits that consists of one unit ("rm0") 693 // plus all but the first two code units of the normal mapping. 694 // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK. 695 const UChar * 696 Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const { 697 // We do not loop in this method because an algorithmic mapping itself 698 // becomes a final result rather than having to be decomposed recursively. 699 uint16_t norm16; 700 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 701 // c does not decompose 702 return NULL; 703 } else if(isHangul(norm16)) { 704 // Hangul syllable: decompose algorithmically 705 Hangul::getRawDecomposition(c, buffer); 706 length=2; 707 return buffer; 708 } else if(isDecompNoAlgorithmic(norm16)) { 709 c=mapAlgorithmic(c, norm16); 710 length=0; 711 U16_APPEND_UNSAFE(buffer, length, c); 712 return buffer; 713 } else { 714 // c decomposes, get everything from the variable-length extra data 715 const uint16_t *mapping=getMapping(norm16); 716 uint16_t firstUnit=*mapping; 717 int32_t mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping 718 if(firstUnit&MAPPING_HAS_RAW_MAPPING) { 719 // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word. 
720 // Bit 7=MAPPING_HAS_CCC_LCCC_WORD 721 const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1; 722 uint16_t rm0=*rawMapping; 723 if(rm0<=MAPPING_LENGTH_MASK) { 724 length=rm0; 725 return (const UChar *)rawMapping-rm0; 726 } else { 727 // Copy the normal mapping and replace its first two code units with rm0. 728 buffer[0]=(UChar)rm0; 729 u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2); 730 length=mLength-1; 731 return buffer; 732 } 733 } else { 734 length=mLength; 735 return (const UChar *)mapping+1; 736 } 737 } 738 } 739 740 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit, 741 UBool doDecompose, 742 UnicodeString &safeMiddle, 743 ReorderingBuffer &buffer, 744 UErrorCode &errorCode) const { 745 buffer.copyReorderableSuffixTo(safeMiddle); 746 if(doDecompose) { 747 decompose(src, limit, &buffer, errorCode); 748 return; 749 } 750 // Just merge the strings at the boundary. 751 ForwardUTrie2StringIterator iter(normTrie, src, limit); 752 uint8_t firstCC, prevCC, cc; 753 firstCC=prevCC=cc=getCC(iter.next16()); 754 while(cc!=0) { 755 prevCC=cc; 756 cc=getCC(iter.next16()); 757 }; 758 if(limit==NULL) { // appendZeroCC() needs limit!=NULL 759 limit=u_strchr(iter.codePointStart, 0); 760 } 761 762 if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) { 763 buffer.appendZeroCC(iter.codePointStart, limit, errorCode); 764 } 765 } 766 767 // Note: hasDecompBoundary() could be implemented as aliases to 768 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter() 769 // at the cost of building the FCD trie for a decomposition normalizer. 
770 UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const { 771 for(;;) { 772 if(c<minDecompNoCP) { 773 return TRUE; 774 } 775 uint16_t norm16=getNorm16(c); 776 if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) { 777 return TRUE; 778 } else if(norm16>MIN_NORMAL_MAYBE_YES) { 779 return FALSE; // ccc!=0 780 } else if(isDecompNoAlgorithmic(norm16)) { 781 c=mapAlgorithmic(c, norm16); 782 } else { 783 // c decomposes, get everything from the variable-length extra data 784 const uint16_t *mapping=getMapping(norm16); 785 uint16_t firstUnit=*mapping; 786 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 787 return FALSE; 788 } 789 if(!before) { 790 // decomp after-boundary: same as hasFCDBoundaryAfter(), 791 // fcd16<=1 || trailCC==0 792 if(firstUnit>0x1ff) { 793 return FALSE; // trailCC>1 794 } 795 if(firstUnit<=0xff) { 796 return TRUE; // trailCC==0 797 } 798 // if(trailCC==1) test leadCC==0, same as checking for before-boundary 799 } 800 // TRUE if leadCC==0 (hasFCDBoundaryBefore()) 801 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0; 802 } 803 } 804 } 805 806 /* 807 * Finds the recomposition result for 808 * a forward-combining "lead" character, 809 * specified with a pointer to its compositions list, 810 * and a backward-combining "trail" character. 811 * 812 * If the lead and trail characters combine, then this function returns 813 * the following "compositeAndFwd" value: 814 * Bits 21..1 composite character 815 * Bit 0 set if the composite is a forward-combining starter 816 * otherwise it returns -1. 817 * 818 * The compositions list has (trail, compositeAndFwd) pair entries, 819 * encoded as either pairs or triples of 16-bit units. 820 * The last entry has the high bit of its first unit set. 821 * 822 * The list is sorted by ascending trail characters (there are no duplicates). 823 * A linear search is used. 824 * 825 * See normalizer2impl.h for a more detailed description 826 * of the compositions list format. 
827 */ 828 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) { 829 uint16_t key1, firstUnit; 830 if(trail<COMP_1_TRAIL_LIMIT) { 831 // trail character is 0..33FF 832 // result entry may have 2 or 3 units 833 key1=(uint16_t)(trail<<1); 834 while(key1>(firstUnit=*list)) { 835 list+=2+(firstUnit&COMP_1_TRIPLE); 836 } 837 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 838 if(firstUnit&COMP_1_TRIPLE) { 839 return ((int32_t)list[1]<<16)|list[2]; 840 } else { 841 return list[1]; 842 } 843 } 844 } else { 845 // trail character is 3400..10FFFF 846 // result entry has 3 units 847 key1=(uint16_t)(COMP_1_TRAIL_LIMIT+ 848 (((trail>>COMP_1_TRAIL_SHIFT))& 849 ~COMP_1_TRIPLE)); 850 uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT); 851 uint16_t secondUnit; 852 for(;;) { 853 if(key1>(firstUnit=*list)) { 854 list+=2+(firstUnit&COMP_1_TRIPLE); 855 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 856 if(key2>(secondUnit=list[1])) { 857 if(firstUnit&COMP_1_LAST_TUPLE) { 858 break; 859 } else { 860 list+=3; 861 } 862 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { 863 return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2]; 864 } else { 865 break; 866 } 867 } else { 868 break; 869 } 870 } 871 } 872 return -1; 873 } 874 875 /** 876 * @param list some character's compositions list 877 * @param set recursively receives the composites from these compositions 878 */ 879 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const { 880 uint16_t firstUnit; 881 int32_t compositeAndFwd; 882 do { 883 firstUnit=*list; 884 if((firstUnit&COMP_1_TRIPLE)==0) { 885 compositeAndFwd=list[1]; 886 list+=2; 887 } else { 888 compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2]; 889 list+=3; 890 } 891 UChar32 composite=compositeAndFwd>>1; 892 if((compositeAndFwd&1)!=0) { 893 addComposites(getCompositionsListForComposite(getNorm16(composite)), set); 894 } 895 set.add(composite); 896 } while((firstUnit&COMP_1_LAST_TUPLE)==0); 897 } 898 899 /* 900 * 
Recomposes the buffer text starting at recomposeStartIndex 901 * (which is in NFD - decomposed and canonically ordered), 902 * and truncates the buffer contents. 903 * 904 * Note that recomposition never lengthens the text: 905 * Any character consists of either one or two code units; 906 * a composition may contain at most one more code unit than the original starter, 907 * while the combining mark that is removed has at least one code unit. 908 */ 909 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex, 910 UBool onlyContiguous) const { 911 UChar *p=buffer.getStart()+recomposeStartIndex; 912 UChar *limit=buffer.getLimit(); 913 if(p==limit) { 914 return; 915 } 916 917 UChar *starter, *pRemove, *q, *r; 918 const uint16_t *compositionsList; 919 UChar32 c, compositeAndFwd; 920 uint16_t norm16; 921 uint8_t cc, prevCC; 922 UBool starterIsSupplementary; 923 924 // Some of the following variables are not used until we have a forward-combining starter 925 // and are only initialized now to avoid compiler warnings. 926 compositionsList=NULL; // used as indicator for whether we have a forward-combining starter 927 starter=NULL; 928 starterIsSupplementary=FALSE; 929 prevCC=0; 930 931 for(;;) { 932 UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16); 933 cc=getCCFromYesOrMaybe(norm16); 934 if( // this character combines backward and 935 isMaybe(norm16) && 936 // we have seen a starter that combines forward and 937 compositionsList!=NULL && 938 // the backward-combining character is not blocked 939 (prevCC<cc || prevCC==0) 940 ) { 941 if(isJamoVT(norm16)) { 942 // c is a Jamo V/T, see if we can compose it with the previous character. 943 if(c<Hangul::JAMO_T_BASE) { 944 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 
945 UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE); 946 if(prev<Hangul::JAMO_L_COUNT) { 947 pRemove=p-1; 948 UChar syllable=(UChar) 949 (Hangul::HANGUL_BASE+ 950 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))* 951 Hangul::JAMO_T_COUNT); 952 UChar t; 953 if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) { 954 ++p; 955 syllable+=t; // The next character was a Jamo T. 956 } 957 *starter=syllable; 958 // remove the Jamo V/T 959 q=pRemove; 960 r=p; 961 while(r<limit) { 962 *q++=*r++; 963 } 964 limit=q; 965 p=pRemove; 966 } 967 } 968 /* 969 * No "else" for Jamo T: 970 * Since the input is in NFD, there are no Hangul LV syllables that 971 * a Jamo T could combine with. 972 * All Jamo Ts are combined above when handling Jamo Vs. 973 */ 974 if(p==limit) { 975 break; 976 } 977 compositionsList=NULL; 978 continue; 979 } else if((compositeAndFwd=combine(compositionsList, c))>=0) { 980 // The starter and the combining mark (c) do combine. 981 UChar32 composite=compositeAndFwd>>1; 982 983 // Replace the starter with the composite, remove the combining mark. 984 pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the combining mark 985 if(starterIsSupplementary) { 986 if(U_IS_SUPPLEMENTARY(composite)) { 987 // both are supplementary 988 starter[0]=U16_LEAD(composite); 989 starter[1]=U16_TRAIL(composite); 990 } else { 991 *starter=(UChar)composite; 992 // The composite is shorter than the starter, 993 // move the intermediate characters forward one. 994 starterIsSupplementary=FALSE; 995 q=starter+1; 996 r=q+1; 997 while(r<pRemove) { 998 *q++=*r++; 999 } 1000 --pRemove; 1001 } 1002 } else if(U_IS_SUPPLEMENTARY(composite)) { 1003 // The composite is longer than the starter, 1004 // move the intermediate characters back one. 
1005 starterIsSupplementary=TRUE; 1006 ++starter; // temporarily increment for the loop boundary 1007 q=pRemove; 1008 r=++pRemove; 1009 while(starter<q) { 1010 *--r=*--q; 1011 } 1012 *starter=U16_TRAIL(composite); 1013 *--starter=U16_LEAD(composite); // undo the temporary increment 1014 } else { 1015 // both are on the BMP 1016 *starter=(UChar)composite; 1017 } 1018 1019 /* remove the combining mark by moving the following text over it */ 1020 if(pRemove<p) { 1021 q=pRemove; 1022 r=p; 1023 while(r<limit) { 1024 *q++=*r++; 1025 } 1026 limit=q; 1027 p=pRemove; 1028 } 1029 // Keep prevCC because we removed the combining mark. 1030 1031 if(p==limit) { 1032 break; 1033 } 1034 // Is the composite a starter that combines forward? 1035 if(compositeAndFwd&1) { 1036 compositionsList= 1037 getCompositionsListForComposite(getNorm16(composite)); 1038 } else { 1039 compositionsList=NULL; 1040 } 1041 1042 // We combined; continue with looking for compositions. 1043 continue; 1044 } 1045 } 1046 1047 // no combination this time 1048 prevCC=cc; 1049 if(p==limit) { 1050 break; 1051 } 1052 1053 // If c did not combine, then check if it is a starter. 1054 if(cc==0) { 1055 // Found a new starter. 1056 if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) { 1057 // It may combine with something, prepare for it. 1058 if(U_IS_BMP(c)) { 1059 starterIsSupplementary=FALSE; 1060 starter=p-1; 1061 } else { 1062 starterIsSupplementary=TRUE; 1063 starter=p-2; 1064 } 1065 } 1066 } else if(onlyContiguous) { 1067 // FCC: no discontiguous compositions; any intervening character blocks. 
1068 compositionsList=NULL; 1069 } 1070 } 1071 buffer.setReorderingLimit(limit); 1072 } 1073 1074 UChar32 1075 Normalizer2Impl::composePair(UChar32 a, UChar32 b) const { 1076 uint16_t norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0 1077 const uint16_t *list; 1078 if(isInert(norm16)) { 1079 return U_SENTINEL; 1080 } else if(norm16<minYesNoMappingsOnly) { 1081 if(isJamoL(norm16)) { 1082 b-=Hangul::JAMO_V_BASE; 1083 if(0<=b && b<Hangul::JAMO_V_COUNT) { 1084 return 1085 (Hangul::HANGUL_BASE+ 1086 ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)* 1087 Hangul::JAMO_T_COUNT); 1088 } else { 1089 return U_SENTINEL; 1090 } 1091 } else if(isHangul(norm16)) { 1092 b-=Hangul::JAMO_T_BASE; 1093 if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) { // not b==0! 1094 return a+b; 1095 } else { 1096 return U_SENTINEL; 1097 } 1098 } else { 1099 // 'a' has a compositions list in extraData 1100 list=extraData+norm16; 1101 if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list 1102 list+= // mapping pointer 1103 1+ // +1 to skip the first unit with the mapping lenth 1104 (*list&MAPPING_LENGTH_MASK); // + mapping length 1105 } 1106 } 1107 } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) { 1108 return U_SENTINEL; 1109 } else { 1110 list=maybeYesCompositions+norm16-minMaybeYes; 1111 } 1112 if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b 1113 return U_SENTINEL; 1114 } 1115 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC 1116 return combine(list, b)>>1; 1117 #else 1118 int32_t compositeAndFwd=combine(list, b); 1119 return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL; 1120 #endif 1121 } 1122 1123 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. 
// doCompose: normalize
// !doCompose: isNormalized (buffer must be empty and initialized)
//
// src/limit: the input; limit==NULL means that src is NUL-terminated.
// onlyContiguous: TRUE for FCC (no discontiguous compositions).
// Returns FALSE if copying the low prefix of a NUL-terminated string failed, or,
// in isNormalized mode, as soon as the text is found not to be normalized.
// Returns TRUE otherwise. Note that the loop is also left via "break" when
// appending to or decomposing into the buffer fails, and TRUE is returned in that
// case too, so callers presumably need to check errorCode as well.
UBool
Normalizer2Impl::compose(const UChar *src, const UChar *limit,
                         UBool onlyContiguous,
                         UBool doCompose,
                         ReorderingBuffer &buffer,
                         UErrorCode &errorCode) const {
    /*
     * prevBoundary points to the last character before the current one
     * that has a composition boundary before it with ccc==0 and quick check "yes".
     * Keeping track of prevBoundary saves us looking for a composition boundary
     * when we find a "no" or "maybe".
     *
     * When we back out from prevSrc back to prevBoundary,
     * then we also remove those same characters (which had been simply copied
     * or canonically-order-inserted) from the ReorderingBuffer.
     * Therefore, at all times, the [prevBoundary..prevSrc[ source units
     * must correspond 1:1 to destination units at the end of the destination buffer.
     */
    const UChar *prevBoundary=src;
    UChar32 minNoMaybeCP=minCompNoMaybeCP;
    if(limit==NULL) {
        src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
                                           doCompose ? &buffer : NULL,
                                           errorCode);
        if(U_FAILURE(errorCode)) {
            return FALSE;
        }
        if(prevBoundary<src) {
            // Set prevBoundary to the last character in the prefix.
            prevBoundary=src-1;
        }
        limit=u_strchr(src, 0);
    }

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t norm16=0;

    // only for isNormalized
    uint8_t prevCC=0;

    for(;;) {
        // count code units below the minimum or with irrelevant data for the quick check
        for(prevSrc=src; src!=limit;) {
            if( (c=*src)<minNoMaybeCP ||
                isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
            ) {
                ++src;
            } else if(!U16_IS_SURROGATE(c)) {
                break;
            } else {
                // c is a surrogate code unit: assemble the supplementary code point
                // if c is part of a well-formed pair, and look up its data.
                UChar c2;
                if(U16_IS_SURROGATE_LEAD(c)) {
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                    }
                } else /* trail surrogate */ {
                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                        --src;
                        c=U16_GET_SUPPLEMENTARY(c2, c);
                    }
                }
                if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        // copy these code units all at once
        if(src!=prevSrc) {
            if(doCompose) {
                if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
                    break;
                }
            } else {
                prevCC=0;
            }
            if(src==limit) {
                break;
            }
            // Set prevBoundary to the last character in the quick check loop.
            prevBoundary=src-1;
            if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
                U16_IS_LEAD(*(prevBoundary-1))
            ) {
                --prevBoundary;
            }
            // The start of the current character (c).
            prevSrc=src;
        } else if(src==limit) {
            break;
        }

        src+=U16_LENGTH(c);
        /*
         * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
         * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
         * or has ccc!=0.
         * Check for Jamo V/T, then for regular characters.
         * c is not a Hangul syllable or Jamo L because those have "yes" properties.
         */
        if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
            UChar prev=*(prevSrc-1);
            UBool needToDecompose=FALSE;
            if(c<Hangul::JAMO_T_BASE) {
                // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
                prev=(UChar)(prev-Hangul::JAMO_L_BASE);
                if(prev<Hangul::JAMO_L_COUNT) {
                    if(!doCompose) {
                        return FALSE;
                    }
                    UChar syllable=(UChar)
                        (Hangul::HANGUL_BASE+
                         (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
                         Hangul::JAMO_T_COUNT);
                    UChar t;
                    if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
                        ++src;
                        syllable+=t;  // The next character was a Jamo T.
                        prevBoundary=src;
                        buffer.setLastChar(syllable);
                        continue;
                    }
                    // If we see L+V+x where x!=T then we drop to the slow path,
                    // decompose and recompose.
                    // This is to deal with NFKC finding normal L and V but a
                    // compatibility variant of a T. We need to either fully compose that
                    // combination here (which would complicate the code and may not work
                    // with strange custom data) or use the slow path -- or else our replacing
                    // two input characters (L+V) with one output character (LV syllable)
                    // would violate the invariant that [prevBoundary..prevSrc[ has the same
                    // length as what we appended to the buffer since prevBoundary.
                    needToDecompose=TRUE;
                }
            } else if(Hangul::isHangulWithoutJamoT(prev)) {
                // c is a Jamo Trailing consonant,
                // compose with previous Hangul LV that does not contain a Jamo T.
                if(!doCompose) {
                    return FALSE;
                }
                buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
                prevBoundary=src;
                continue;
            }
            if(!needToDecompose) {
                // The Jamo V/T did not compose into a Hangul syllable.
                if(doCompose) {
                    if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
                        break;
                    }
                } else {
                    prevCC=0;
                }
                continue;
            }
        }
        /*
         * Source buffer pointers:
         *
         *  all done      quick check   current char  not yet
         *                "yes" but     (c)           processed
         *                may combine
         *                forward
         * [-------------[-------------[-------------[-------------[
         * |             |             |             |             |
         * orig. src     prevBoundary  prevSrc       src           limit
         *
         *
         * Destination buffer pointers inside the ReorderingBuffer:
         *
         *  all done      might take    not filled yet
         *                characters for
         *                reordering
         * [-------------[-------------[-------------[
         * |             |             |             |
         * start         reorderStart  limit         |
         *                             +remainingCap.+
         */
        if(norm16>=MIN_YES_YES_WITH_CC) {
            uint8_t cc=(uint8_t)norm16;  // cc!=0
            if( onlyContiguous &&  // FCC
                (doCompose ? buffer.getLastCC() : prevCC)==0 &&
                prevBoundary<prevSrc &&
                // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
                // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
                // passed the quick check "yes && ccc==0" test.
                // Check whether the last character was a "yesYes" or a "yesNo".
                // If a "yesNo", then we get its trailing ccc from its
                // mapping and check for canonical order.
                // All other cases are ok.
                getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
            ) {
                // Fails FCD test, need to decompose and contiguously recompose.
                if(!doCompose) {
                    return FALSE;
                }
            } else if(doCompose) {
                if(!buffer.append(c, cc, errorCode)) {
                    break;
                }
                continue;
            } else if(prevCC<=cc) {
                prevCC=cc;
                continue;
            } else {
                return FALSE;
            }
        } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
            return FALSE;
        }

        /*
         * Find appropriate boundaries around this character,
         * decompose the source text from between the boundaries,
         * and recompose it.
         *
         * We may need to remove the last few characters from the ReorderingBuffer
         * to account for source text that was copied or appended
         * but needs to take part in the recomposition.
         */

        /*
         * Find the last composition boundary in [prevBoundary..src[.
         * It is either the decomposition of the current character (at prevSrc),
         * or prevBoundary.
         */
        if(hasCompBoundaryBefore(c, norm16)) {
            prevBoundary=prevSrc;
        } else if(doCompose) {
            buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
        }

        // Find the next composition boundary in [src..limit[ -
        // modifies src to point to the next starter.
        src=(UChar *)findNextCompBoundary(src, limit);

        // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
        int32_t recomposeStartIndex=buffer.length();
        if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
            break;
        }
        recompose(buffer, recomposeStartIndex, onlyContiguous);
        if(!doCompose) {
            // isNormalized: the recomposed text must be identical to the source text;
            // then empty the buffer again for the next comparison.
            if(!buffer.equals(prevBoundary, src)) {
                return FALSE;
            }
            buffer.remove();
            prevCC=0;
        }

        // Move to the next starter. We never need to look back before this point again.
        prevBoundary=src;
    }
    return TRUE;
}

// Very similar to compose(): Make the same changes in both places if relevant.
// pQCResult==NULL: spanQuickCheckYes
// pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
//
// src/limit: the input; limit==NULL means that src is NUL-terminated.
// Returns limit (the end of the input) if the scan reached the end without a
// definite "no"; otherwise returns prevBoundary.
// With pQCResult!=NULL, a "maybe" character sets *pQCResult=UNORM_MAYBE and the
// scan continues; a "no" sets *pQCResult=UNORM_NO and stops.
// With pQCResult==NULL, the scan stops at the first "maybe" or "no".
const UChar *
Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
                                   UBool onlyContiguous,
                                   UNormalizationCheckResult *pQCResult) const {
    /*
     * prevBoundary points to the last character before the current one
     * that has a composition boundary before it with ccc==0 and quick check "yes".
     */
    const UChar *prevBoundary=src;
    UChar32 minNoMaybeCP=minCompNoMaybeCP;
    if(limit==NULL) {
        // No buffer is passed here, so the local errorCode is not checked afterwards.
        UErrorCode errorCode=U_ZERO_ERROR;
        src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
        if(prevBoundary<src) {
            // Set prevBoundary to the last character in the prefix.
            prevBoundary=src-1;
        }
        limit=u_strchr(src, 0);
    }

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t norm16=0;
    uint8_t prevCC=0;

    for(;;) {
        // count code units below the minimum or with irrelevant data for the quick check
        for(prevSrc=src;;) {
            if(src==limit) {
                return src;
            }
            if( (c=*src)<minNoMaybeCP ||
                isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
            ) {
                ++src;
            } else if(!U16_IS_SURROGATE(c)) {
                break;
            } else {
                // c is a surrogate code unit: assemble the supplementary code point
                // if c is part of a well-formed pair, and look up its data.
                UChar c2;
                if(U16_IS_SURROGATE_LEAD(c)) {
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                    }
                } else /* trail surrogate */ {
                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                        --src;
                        c=U16_GET_SUPPLEMENTARY(c2, c);
                    }
                }
                if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        if(src!=prevSrc) {
            // Set prevBoundary to the last character in the quick check loop.
            prevBoundary=src-1;
            if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
                U16_IS_LEAD(*(prevBoundary-1))
            ) {
                --prevBoundary;
            }
            prevCC=0;
            // The start of the current character (c).
            prevSrc=src;
        }

        src+=U16_LENGTH(c);
        /*
         * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
         * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
         * or has ccc!=0.
         */
        if(isMaybeOrNonZeroCC(norm16)) {
            uint8_t cc=getCCFromYesOrMaybe(norm16);
            if( onlyContiguous &&  // FCC
                cc!=0 &&
                prevCC==0 &&
                prevBoundary<prevSrc &&
                // prevCC==0 && prevBoundary<prevSrc tell us that
                // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
                // passed the quick check "yes && ccc==0" test.
                // Check whether the last character was a "yesYes" or a "yesNo".
                // If a "yesNo", then we get its trailing ccc from its
                // mapping and check for canonical order.
                // All other cases are ok.
                getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
            ) {
                // Fails FCD test.
            } else if(prevCC<=cc || cc==0) {
                prevCC=cc;
                if(norm16<MIN_YES_YES_WITH_CC) {
                    if(pQCResult!=NULL) {
                        *pQCResult=UNORM_MAYBE;
                    } else {
                        return prevBoundary;
                    }
                }
                continue;
            }
        }
        // Falls through to here for a "no" character, a failed FCD test,
        // or combining classes that are out of order.
        if(pQCResult!=NULL) {
            *pQCResult=UNORM_NO;
        }
        return prevBoundary;
    }
}

// Appends [src..limit[ to the non-const buffer, composing across the seam.
// If the buffer is not empty and src does not start at a composition boundary,
// then the buffer suffix starting at its last composition boundary is removed,
// a copy of that suffix is stored in safeMiddle, and the suffix together with
// the source prefix up to the first boundary is composed back into the buffer.
// The remainder of the source is composed (doCompose) or appended as is (!doCompose).
// limit==NULL means that src is NUL-terminated.
void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
                                       UBool doCompose,
                                       UBool onlyContiguous,
                                       UnicodeString &safeMiddle,
                                       ReorderingBuffer &buffer,
                                       UErrorCode &errorCode) const {
    if(!buffer.isEmpty()) {
        const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
        if(src!=firstStarterInSrc) {
            const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
                                                                    buffer.getLimit());
            int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
            UnicodeString middle(lastStarterInDest, destSuffixLength);
            buffer.removeSuffix(destSuffixLength);
            // Save the removed destination suffix before the source prefix is added to it.
            safeMiddle=middle;
            middle.append(src, (int32_t)(firstStarterInSrc-src));
            const UChar *middleStart=middle.getBuffer();
            compose(middleStart, middleStart+middle.length(), onlyContiguous,
                    TRUE, buffer, errorCode);
            if(U_FAILURE(errorCode)) {
                return;
            }
            src=firstStarterInSrc;
        }
    }
    if(doCompose) {
        compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
    } else {
        if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
            limit=u_strchr(src, 0);
        }
        buffer.appendZeroCC(src, limit, errorCode);
    }
}

/**
 * Does c have a composition boundary before it?
 * True if its decomposition begins with a character that has
 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
 * (isCompYesAndZeroCC()) so we need not decompose.
 */
UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
    // Only loops for algorithmic mappings.
    for(;;) {
        if(isCompYesAndZeroCC(norm16)) {
            return TRUE;
        } else if(isMaybeOrNonZeroCC(norm16)) {
            return FALSE;
        } else if(isDecompNoAlgorithmic(norm16)) {
            c=mapAlgorithmic(c, norm16);
            norm16=getNorm16(c);
        } else {
            // c decomposes, get everything from the variable-length extra data
            const uint16_t *mapping=getMapping(norm16);
            uint16_t firstUnit=*mapping;
            if((firstUnit&MAPPING_LENGTH_MASK)==0) {
                // Empty mapping: there is no first character to test.
                return FALSE;
            }
            if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) {
                return FALSE;  // non-zero leadCC
            }
            int32_t i=1;  // skip over the firstUnit
            UChar32 c;  // first code point of the mapping; shadows the parameter c
            U16_NEXT_UNSAFE(mapping, i, c);
            return isCompYesAndZeroCC(getNorm16(c));
        }
    }
}

// Does c have a composition boundary after it?
// onlyContiguous: TRUE for FCC; then a mapping additionally needs firstUnit<=0x1ff
// (see the trailCC comment at the end of the function).
// testInert: selects the threshold (minNoNo rather than minMaybeYes) at or above
// which the function returns FALSE without looking at a mapping.
UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {
    // Only loops for algorithmic mappings.
    for(;;) {
        uint16_t norm16=getNorm16(c);
        if(isInert(norm16)) {
            return TRUE;
        } else if(norm16<=minYesNo) {
            // Hangul: norm16==minYesNo
            // Hangul LVT has a boundary after it.
            // Hangul LV and non-inert yesYes characters combine forward.
            return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
        } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
            return FALSE;
        } else if(isDecompNoAlgorithmic(norm16)) {
            c=mapAlgorithmic(c, norm16);
        } else {
            // c decomposes, get everything from the variable-length extra data.
            // If testInert, then c must be a yesNo character which has lccc=0,
            // otherwise it could be a noNo.
            const uint16_t *mapping=getMapping(norm16);
            uint16_t firstUnit=*mapping;
            // TRUE if
            //   not MAPPING_NO_COMP_BOUNDARY_AFTER
            //     (which is set if
            //       c is not deleted, and
            //       it and its decomposition do not combine forward, and it has a starter)
            //   and if FCC then trailCC<=1
            return
                (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
                (!onlyContiguous || firstUnit<=0x1ff);
        }
    }
}

// Iterates backward from p and returns the start of the nearest code point
// that has a composition boundary before it.
// NOTE(review): the loop itself has no start<p test; it relies on the iterator
// and on the text containing a boundary -- confirm against
// BackwardUTrie2StringIterator.
const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {
    BackwardUTrie2StringIterator iter(normTrie, start, p);
    uint16_t norm16;
    do {
        norm16=iter.previous16();
    } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
    // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
    // but that's probably not worth the extra cost.
    return iter.codePointStart;
}

// Iterates forward from p and returns the start of the first code point
// that has a composition boundary before it.
// NOTE(review): the loop itself has no p<limit test; it relies on what
// ForwardUTrie2StringIterator yields at the limit -- confirm.
const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const {
    ForwardUTrie2StringIterator iter(normTrie, p, limit);
    uint16_t norm16;
    do {
        norm16=iter.next16();
    } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
    return iter.codePointStart;
}

// Note: normalizer2impl.cpp r30982 (2011-nov-27)
// still had getFCDTrie() which built and cached an FCD trie.
// That provided faster access to FCD data than getFCD16FromNormData()
// but required synchronization and consumed some 10kB of heap memory
// in any process that uses FCD (e.g., via collation).
// tccc180[] and smallFCD[] are intended to help with any loss of performance,
// at least for Latin & CJK.

// Gets the FCD value from the regular normalization data.
// The result has the lead combining class (lccc) in bits 15..8
// and the trail combining class (tccc) in bits 7..0.
uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
    // Only loops for 1:1 algorithmic mappings.
    for(;;) {
        uint16_t norm16=getNorm16(c);
        if(norm16<=minYesNo) {
            // no decomposition or Hangul syllable, all zeros
            return 0;
        } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
            // combining mark
            norm16&=0xff;
            return norm16|(norm16<<8);  // lccc==tccc==cc
        } else if(norm16>=minMaybeYes) {
            return 0;
        } else if(isDecompNoAlgorithmic(norm16)) {
            c=mapAlgorithmic(c, norm16);
        } else {
            // c decomposes, get everything from the variable-length extra data
            const uint16_t *mapping=getMapping(norm16);
            uint16_t firstUnit=*mapping;
            if((firstUnit&MAPPING_LENGTH_MASK)==0) {
                // A character that is deleted (maps to an empty string) must
                // get the worst-case lccc and tccc values because arbitrary
                // characters on both sides will become adjacent.
                return 0x1ff;
            } else {
                norm16=firstUnit>>8;  // tccc
                if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
                    norm16|=*(mapping-1)&0xff00;  // lccc
                }
                return norm16;
            }
        }
    }
}

// Dual functionality:
// buffer!=NULL: normalize
// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
//
// src/limit: the input; limit==NULL means that src is NUL-terminated.
// Returns prevBoundary if buffer==NULL and the text fails the FCD check there.
// Otherwise returns the current src pointer: the end of the input when all of it
// was processed, or the position reached when copying the low prefix, appending
// or decomposing failed (callers presumably check errorCode for that).
const UChar *
Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
                         ReorderingBuffer *buffer,
                         UErrorCode &errorCode) const {
    // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
    // Similar to the prevBoundary in the compose() implementation.
    const UChar *prevBoundary=src;
    // FCD value of the previous character; a negative value is the bitwise
    // complement (~c) of a code point whose lookup was deferred.
    int32_t prevFCD16=0;
    if(limit==NULL) {
        src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
        if(U_FAILURE(errorCode)) {
            return src;
        }
        if(prevBoundary<src) {
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            // Fetching the fcd16 value was deferred for this below-U+0300 code point.
            prevFCD16=getFCD16(*(src-1));
            if(prevFCD16>1) {
                --prevBoundary;
            }
        }
        limit=u_strchr(src, 0);
    }

    // Note: In this function we use buffer->appendZeroCC() because we track
    // the lead and trail combining classes here, rather than leaving it to
    // the ReorderingBuffer.
    // The exception is the call to decomposeShort() which uses the buffer
    // in the normal way.

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t fcd16=0;

    for(;;) {
        // count code units with lccc==0
        for(prevSrc=src; src!=limit;) {
            if((c=*src)<MIN_CCC_LCCC_CP) {
                // Defer the FCD lookup: remember the code point as a negative value.
                prevFCD16=~c;
                ++src;
            } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
                prevFCD16=0;
                ++src;
            } else {
                if(U16_IS_SURROGATE(c)) {
                    UChar c2;
                    if(U16_IS_SURROGATE_LEAD(c)) {
                        if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                            c=U16_GET_SUPPLEMENTARY(c, c2);
                        }
                    } else /* trail surrogate */ {
                        if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                            --src;
                            c=U16_GET_SUPPLEMENTARY(c2, c);
                        }
                    }
                }
                if((fcd16=getFCD16FromNormData(c))<=0xff) {
                    // lccc==0
                    prevFCD16=fcd16;
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        // copy these code units all at once
        if(src!=prevSrc) {
            if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
                break;
            }
            if(src==limit) {
                break;
            }
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            if(prevFCD16<0) {
                // Fetching the fcd16 value was deferred for this below-U+0300 code point.
                UChar32 prev=~prevFCD16;
                prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
                if(prevFCD16>1) {
                    --prevBoundary;
                }
            } else {
                const UChar *p=src-1;
                if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
                    --p;
                    // Need to fetch the previous character's FCD value because
                    // prevFCD16 was just for the trail surrogate code point.
                    prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
                    // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
                }
                if(prevFCD16>1) {
                    prevBoundary=p;
                }
            }
            // The start of the current character (c).
            prevSrc=src;
        } else if(src==limit) {
            break;
        }

        src+=U16_LENGTH(c);
        // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
        // Check for proper order, and decompose locally if necessary.
        if((prevFCD16&0xff)<=(fcd16>>8)) {
            // proper order: prev tccc <= current lccc
            if((fcd16&0xff)<=1) {
                prevBoundary=src;
            }
            if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
                break;
            }
            prevFCD16=fcd16;
            continue;
        } else if(buffer==NULL) {
            return prevBoundary;  // quick check "no"
        } else {
            /*
             * Back out the part of the source that we copied or appended
             * already but is now going to be decomposed.
             * prevSrc is set to after what was copied/appended.
             */
            buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
            /*
             * Find the part of the source that needs to be decomposed,
             * up to the next safe boundary.
             */
            src=findNextFCDBoundary(src, limit);
            /*
             * The source text does not fulfill the conditions for FCD.
             * Decompose and reorder a limited piece of the text.
             */
            if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
                break;
            }
            prevBoundary=src;
            prevFCD16=0;
        }
    }
    return src;
}

// Appends [src..limit[ to the buffer, making the text FCD across the seam.
// If the buffer is not empty and src does not start at an FCD boundary,
// then the buffer suffix starting at its last FCD boundary is removed,
// a copy of that suffix is stored in safeMiddle, and the suffix together with
// the source prefix up to the first boundary is run through makeFCD() into the buffer.
// The remainder of the source is run through makeFCD() (doMakeFCD)
// or appended as is (!doMakeFCD).
// limit==NULL means that src is NUL-terminated.
void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
                                       UBool doMakeFCD,
                                       UnicodeString &safeMiddle,
                                       ReorderingBuffer &buffer,
                                       UErrorCode &errorCode) const {
    if(!buffer.isEmpty()) {
        const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
        if(src!=firstBoundaryInSrc) {
            const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
                                                                    buffer.getLimit());
            int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
            UnicodeString middle(lastBoundaryInDest, destSuffixLength);
            buffer.removeSuffix(destSuffixLength);
            // Save the removed destination suffix before the source prefix is added to it.
            safeMiddle=middle;
            middle.append(src, (int32_t)(firstBoundaryInSrc-src));
            const UChar *middleStart=middle.getBuffer();
            makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
            if(U_FAILURE(errorCode)) {
                return;
            }
            src=firstBoundaryInSrc;
        }
    }
    if(doMakeFCD) {
        makeFCD(src, limit, &buffer, errorCode);
    } else {
        if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
            limit=u_strchr(src, 0);
        }
        buffer.appendZeroCC(src, limit, errorCode);
    }
}

// Moves p backward over code points while their FCD value is >0xff (lccc!=0)
// and returns the resulting position, which is at or after start.
// NOTE(review): relies on previousFCD16() moving p back by one code point
// (presumably it takes p by reference) -- confirm in the header.
const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
    while(start<p && previousFCD16(start, p)>0xff) {}
    return p;
}

// Returns the start of the first code point in [p..limit[ whose FCD value
// is <=0xff (lccc==0), or the final value of p if there is none.
// NOTE(review): relies on nextFCD16() advancing p by one code point
// (presumably it takes p by reference) -- confirm in the header.
const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
    while(p<limit) {
        const UChar *codePointStart=p;
        if(nextFCD16(p, limit)<=0xff) {
            return codePointStart;
        }
    }
    return p;
}

// CanonicalIterator data -------------------------------------------------- ***

// Opens a new trie with initial value 0 and error value 0, and creates the
// vector of start sets with uprv_deleteUObject as its element deleter.
// Failures are reported via errorCode.
CanonIterData::CanonIterData(UErrorCode &errorCode) :
        trie(utrie2_open(0, 0, &errorCode)),
        canonStartSets(uprv_deleteUObject, NULL, errorCode) {}

// Closes the trie.
CanonIterData::~CanonIterData() {
    utrie2_close(trie);
}

// Records that the decomposition of origin starts with decompLead.
// The first non-zero origin for a decompLead is stored directly in the trie value.
// From the second origin on (or for origin U+0000), the trie value gets
// CANON_HAS_SET plus an index into canonStartSets, and the origins are
// collected in the UnicodeSet at that index.
void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
    uint32_t canonValue=utrie2_get32(trie, decompLead);
    if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
        // origin is the first character whose decomposition starts with
        // the character for which we are setting the value.
        utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);
    } else {
        // origin is not the first character, or it is U+0000.
        UnicodeSet *set;
        if((canonValue&CANON_HAS_SET)==0) {
            set=new UnicodeSet;
            if(set==NULL) {
                errorCode=U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            // Move the single origin stored so far (if any) into the new set,
            // and replace it in the trie value with the index of the set.
            UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
            canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
            utrie2_set32(trie, decompLead, canonValue, &errorCode);
            canonStartSets.addElement(set, errorCode);
            if(firstOrigin!=0) {
                set->add(firstOrigin);
            }
        } else {
            set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
        }
        set->add(origin);
    }
}

U_CDECL_BEGIN

// Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
1898 // context: the Normalizer2Impl 1899 static UBool U_CALLCONV 1900 enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { 1901 UErrorCode errorCode = U_ZERO_ERROR; 1902 if (value != 0) { 1903 Normalizer2Impl *impl = (Normalizer2Impl *)context; 1904 impl->makeCanonIterDataFromNorm16( 1905 start, end, (uint16_t)value, *impl->fCanonIterData, errorCode); 1906 } 1907 return U_SUCCESS(errorCode); 1908 } 1909 1910 1911 1912 // UInitOnce instantiation function for CanonIterData 1913 1914 static void U_CALLCONV 1915 initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) { 1916 U_ASSERT(impl->fCanonIterData == NULL); 1917 impl->fCanonIterData = new CanonIterData(errorCode); 1918 if (impl->fCanonIterData == NULL) { 1919 errorCode=U_MEMORY_ALLOCATION_ERROR; 1920 } 1921 if (U_SUCCESS(errorCode)) { 1922 utrie2_enum(impl->getNormTrie(), NULL, enumCIDRangeHandler, impl); 1923 utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode); 1924 } 1925 if (U_FAILURE(errorCode)) { 1926 delete impl->fCanonIterData; 1927 impl->fCanonIterData = NULL; 1928 } 1929 } 1930 1931 U_CDECL_END 1932 1933 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16, 1934 CanonIterData &newData, 1935 UErrorCode &errorCode) const { 1936 if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) { 1937 // Inert, or 2-way mapping (including Hangul syllable). 1938 // We do not write a canonStartSet for any yesNo character. 1939 // Composites from 2-way mappings are added at runtime from the 1940 // starter's compositions list, and the other characters in 1941 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are 1942 // "maybe" characters. 
1943 return; 1944 } 1945 for(UChar32 c=start; c<=end; ++c) { 1946 uint32_t oldValue=utrie2_get32(newData.trie, c); 1947 uint32_t newValue=oldValue; 1948 if(norm16>=minMaybeYes) { 1949 // not a segment starter if it occurs in a decomposition or has cc!=0 1950 newValue|=CANON_NOT_SEGMENT_STARTER; 1951 if(norm16<MIN_NORMAL_MAYBE_YES) { 1952 newValue|=CANON_HAS_COMPOSITIONS; 1953 } 1954 } else if(norm16<minYesNo) { 1955 newValue|=CANON_HAS_COMPOSITIONS; 1956 } else { 1957 // c has a one-way decomposition 1958 UChar32 c2=c; 1959 uint16_t norm16_2=norm16; 1960 while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) { 1961 c2=mapAlgorithmic(c2, norm16_2); 1962 norm16_2=getNorm16(c2); 1963 } 1964 if(minYesNo<=norm16_2 && norm16_2<limitNoNo) { 1965 // c decomposes, get everything from the variable-length extra data 1966 const uint16_t *mapping=getMapping(norm16_2); 1967 uint16_t firstUnit=*mapping; 1968 int32_t length=firstUnit&MAPPING_LENGTH_MASK; 1969 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1970 if(c==c2 && (*(mapping-1)&0xff)!=0) { 1971 newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0 1972 } 1973 } 1974 // Skip empty mappings (no characters in the decomposition). 1975 if(length!=0) { 1976 ++mapping; // skip over the firstUnit 1977 // add c to first code point's start set 1978 int32_t i=0; 1979 U16_NEXT_UNSAFE(mapping, i, c2); 1980 newData.addToStartSet(c, c2, errorCode); 1981 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a 1982 // one-way mapping. A 2-way mapping is possible here after 1983 // intermediate algorithmic mapping. 
1984 if(norm16_2>=minNoNo) { 1985 while(i<length) { 1986 U16_NEXT_UNSAFE(mapping, i, c2); 1987 uint32_t c2Value=utrie2_get32(newData.trie, c2); 1988 if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) { 1989 utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER, 1990 &errorCode); 1991 } 1992 } 1993 } 1994 } 1995 } else { 1996 // c decomposed to c2 algorithmically; c has cc==0 1997 newData.addToStartSet(c, c2, errorCode); 1998 } 1999 } 2000 if(newValue!=oldValue) { 2001 utrie2_set32(newData.trie, c, newValue, &errorCode); 2002 } 2003 } 2004 } 2005 2006 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const { 2007 // Logically const: Synchronized instantiation. 2008 Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this); 2009 umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode); 2010 return U_SUCCESS(errorCode); 2011 } 2012 2013 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const { 2014 return (int32_t)utrie2_get32(fCanonIterData->trie, c); 2015 } 2016 2017 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const { 2018 return *(const UnicodeSet *)fCanonIterData->canonStartSets[n]; 2019 } 2020 2021 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const { 2022 return getCanonValue(c)>=0; 2023 } 2024 2025 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const { 2026 int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER; 2027 if(canonValue==0) { 2028 return FALSE; 2029 } 2030 set.clear(); 2031 int32_t value=canonValue&CANON_VALUE_MASK; 2032 if((canonValue&CANON_HAS_SET)!=0) { 2033 set.addAll(getCanonStartSet(value)); 2034 } else if(value!=0) { 2035 set.add(value); 2036 } 2037 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) { 2038 uint16_t norm16=getNorm16(c); 2039 if(norm16==JAMO_L) { 2040 UChar32 syllable= 2041 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT); 2042 set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1); 2043 } else { 2044 
addComposites(getCompositionsList(norm16), set); 2045 } 2046 } 2047 return TRUE; 2048 } 2049 2050 U_NAMESPACE_END 2051 2052 // Normalizer2 data swapping ----------------------------------------------- *** 2053 2054 U_NAMESPACE_USE 2055 2056 U_CAPI int32_t U_EXPORT2 2057 unorm2_swap(const UDataSwapper *ds, 2058 const void *inData, int32_t length, void *outData, 2059 UErrorCode *pErrorCode) { 2060 const UDataInfo *pInfo; 2061 int32_t headerSize; 2062 2063 const uint8_t *inBytes; 2064 uint8_t *outBytes; 2065 2066 const int32_t *inIndexes; 2067 int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1]; 2068 2069 int32_t i, offset, nextOffset, size; 2070 2071 /* udata_swapDataHeader checks the arguments */ 2072 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 2073 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 2074 return 0; 2075 } 2076 2077 /* check data format and format version */ 2078 pInfo=(const UDataInfo *)((const char *)inData+4); 2079 if(!( 2080 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ 2081 pInfo->dataFormat[1]==0x72 && 2082 pInfo->dataFormat[2]==0x6d && 2083 pInfo->dataFormat[3]==0x32 && 2084 (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2) 2085 )) { 2086 udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n", 2087 pInfo->dataFormat[0], pInfo->dataFormat[1], 2088 pInfo->dataFormat[2], pInfo->dataFormat[3], 2089 pInfo->formatVersion[0]); 2090 *pErrorCode=U_UNSUPPORTED_ERROR; 2091 return 0; 2092 } 2093 2094 inBytes=(const uint8_t *)inData+headerSize; 2095 outBytes=(uint8_t *)outData+headerSize; 2096 2097 inIndexes=(const int32_t *)inBytes; 2098 2099 if(length>=0) { 2100 length-=headerSize; 2101 if(length<(int32_t)sizeof(indexes)) { 2102 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n", 2103 length); 2104 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2105 return 0; 2106 } 2107 } 2108 2109 /* read the first 
few indexes */ 2110 for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) { 2111 indexes[i]=udata_readInt32(ds, inIndexes[i]); 2112 } 2113 2114 /* get the total length of the data */ 2115 size=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; 2116 2117 if(length>=0) { 2118 if(length<size) { 2119 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n", 2120 length); 2121 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2122 return 0; 2123 } 2124 2125 /* copy the data for inaccessible bytes */ 2126 if(inBytes!=outBytes) { 2127 uprv_memcpy(outBytes, inBytes, size); 2128 } 2129 2130 offset=0; 2131 2132 /* swap the int32_t indexes[] */ 2133 nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]; 2134 ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode); 2135 offset=nextOffset; 2136 2137 /* swap the UTrie2 */ 2138 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]; 2139 utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); 2140 offset=nextOffset; 2141 2142 /* swap the uint16_t extraData[] */ 2143 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]; 2144 ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); 2145 offset=nextOffset; 2146 2147 /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */ 2148 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1]; 2149 offset=nextOffset; 2150 2151 U_ASSERT(offset==size); 2152 } 2153 2154 return headerSize+size; 2155 } 2156 2157 #endif // !UCONFIG_NO_NORMALIZATION 2158