1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2009-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: normalizer2impl.cpp 11 * encoding: US-ASCII 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2009nov22 16 * created by: Markus W. Scherer 17 */ 18 19 #include "unicode/utypes.h" 20 21 #if !UCONFIG_NO_NORMALIZATION 22 23 #include "unicode/normalizer2.h" 24 #include "unicode/udata.h" 25 #include "unicode/ustring.h" 26 #include "unicode/utf16.h" 27 #include "cmemory.h" 28 #include "mutex.h" 29 #include "normalizer2impl.h" 30 #include "putilimp.h" 31 #include "uassert.h" 32 #include "uset_imp.h" 33 #include "utrie2.h" 34 #include "uvector.h" 35 36 U_NAMESPACE_BEGIN 37 38 // ReorderingBuffer -------------------------------------------------------- *** 39 40 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) { 41 int32_t length=str.length(); 42 start=str.getBuffer(destCapacity); 43 if(start==NULL) { 44 // getBuffer() already did str.setToBogus() 45 errorCode=U_MEMORY_ALLOCATION_ERROR; 46 return FALSE; 47 } 48 limit=start+length; 49 remainingCapacity=str.getCapacity()-length; 50 reorderStart=start; 51 if(start==limit) { 52 lastCC=0; 53 } else { 54 setIterator(); 55 lastCC=previousCC(); 56 // Set reorderStart after the last code point with cc<=1 if there is one. 57 if(lastCC>1) { 58 while(previousCC()>1) {} 59 } 60 reorderStart=codePointLimit; 61 } 62 return TRUE; 63 } 64 65 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const { 66 int32_t length=(int32_t)(limit-start); 67 return 68 length==(int32_t)(otherLimit-otherStart) && 69 0==u_memcmp(start, otherStart, length); 70 } 71 72 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) { 73 if(remainingCapacity<2 && !resize(2, errorCode)) { 74 return FALSE; 75 } 76 if(lastCC<=cc || cc==0) { 77 limit[0]=U16_LEAD(c); 78 limit[1]=U16_TRAIL(c); 79 limit+=2; 80 lastCC=cc; 81 if(cc<=1) { 82 reorderStart=limit; 83 } 84 } else { 85 insert(c, cc); 86 } 87 remainingCapacity-=2; 88 return TRUE; 89 } 90 91 UBool ReorderingBuffer::append(const UChar *s, int32_t length, 92 uint8_t leadCC, uint8_t trailCC, 93 UErrorCode &errorCode) { 94 if(length==0) { 95 return TRUE; 96 } 97 if(remainingCapacity<length && !resize(length, errorCode)) { 98 return FALSE; 99 } 100 remainingCapacity-=length; 101 if(lastCC<=leadCC || leadCC==0) { 102 if(trailCC<=1) { 103 reorderStart=limit+length; 104 } else if(leadCC<=1) { 105 reorderStart=limit+1; // Ok if not a code point boundary. 106 } 107 const UChar *sLimit=s+length; 108 do { *limit++=*s++; } while(s!=sLimit); 109 lastCC=trailCC; 110 } else { 111 int32_t i=0; 112 UChar32 c; 113 U16_NEXT(s, i, length, c); 114 insert(c, leadCC); // insert first code point 115 while(i<length) { 116 U16_NEXT(s, i, length, c); 117 if(i<length) { 118 // s must be in NFD, otherwise we need to use getCC(). 119 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c)); 120 } else { 121 leadCC=trailCC; 122 } 123 append(c, leadCC, errorCode); 124 } 125 } 126 return TRUE; 127 } 128 129 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) { 130 int32_t cpLength=U16_LENGTH(c); 131 if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) { 132 return FALSE; 133 } 134 remainingCapacity-=cpLength; 135 if(cpLength==1) { 136 *limit++=(UChar)c; 137 } else { 138 limit[0]=U16_LEAD(c); 139 limit[1]=U16_TRAIL(c); 140 limit+=2; 141 } 142 lastCC=0; 143 reorderStart=limit; 144 return TRUE; 145 } 146 147 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) { 148 if(s==sLimit) { 149 return TRUE; 150 } 151 int32_t length=(int32_t)(sLimit-s); 152 if(remainingCapacity<length && !resize(length, errorCode)) { 153 return FALSE; 154 } 155 u_memcpy(limit, s, length); 156 limit+=length; 157 remainingCapacity-=length; 158 lastCC=0; 159 reorderStart=limit; 160 return TRUE; 161 } 162 163 void ReorderingBuffer::remove() { 164 reorderStart=limit=start; 165 remainingCapacity=str.getCapacity(); 166 lastCC=0; 167 } 168 169 void ReorderingBuffer::removeSuffix(int32_t suffixLength) { 170 if(suffixLength<(limit-start)) { 171 limit-=suffixLength; 172 remainingCapacity+=suffixLength; 173 } else { 174 limit=start; 175 remainingCapacity=str.getCapacity(); 176 } 177 lastCC=0; 178 reorderStart=limit; 179 } 180 181 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) { 182 int32_t reorderStartIndex=(int32_t)(reorderStart-start); 183 int32_t length=(int32_t)(limit-start); 184 str.releaseBuffer(length); 185 int32_t newCapacity=length+appendLength; 186 int32_t doubleCapacity=2*str.getCapacity(); 187 if(newCapacity<doubleCapacity) { 188 newCapacity=doubleCapacity; 189 } 190 if(newCapacity<256) { 191 newCapacity=256; 192 } 193 start=str.getBuffer(newCapacity); 194 if(start==NULL) { 195 // getBuffer() already did str.setToBogus() 196 errorCode=U_MEMORY_ALLOCATION_ERROR; 197 return FALSE; 198 } 199 reorderStart=start+reorderStartIndex; 200 limit=start+length; 201 remainingCapacity=str.getCapacity()-length; 202 return TRUE; 203 } 204 205 void ReorderingBuffer::skipPrevious() { 206 codePointLimit=codePointStart; 207 UChar c=*--codePointStart; 208 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) { 209 --codePointStart; 210 } 211 } 212 213 uint8_t ReorderingBuffer::previousCC() { 214 codePointLimit=codePointStart; 215 if(reorderStart>=codePointStart) { 216 return 0; 217 } 218 UChar32 c=*--codePointStart; 219 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) { 220 return 0; 221 } 222 223 UChar c2; 224 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) { 225 --codePointStart; 226 c=U16_GET_SUPPLEMENTARY(c2, c); 227 } 228 return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c)); 229 } 230 231 // Inserts c somewhere before the last character. 232 // Requires 0<cc<lastCC which implies reorderStart<limit. 233 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) { 234 for(setIterator(), skipPrevious(); previousCC()>cc;) {} 235 // insert c at codePointLimit, after the character with prevCC<=cc 236 UChar *q=limit; 237 UChar *r=limit+=U16_LENGTH(c); 238 do { 239 *--r=*--q; 240 } while(codePointLimit!=q); 241 writeCodePoint(q, c); 242 if(cc<=1) { 243 reorderStart=r; 244 } 245 } 246 247 // Normalizer2Impl --------------------------------------------------------- *** 248 249 struct CanonIterData : public UMemory { 250 CanonIterData(UErrorCode &errorCode); 251 ~CanonIterData(); 252 void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode); 253 UTrie2 *trie; 254 UVector canonStartSets; // contains UnicodeSet * 255 }; 256 257 Normalizer2Impl::~Normalizer2Impl() { 258 delete fCanonIterData; 259 } 260 261 void 262 Normalizer2Impl::init(const int32_t *inIndexes, const UTrie2 *inTrie, 263 const uint16_t *inExtraData, const uint8_t *inSmallFCD) { 264 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; 265 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; 266 267 minYesNo=inIndexes[IX_MIN_YES_NO]; 268 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; 269 minNoNo=inIndexes[IX_MIN_NO_NO]; 270 limitNoNo=inIndexes[IX_LIMIT_NO_NO]; 271 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; 272 273 normTrie=inTrie; 274 275 maybeYesCompositions=inExtraData; 276 extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes); 277 278 smallFCD=inSmallFCD; 279 280 // Build tccc180[]. 281 // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300. 282 uint8_t bits=0; 283 for(UChar c=0; c<0x180; bits>>=1) { 284 if((c&0xff)==0) { 285 bits=smallFCD[c>>8]; // one byte per 0x100 code points 286 } 287 if(bits&1) { 288 for(int i=0; i<0x20; ++i, ++c) { 289 tccc180[c]=(uint8_t)getFCD16FromNormData(c); 290 } 291 } else { 292 uprv_memset(tccc180+c, 0, 0x20); 293 c+=0x20; 294 } 295 } 296 } 297 298 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const { 299 UChar32 c; 300 if(cpStart==(cpLimit-1)) { 301 c=*cpStart; 302 } else { 303 c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]); 304 } 305 uint16_t prevNorm16=getNorm16(c); 306 if(prevNorm16<=minYesNo) { 307 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0 308 } else { 309 return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo 310 } 311 } 312 313 namespace { 314 315 class LcccContext { 316 public: 317 LcccContext(const Normalizer2Impl &ni, UnicodeSet &s) : impl(ni), set(s) {} 318 319 void handleRange(UChar32 start, UChar32 end, uint16_t norm16) { 320 if(impl.isAlgorithmicNoNo(norm16)) { 321 // Range of code points with same-norm16-value algorithmic decompositions. 322 // They might have different non-zero FCD16 values. 323 do { 324 uint16_t fcd16=impl.getFCD16(start); 325 if(fcd16>0xff) { set.add(start); } 326 } while(++start<=end); 327 } else { 328 uint16_t fcd16=impl.getFCD16(start); 329 if(fcd16>0xff) { set.add(start, end); } 330 } 331 } 332 333 private: 334 const Normalizer2Impl &impl; 335 UnicodeSet &set; 336 }; 337 338 struct PropertyStartsContext { 339 PropertyStartsContext(const Normalizer2Impl &ni, const USetAdder *adder) 340 : impl(ni), sa(adder) {} 341 342 const Normalizer2Impl &impl; 343 const USetAdder *sa; 344 }; 345 346 } // namespace 347 348 U_CDECL_BEGIN 349 350 static UBool U_CALLCONV 351 enumLcccRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { 352 ((LcccContext *)context)->handleRange(start, end, (uint16_t)value); 353 return TRUE; 354 } 355 356 static UBool U_CALLCONV 357 enumNorm16PropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { 358 /* add the start code point to the USet */ 359 const PropertyStartsContext *ctx=(const PropertyStartsContext *)context; 360 const USetAdder *sa=ctx->sa; 361 sa->add(sa->set, start); 362 if(start!=end && ctx->impl.isAlgorithmicNoNo((uint16_t)value)) { 363 // Range of code points with same-norm16-value algorithmic decompositions. 364 // They might have different non-zero FCD16 values. 365 uint16_t prevFCD16=ctx->impl.getFCD16(start); 366 while(++start<=end) { 367 uint16_t fcd16=ctx->impl.getFCD16(start); 368 if(fcd16!=prevFCD16) { 369 sa->add(sa->set, start); 370 prevFCD16=fcd16; 371 } 372 } 373 } 374 return TRUE; 375 } 376 377 static UBool U_CALLCONV 378 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) { 379 /* add the start code point to the USet */ 380 const USetAdder *sa=(const USetAdder *)context; 381 sa->add(sa->set, start); 382 return TRUE; 383 } 384 385 static uint32_t U_CALLCONV 386 segmentStarterMapper(const void * /*context*/, uint32_t value) { 387 return value&CANON_NOT_SEGMENT_STARTER; 388 } 389 390 U_CDECL_END 391 392 void 393 Normalizer2Impl::addLcccChars(UnicodeSet &set) const { 394 /* add the start code point of each same-value range of each trie */ 395 LcccContext context(*this, set); 396 utrie2_enum(normTrie, NULL, enumLcccRange, &context); 397 } 398 399 void 400 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const { 401 /* add the start code point of each same-value range of each trie */ 402 PropertyStartsContext context(*this, sa); 403 utrie2_enum(normTrie, NULL, enumNorm16PropertyStartsRange, &context); 404 405 /* add Hangul LV syllables and LV+1 because of skippables */ 406 for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) { 407 sa->add(sa->set, c); 408 sa->add(sa->set, c+1); 409 } 410 sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */ 411 } 412 413 void 414 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const { 415 /* add the start code point of each same-value range of the canonical iterator data trie */ 416 if(ensureCanonIterData(errorCode)) { 417 // currently only used for the SEGMENT_STARTER property 418 utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa); 419 } 420 } 421 422 const UChar * 423 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src, 424 UChar32 minNeedDataCP, 425 ReorderingBuffer *buffer, 426 UErrorCode &errorCode) const { 427 // Make some effort to support NUL-terminated strings reasonably. 428 // Take the part of the fast quick check loop that does not look up 429 // data and check the first part of the string. 430 // After this prefix, determine the string length to simplify the rest 431 // of the code. 432 const UChar *prevSrc=src; 433 UChar c; 434 while((c=*src++)<minNeedDataCP && c!=0) {} 435 // Back out the last character for full processing. 436 // Copy this prefix. 437 if(--src!=prevSrc) { 438 if(buffer!=NULL) { 439 buffer->appendZeroCC(prevSrc, src, errorCode); 440 } 441 } 442 return src; 443 } 444 445 UnicodeString & 446 Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest, 447 UErrorCode &errorCode) const { 448 if(U_FAILURE(errorCode)) { 449 dest.setToBogus(); 450 return dest; 451 } 452 const UChar *sArray=src.getBuffer(); 453 if(&dest==&src || sArray==NULL) { 454 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 455 dest.setToBogus(); 456 return dest; 457 } 458 decompose(sArray, sArray+src.length(), dest, src.length(), errorCode); 459 return dest; 460 } 461 462 void 463 Normalizer2Impl::decompose(const UChar *src, const UChar *limit, 464 UnicodeString &dest, 465 int32_t destLengthEstimate, 466 UErrorCode &errorCode) const { 467 if(destLengthEstimate<0 && limit!=NULL) { 468 destLengthEstimate=(int32_t)(limit-src); 469 } 470 dest.remove(); 471 ReorderingBuffer buffer(*this, dest); 472 if(buffer.init(destLengthEstimate, errorCode)) { 473 decompose(src, limit, &buffer, errorCode); 474 } 475 } 476 477 // Dual functionality: 478 // buffer!=NULL: normalize 479 // buffer==NULL: isNormalized/spanQuickCheckYes 480 const UChar * 481 Normalizer2Impl::decompose(const UChar *src, const UChar *limit, 482 ReorderingBuffer *buffer, 483 UErrorCode &errorCode) const { 484 UChar32 minNoCP=minDecompNoCP; 485 if(limit==NULL) { 486 src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode); 487 if(U_FAILURE(errorCode)) { 488 return src; 489 } 490 limit=u_strchr(src, 0); 491 } 492 493 const UChar *prevSrc; 494 UChar32 c=0; 495 uint16_t norm16=0; 496 497 // only for quick check 498 const UChar *prevBoundary=src; 499 uint8_t prevCC=0; 500 501 for(;;) { 502 // count code units below the minimum or with irrelevant data for the quick check 503 for(prevSrc=src; src!=limit;) { 504 if( (c=*src)<minNoCP || 505 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c)) 506 ) { 507 ++src; 508 } else if(!U16_IS_SURROGATE(c)) { 509 break; 510 } else { 511 UChar c2; 512 if(U16_IS_SURROGATE_LEAD(c)) { 513 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 514 c=U16_GET_SUPPLEMENTARY(c, c2); 515 } 516 } else /* trail surrogate */ { 517 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { 518 --src; 519 c=U16_GET_SUPPLEMENTARY(c2, c); 520 } 521 } 522 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) { 523 src+=U16_LENGTH(c); 524 } else { 525 break; 526 } 527 } 528 } 529 // copy these code units all at once 530 if(src!=prevSrc) { 531 if(buffer!=NULL) { 532 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) { 533 break; 534 } 535 } else { 536 prevCC=0; 537 prevBoundary=src; 538 } 539 } 540 if(src==limit) { 541 break; 542 } 543 544 // Check one above-minimum, relevant code point. 545 src+=U16_LENGTH(c); 546 if(buffer!=NULL) { 547 if(!decompose(c, norm16, *buffer, errorCode)) { 548 break; 549 } 550 } else { 551 if(isDecompYes(norm16)) { 552 uint8_t cc=getCCFromYesOrMaybe(norm16); 553 if(prevCC<=cc || cc==0) { 554 prevCC=cc; 555 if(cc<=1) { 556 prevBoundary=src; 557 } 558 continue; 559 } 560 } 561 return prevBoundary; // "no" or cc out of order 562 } 563 } 564 return src; 565 } 566 567 // Decompose a short piece of text which is likely to contain characters that 568 // fail the quick check loop and/or where the quick check loop's overhead 569 // is unlikely to be amortized. 570 // Called by the compose() and makeFCD() implementations. 571 UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit, 572 ReorderingBuffer &buffer, 573 UErrorCode &errorCode) const { 574 while(src<limit) { 575 UChar32 c; 576 uint16_t norm16; 577 UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16); 578 if(!decompose(c, norm16, buffer, errorCode)) { 579 return FALSE; 580 } 581 } 582 return TRUE; 583 } 584 585 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16, 586 ReorderingBuffer &buffer, 587 UErrorCode &errorCode) const { 588 // Only loops for 1:1 algorithmic mappings. 589 for(;;) { 590 // get the decomposition and the lead and trail cc's 591 if(isDecompYes(norm16)) { 592 // c does not decompose 593 return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode); 594 } else if(isHangul(norm16)) { 595 // Hangul syllable: decompose algorithmically 596 UChar jamos[3]; 597 return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode); 598 } else if(isDecompNoAlgorithmic(norm16)) { 599 c=mapAlgorithmic(c, norm16); 600 norm16=getNorm16(c); 601 } else { 602 // c decomposes, get everything from the variable-length extra data 603 const uint16_t *mapping=getMapping(norm16); 604 uint16_t firstUnit=*mapping; 605 int32_t length=firstUnit&MAPPING_LENGTH_MASK; 606 uint8_t leadCC, trailCC; 607 trailCC=(uint8_t)(firstUnit>>8); 608 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { 609 leadCC=(uint8_t)(*(mapping-1)>>8); 610 } else { 611 leadCC=0; 612 } 613 return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode); 614 } 615 } 616 } 617 618 const UChar * 619 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const { 620 const UChar *decomp=NULL; 621 uint16_t norm16; 622 for(;;) { 623 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 624 // c does not decompose 625 return decomp; 626 } else if(isHangul(norm16)) { 627 // Hangul syllable: decompose algorithmically 628 length=Hangul::decompose(c, buffer); 629 return buffer; 630 } else if(isDecompNoAlgorithmic(norm16)) { 631 c=mapAlgorithmic(c, norm16); 632 decomp=buffer; 633 length=0; 634 U16_APPEND_UNSAFE(buffer, length, c); 635 } else { 636 // c decomposes, get everything from the variable-length extra data 637 const uint16_t *mapping=getMapping(norm16); 638 length=*mapping&MAPPING_LENGTH_MASK; 639 return (const UChar *)mapping+1; 640 } 641 } 642 } 643 644 // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1 645 // so that a raw mapping fits that consists of one unit ("rm0") 646 // plus all but the first two code units of the normal mapping. 647 // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK. 648 const UChar * 649 Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const { 650 // We do not loop in this method because an algorithmic mapping itself 651 // becomes a final result rather than having to be decomposed recursively. 652 uint16_t norm16; 653 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 654 // c does not decompose 655 return NULL; 656 } else if(isHangul(norm16)) { 657 // Hangul syllable: decompose algorithmically 658 Hangul::getRawDecomposition(c, buffer); 659 length=2; 660 return buffer; 661 } else if(isDecompNoAlgorithmic(norm16)) { 662 c=mapAlgorithmic(c, norm16); 663 length=0; 664 U16_APPEND_UNSAFE(buffer, length, c); 665 return buffer; 666 } else { 667 // c decomposes, get everything from the variable-length extra data 668 const uint16_t *mapping=getMapping(norm16); 669 uint16_t firstUnit=*mapping; 670 int32_t mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping 671 if(firstUnit&MAPPING_HAS_RAW_MAPPING) { 672 // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word. 673 // Bit 7=MAPPING_HAS_CCC_LCCC_WORD 674 const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1; 675 uint16_t rm0=*rawMapping; 676 if(rm0<=MAPPING_LENGTH_MASK) { 677 length=rm0; 678 return (const UChar *)rawMapping-rm0; 679 } else { 680 // Copy the normal mapping and replace its first two code units with rm0. 681 buffer[0]=(UChar)rm0; 682 u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2); 683 length=mLength-1; 684 return buffer; 685 } 686 } else { 687 length=mLength; 688 return (const UChar *)mapping+1; 689 } 690 } 691 } 692 693 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit, 694 UBool doDecompose, 695 UnicodeString &safeMiddle, 696 ReorderingBuffer &buffer, 697 UErrorCode &errorCode) const { 698 buffer.copyReorderableSuffixTo(safeMiddle); 699 if(doDecompose) { 700 decompose(src, limit, &buffer, errorCode); 701 return; 702 } 703 // Just merge the strings at the boundary. 704 ForwardUTrie2StringIterator iter(normTrie, src, limit); 705 uint8_t firstCC, prevCC, cc; 706 firstCC=prevCC=cc=getCC(iter.next16()); 707 while(cc!=0) { 708 prevCC=cc; 709 cc=getCC(iter.next16()); 710 }; 711 if(limit==NULL) { // appendZeroCC() needs limit!=NULL 712 limit=u_strchr(iter.codePointStart, 0); 713 } 714 715 if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) { 716 buffer.appendZeroCC(iter.codePointStart, limit, errorCode); 717 } 718 } 719 720 // Note: hasDecompBoundary() could be implemented as aliases to 721 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter() 722 // at the cost of building the FCD trie for a decomposition normalizer. 723 UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const { 724 for(;;) { 725 if(c<minDecompNoCP) { 726 return TRUE; 727 } 728 uint16_t norm16=getNorm16(c); 729 if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) { 730 return TRUE; 731 } else if(norm16>MIN_NORMAL_MAYBE_YES) { 732 return FALSE; // ccc!=0 733 } else if(isDecompNoAlgorithmic(norm16)) { 734 c=mapAlgorithmic(c, norm16); 735 } else { 736 // c decomposes, get everything from the variable-length extra data 737 const uint16_t *mapping=getMapping(norm16); 738 uint16_t firstUnit=*mapping; 739 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 740 return FALSE; 741 } 742 if(!before) { 743 // decomp after-boundary: same as hasFCDBoundaryAfter(), 744 // fcd16<=1 || trailCC==0 745 if(firstUnit>0x1ff) { 746 return FALSE; // trailCC>1 747 } 748 if(firstUnit<=0xff) { 749 return TRUE; // trailCC==0 750 } 751 // if(trailCC==1) test leadCC==0, same as checking for before-boundary 752 } 753 // TRUE if leadCC==0 (hasFCDBoundaryBefore()) 754 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0; 755 } 756 } 757 } 758 759 /* 760 * Finds the recomposition result for 761 * a forward-combining "lead" character, 762 * specified with a pointer to its compositions list, 763 * and a backward-combining "trail" character. 764 * 765 * If the lead and trail characters combine, then this function returns 766 * the following "compositeAndFwd" value: 767 * Bits 21..1 composite character 768 * Bit 0 set if the composite is a forward-combining starter 769 * otherwise it returns -1. 770 * 771 * The compositions list has (trail, compositeAndFwd) pair entries, 772 * encoded as either pairs or triples of 16-bit units. 773 * The last entry has the high bit of its first unit set. 774 * 775 * The list is sorted by ascending trail characters (there are no duplicates). 776 * A linear search is used. 777 * 778 * See normalizer2impl.h for a more detailed description 779 * of the compositions list format. 780 */ 781 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) { 782 uint16_t key1, firstUnit; 783 if(trail<COMP_1_TRAIL_LIMIT) { 784 // trail character is 0..33FF 785 // result entry may have 2 or 3 units 786 key1=(uint16_t)(trail<<1); 787 while(key1>(firstUnit=*list)) { 788 list+=2+(firstUnit&COMP_1_TRIPLE); 789 } 790 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 791 if(firstUnit&COMP_1_TRIPLE) { 792 return ((int32_t)list[1]<<16)|list[2]; 793 } else { 794 return list[1]; 795 } 796 } 797 } else { 798 // trail character is 3400..10FFFF 799 // result entry has 3 units 800 key1=(uint16_t)(COMP_1_TRAIL_LIMIT+ 801 (((trail>>COMP_1_TRAIL_SHIFT))& 802 ~COMP_1_TRIPLE)); 803 uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT); 804 uint16_t secondUnit; 805 for(;;) { 806 if(key1>(firstUnit=*list)) { 807 list+=2+(firstUnit&COMP_1_TRIPLE); 808 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 809 if(key2>(secondUnit=list[1])) { 810 if(firstUnit&COMP_1_LAST_TUPLE) { 811 break; 812 } else { 813 list+=3; 814 } 815 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { 816 return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2]; 817 } else { 818 break; 819 } 820 } else { 821 break; 822 } 823 } 824 } 825 return -1; 826 } 827 828 /** 829 * @param list some character's compositions list 830 * @param set recursively receives the composites from these compositions 831 */ 832 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const { 833 uint16_t firstUnit; 834 int32_t compositeAndFwd; 835 do { 836 firstUnit=*list; 837 if((firstUnit&COMP_1_TRIPLE)==0) { 838 compositeAndFwd=list[1]; 839 list+=2; 840 } else { 841 compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2]; 842 list+=3; 843 } 844 UChar32 composite=compositeAndFwd>>1; 845 if((compositeAndFwd&1)!=0) { 846 addComposites(getCompositionsListForComposite(getNorm16(composite)), set); 847 } 848 set.add(composite); 849 } while((firstUnit&COMP_1_LAST_TUPLE)==0); 850 } 851 852 /* 853 * Recomposes the buffer text starting at recomposeStartIndex 854 * (which is in NFD - decomposed and canonically ordered), 855 * and truncates the buffer contents. 856 * 857 * Note that recomposition never lengthens the text: 858 * Any character consists of either one or two code units; 859 * a composition may contain at most one more code unit than the original starter, 860 * while the combining mark that is removed has at least one code unit. 861 */ 862 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex, 863 UBool onlyContiguous) const { 864 UChar *p=buffer.getStart()+recomposeStartIndex; 865 UChar *limit=buffer.getLimit(); 866 if(p==limit) { 867 return; 868 } 869 870 UChar *starter, *pRemove, *q, *r; 871 const uint16_t *compositionsList; 872 UChar32 c, compositeAndFwd; 873 uint16_t norm16; 874 uint8_t cc, prevCC; 875 UBool starterIsSupplementary; 876 877 // Some of the following variables are not used until we have a forward-combining starter 878 // and are only initialized now to avoid compiler warnings. 879 compositionsList=NULL; // used as indicator for whether we have a forward-combining starter 880 starter=NULL; 881 starterIsSupplementary=FALSE; 882 prevCC=0; 883 884 for(;;) { 885 UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16); 886 cc=getCCFromYesOrMaybe(norm16); 887 if( // this character combines backward and 888 isMaybe(norm16) && 889 // we have seen a starter that combines forward and 890 compositionsList!=NULL && 891 // the backward-combining character is not blocked 892 (prevCC<cc || prevCC==0) 893 ) { 894 if(isJamoVT(norm16)) { 895 // c is a Jamo V/T, see if we can compose it with the previous character. 896 if(c<Hangul::JAMO_T_BASE) { 897 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 898 UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE); 899 if(prev<Hangul::JAMO_L_COUNT) { 900 pRemove=p-1; 901 UChar syllable=(UChar) 902 (Hangul::HANGUL_BASE+ 903 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))* 904 Hangul::JAMO_T_COUNT); 905 UChar t; 906 if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) { 907 ++p; 908 syllable+=t; // The next character was a Jamo T. 909 } 910 *starter=syllable; 911 // remove the Jamo V/T 912 q=pRemove; 913 r=p; 914 while(r<limit) { 915 *q++=*r++; 916 } 917 limit=q; 918 p=pRemove; 919 } 920 } 921 /* 922 * No "else" for Jamo T: 923 * Since the input is in NFD, there are no Hangul LV syllables that 924 * a Jamo T could combine with. 925 * All Jamo Ts are combined above when handling Jamo Vs. 926 */ 927 if(p==limit) { 928 break; 929 } 930 compositionsList=NULL; 931 continue; 932 } else if((compositeAndFwd=combine(compositionsList, c))>=0) { 933 // The starter and the combining mark (c) do combine. 934 UChar32 composite=compositeAndFwd>>1; 935 936 // Replace the starter with the composite, remove the combining mark. 937 pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the combining mark 938 if(starterIsSupplementary) { 939 if(U_IS_SUPPLEMENTARY(composite)) { 940 // both are supplementary 941 starter[0]=U16_LEAD(composite); 942 starter[1]=U16_TRAIL(composite); 943 } else { 944 *starter=(UChar)composite; 945 // The composite is shorter than the starter, 946 // move the intermediate characters forward one. 947 starterIsSupplementary=FALSE; 948 q=starter+1; 949 r=q+1; 950 while(r<pRemove) { 951 *q++=*r++; 952 } 953 --pRemove; 954 } 955 } else if(U_IS_SUPPLEMENTARY(composite)) { 956 // The composite is longer than the starter, 957 // move the intermediate characters back one. 958 starterIsSupplementary=TRUE; 959 ++starter; // temporarily increment for the loop boundary 960 q=pRemove; 961 r=++pRemove; 962 while(starter<q) { 963 *--r=*--q; 964 } 965 *starter=U16_TRAIL(composite); 966 *--starter=U16_LEAD(composite); // undo the temporary increment 967 } else { 968 // both are on the BMP 969 *starter=(UChar)composite; 970 } 971 972 /* remove the combining mark by moving the following text over it */ 973 if(pRemove<p) { 974 q=pRemove; 975 r=p; 976 while(r<limit) { 977 *q++=*r++; 978 } 979 limit=q; 980 p=pRemove; 981 } 982 // Keep prevCC because we removed the combining mark. 983 984 if(p==limit) { 985 break; 986 } 987 // Is the composite a starter that combines forward? 988 if(compositeAndFwd&1) { 989 compositionsList= 990 getCompositionsListForComposite(getNorm16(composite)); 991 } else { 992 compositionsList=NULL; 993 } 994 995 // We combined; continue with looking for compositions. 996 continue; 997 } 998 } 999 1000 // no combination this time 1001 prevCC=cc; 1002 if(p==limit) { 1003 break; 1004 } 1005 1006 // If c did not combine, then check if it is a starter. 1007 if(cc==0) { 1008 // Found a new starter. 1009 if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) { 1010 // It may combine with something, prepare for it. 1011 if(U_IS_BMP(c)) { 1012 starterIsSupplementary=FALSE; 1013 starter=p-1; 1014 } else { 1015 starterIsSupplementary=TRUE; 1016 starter=p-2; 1017 } 1018 } 1019 } else if(onlyContiguous) { 1020 // FCC: no discontiguous compositions; any intervening character blocks. 1021 compositionsList=NULL; 1022 } 1023 } 1024 buffer.setReorderingLimit(limit); 1025 } 1026 1027 UChar32 1028 Normalizer2Impl::composePair(UChar32 a, UChar32 b) const { 1029 uint16_t norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0 1030 const uint16_t *list; 1031 if(isInert(norm16)) { 1032 return U_SENTINEL; 1033 } else if(norm16<minYesNoMappingsOnly) { 1034 if(isJamoL(norm16)) { 1035 b-=Hangul::JAMO_V_BASE; 1036 if(0<=b && b<Hangul::JAMO_V_COUNT) { 1037 return 1038 (Hangul::HANGUL_BASE+ 1039 ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)* 1040 Hangul::JAMO_T_COUNT); 1041 } else { 1042 return U_SENTINEL; 1043 } 1044 } else if(isHangul(norm16)) { 1045 b-=Hangul::JAMO_T_BASE; 1046 if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) { // not b==0! 1047 return a+b; 1048 } else { 1049 return U_SENTINEL; 1050 } 1051 } else { 1052 // 'a' has a compositions list in extraData 1053 list=extraData+norm16; 1054 if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list 1055 list+= // mapping pointer 1056 1+ // +1 to skip the first unit with the mapping lenth 1057 (*list&MAPPING_LENGTH_MASK); // + mapping length 1058 } 1059 } 1060 } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) { 1061 return U_SENTINEL; 1062 } else { 1063 list=maybeYesCompositions+norm16-minMaybeYes; 1064 } 1065 if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b 1066 return U_SENTINEL; 1067 } 1068 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC 1069 return combine(list, b)>>1; 1070 #else 1071 int32_t compositeAndFwd=combine(list, b); 1072 return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL; 1073 #endif 1074 } 1075 1076 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. 1077 // doCompose: normalize 1078 // !doCompose: isNormalized (buffer must be empty and initialized) 1079 UBool 1080 Normalizer2Impl::compose(const UChar *src, const UChar *limit, 1081 UBool onlyContiguous, 1082 UBool doCompose, 1083 ReorderingBuffer &buffer, 1084 UErrorCode &errorCode) const { 1085 /* 1086 * prevBoundary points to the last character before the current one 1087 * that has a composition boundary before it with ccc==0 and quick check "yes". 1088 * Keeping track of prevBoundary saves us looking for a composition boundary 1089 * when we find a "no" or "maybe". 1090 * 1091 * When we back out from prevSrc back to prevBoundary, 1092 * then we also remove those same characters (which had been simply copied 1093 * or canonically-order-inserted) from the ReorderingBuffer. 1094 * Therefore, at all times, the [prevBoundary..prevSrc[ source units 1095 * must correspond 1:1 to destination units at the end of the destination buffer. 1096 */ 1097 const UChar *prevBoundary=src; 1098 UChar32 minNoMaybeCP=minCompNoMaybeCP; 1099 if(limit==NULL) { 1100 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, 1101 doCompose ? &buffer : NULL, 1102 errorCode); 1103 if(U_FAILURE(errorCode)) { 1104 return FALSE; 1105 } 1106 if(prevBoundary<src) { 1107 // Set prevBoundary to the last character in the prefix. 1108 prevBoundary=src-1; 1109 } 1110 limit=u_strchr(src, 0); 1111 } 1112 1113 const UChar *prevSrc; 1114 UChar32 c=0; 1115 uint16_t norm16=0; 1116 1117 // only for isNormalized 1118 uint8_t prevCC=0; 1119 1120 for(;;) { 1121 // count code units below the minimum or with irrelevant data for the quick check 1122 for(prevSrc=src; src!=limit;) { 1123 if( (c=*src)<minNoMaybeCP || 1124 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c)) 1125 ) { 1126 ++src; 1127 } else if(!U16_IS_SURROGATE(c)) { 1128 break; 1129 } else { 1130 UChar c2; 1131 if(U16_IS_SURROGATE_LEAD(c)) { 1132 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 1133 c=U16_GET_SUPPLEMENTARY(c, c2); 1134 } 1135 } else /* trail surrogate */ { 1136 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { 1137 --src; 1138 c=U16_GET_SUPPLEMENTARY(c2, c); 1139 } 1140 } 1141 if(isCompYesAndZeroCC(norm16=getNorm16(c))) { 1142 src+=U16_LENGTH(c); 1143 } else { 1144 break; 1145 } 1146 } 1147 } 1148 // copy these code units all at once 1149 if(src!=prevSrc) { 1150 if(doCompose) { 1151 if(!buffer.appendZeroCC(prevSrc, src, errorCode)) { 1152 break; 1153 } 1154 } else { 1155 prevCC=0; 1156 } 1157 if(src==limit) { 1158 break; 1159 } 1160 // Set prevBoundary to the last character in the quick check loop. 1161 prevBoundary=src-1; 1162 if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary && 1163 U16_IS_LEAD(*(prevBoundary-1)) 1164 ) { 1165 --prevBoundary; 1166 } 1167 // The start of the current character (c). 1168 prevSrc=src; 1169 } else if(src==limit) { 1170 break; 1171 } 1172 1173 src+=U16_LENGTH(c); 1174 /* 1175 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1176 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) 1177 * or has ccc!=0. 1178 * Check for Jamo V/T, then for regular characters. 1179 * c is not a Hangul syllable or Jamo L because those have "yes" properties. 1180 */ 1181 if(isJamoVT(norm16) && prevBoundary!=prevSrc) { 1182 UChar prev=*(prevSrc-1); 1183 UBool needToDecompose=FALSE; 1184 if(c<Hangul::JAMO_T_BASE) { 1185 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 1186 prev=(UChar)(prev-Hangul::JAMO_L_BASE); 1187 if(prev<Hangul::JAMO_L_COUNT) { 1188 if(!doCompose) { 1189 return FALSE; 1190 } 1191 UChar syllable=(UChar) 1192 (Hangul::HANGUL_BASE+ 1193 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))* 1194 Hangul::JAMO_T_COUNT); 1195 UChar t; 1196 if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) { 1197 ++src; 1198 syllable+=t; // The next character was a Jamo T. 1199 prevBoundary=src; 1200 buffer.setLastChar(syllable); 1201 continue; 1202 } 1203 // If we see L+V+x where x!=T then we drop to the slow path, 1204 // decompose and recompose. 1205 // This is to deal with NFKC finding normal L and V but a 1206 // compatibility variant of a T. We need to either fully compose that 1207 // combination here (which would complicate the code and may not work 1208 // with strange custom data) or use the slow path -- or else our replacing 1209 // two input characters (L+V) with one output character (LV syllable) 1210 // would violate the invariant that [prevBoundary..prevSrc[ has the same 1211 // length as what we appended to the buffer since prevBoundary. 1212 needToDecompose=TRUE; 1213 } 1214 } else if(Hangul::isHangulWithoutJamoT(prev)) { 1215 // c is a Jamo Trailing consonant, 1216 // compose with previous Hangul LV that does not contain a Jamo T. 1217 if(!doCompose) { 1218 return FALSE; 1219 } 1220 buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE)); 1221 prevBoundary=src; 1222 continue; 1223 } 1224 if(!needToDecompose) { 1225 // The Jamo V/T did not compose into a Hangul syllable. 1226 if(doCompose) { 1227 if(!buffer.appendBMP((UChar)c, 0, errorCode)) { 1228 break; 1229 } 1230 } else { 1231 prevCC=0; 1232 } 1233 continue; 1234 } 1235 } 1236 /* 1237 * Source buffer pointers: 1238 * 1239 * all done quick check current char not yet 1240 * "yes" but (c) processed 1241 * may combine 1242 * forward 1243 * [-------------[-------------[-------------[-------------[ 1244 * | | | | | 1245 * orig. src prevBoundary prevSrc src limit 1246 * 1247 * 1248 * Destination buffer pointers inside the ReorderingBuffer: 1249 * 1250 * all done might take not filled yet 1251 * characters for 1252 * reordering 1253 * [-------------[-------------[-------------[ 1254 * | | | | 1255 * start reorderStart limit | 1256 * +remainingCap.+ 1257 */ 1258 if(norm16>=MIN_YES_YES_WITH_CC) { 1259 uint8_t cc=(uint8_t)norm16; // cc!=0 1260 if( onlyContiguous && // FCC 1261 (doCompose ? buffer.getLastCC() : prevCC)==0 && 1262 prevBoundary<prevSrc && 1263 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that 1264 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) 1265 // passed the quick check "yes && ccc==0" test. 1266 // Check whether the last character was a "yesYes" or a "yesNo". 1267 // If a "yesNo", then we get its trailing ccc from its 1268 // mapping and check for canonical order. 1269 // All other cases are ok. 1270 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc 1271 ) { 1272 // Fails FCD test, need to decompose and contiguously recompose. 1273 if(!doCompose) { 1274 return FALSE; 1275 } 1276 } else if(doCompose) { 1277 if(!buffer.append(c, cc, errorCode)) { 1278 break; 1279 } 1280 continue; 1281 } else if(prevCC<=cc) { 1282 prevCC=cc; 1283 continue; 1284 } else { 1285 return FALSE; 1286 } 1287 } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) { 1288 return FALSE; 1289 } 1290 1291 /* 1292 * Find appropriate boundaries around this character, 1293 * decompose the source text from between the boundaries, 1294 * and recompose it. 1295 * 1296 * We may need to remove the last few characters from the ReorderingBuffer 1297 * to account for source text that was copied or appended 1298 * but needs to take part in the recomposition. 1299 */ 1300 1301 /* 1302 * Find the last composition boundary in [prevBoundary..src[. 1303 * It is either the decomposition of the current character (at prevSrc), 1304 * or prevBoundary. 1305 */ 1306 if(hasCompBoundaryBefore(c, norm16)) { 1307 prevBoundary=prevSrc; 1308 } else if(doCompose) { 1309 buffer.removeSuffix((int32_t)(prevSrc-prevBoundary)); 1310 } 1311 1312 // Find the next composition boundary in [src..limit[ - 1313 // modifies src to point to the next starter. 1314 src=(UChar *)findNextCompBoundary(src, limit); 1315 1316 // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it. 1317 int32_t recomposeStartIndex=buffer.length(); 1318 if(!decomposeShort(prevBoundary, src, buffer, errorCode)) { 1319 break; 1320 } 1321 recompose(buffer, recomposeStartIndex, onlyContiguous); 1322 if(!doCompose) { 1323 if(!buffer.equals(prevBoundary, src)) { 1324 return FALSE; 1325 } 1326 buffer.remove(); 1327 prevCC=0; 1328 } 1329 1330 // Move to the next starter. We never need to look back before this point again. 1331 prevBoundary=src; 1332 } 1333 return TRUE; 1334 } 1335 1336 // Very similar to compose(): Make the same changes in both places if relevant. 1337 // pQCResult==NULL: spanQuickCheckYes 1338 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES) 1339 const UChar * 1340 Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit, 1341 UBool onlyContiguous, 1342 UNormalizationCheckResult *pQCResult) const { 1343 /* 1344 * prevBoundary points to the last character before the current one 1345 * that has a composition boundary before it with ccc==0 and quick check "yes". 1346 */ 1347 const UChar *prevBoundary=src; 1348 UChar32 minNoMaybeCP=minCompNoMaybeCP; 1349 if(limit==NULL) { 1350 UErrorCode errorCode=U_ZERO_ERROR; 1351 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode); 1352 if(prevBoundary<src) { 1353 // Set prevBoundary to the last character in the prefix. 1354 prevBoundary=src-1; 1355 } 1356 limit=u_strchr(src, 0); 1357 } 1358 1359 const UChar *prevSrc; 1360 UChar32 c=0; 1361 uint16_t norm16=0; 1362 uint8_t prevCC=0; 1363 1364 for(;;) { 1365 // count code units below the minimum or with irrelevant data for the quick check 1366 for(prevSrc=src;;) { 1367 if(src==limit) { 1368 return src; 1369 } 1370 if( (c=*src)<minNoMaybeCP || 1371 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c)) 1372 ) { 1373 ++src; 1374 } else if(!U16_IS_SURROGATE(c)) { 1375 break; 1376 } else { 1377 UChar c2; 1378 if(U16_IS_SURROGATE_LEAD(c)) { 1379 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 1380 c=U16_GET_SUPPLEMENTARY(c, c2); 1381 } 1382 } else /* trail surrogate */ { 1383 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { 1384 --src; 1385 c=U16_GET_SUPPLEMENTARY(c2, c); 1386 } 1387 } 1388 if(isCompYesAndZeroCC(norm16=getNorm16(c))) { 1389 src+=U16_LENGTH(c); 1390 } else { 1391 break; 1392 } 1393 } 1394 } 1395 if(src!=prevSrc) { 1396 // Set prevBoundary to the last character in the quick check loop. 1397 prevBoundary=src-1; 1398 if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary && 1399 U16_IS_LEAD(*(prevBoundary-1)) 1400 ) { 1401 --prevBoundary; 1402 } 1403 prevCC=0; 1404 // The start of the current character (c). 1405 prevSrc=src; 1406 } 1407 1408 src+=U16_LENGTH(c); 1409 /* 1410 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1411 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) 1412 * or has ccc!=0. 1413 */ 1414 if(isMaybeOrNonZeroCC(norm16)) { 1415 uint8_t cc=getCCFromYesOrMaybe(norm16); 1416 if( onlyContiguous && // FCC 1417 cc!=0 && 1418 prevCC==0 && 1419 prevBoundary<prevSrc && 1420 // prevCC==0 && prevBoundary<prevSrc tell us that 1421 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) 1422 // passed the quick check "yes && ccc==0" test. 1423 // Check whether the last character was a "yesYes" or a "yesNo". 1424 // If a "yesNo", then we get its trailing ccc from its 1425 // mapping and check for canonical order. 1426 // All other cases are ok. 1427 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc 1428 ) { 1429 // Fails FCD test. 1430 } else if(prevCC<=cc || cc==0) { 1431 prevCC=cc; 1432 if(norm16<MIN_YES_YES_WITH_CC) { 1433 if(pQCResult!=NULL) { 1434 *pQCResult=UNORM_MAYBE; 1435 } else { 1436 return prevBoundary; 1437 } 1438 } 1439 continue; 1440 } 1441 } 1442 if(pQCResult!=NULL) { 1443 *pQCResult=UNORM_NO; 1444 } 1445 return prevBoundary; 1446 } 1447 } 1448 1449 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit, 1450 UBool doCompose, 1451 UBool onlyContiguous, 1452 UnicodeString &safeMiddle, 1453 ReorderingBuffer &buffer, 1454 UErrorCode &errorCode) const { 1455 if(!buffer.isEmpty()) { 1456 const UChar *firstStarterInSrc=findNextCompBoundary(src, limit); 1457 if(src!=firstStarterInSrc) { 1458 const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(), 1459 buffer.getLimit()); 1460 int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest); 1461 UnicodeString middle(lastStarterInDest, destSuffixLength); 1462 buffer.removeSuffix(destSuffixLength); 1463 safeMiddle=middle; 1464 middle.append(src, (int32_t)(firstStarterInSrc-src)); 1465 const UChar *middleStart=middle.getBuffer(); 1466 compose(middleStart, middleStart+middle.length(), onlyContiguous, 1467 TRUE, buffer, errorCode); 1468 if(U_FAILURE(errorCode)) { 1469 return; 1470 } 1471 src=firstStarterInSrc; 1472 } 1473 } 1474 if(doCompose) { 1475 compose(src, limit, onlyContiguous, TRUE, buffer, errorCode); 1476 } else { 1477 if(limit==NULL) { // appendZeroCC() needs limit!=NULL 1478 limit=u_strchr(src, 0); 1479 } 1480 buffer.appendZeroCC(src, limit, errorCode); 1481 } 1482 } 1483 1484 /** 1485 * Does c have a composition boundary before it? 1486 * True if its decomposition begins with a character that has 1487 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). 1488 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes 1489 * (isCompYesAndZeroCC()) so we need not decompose. 1490 */ 1491 UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const { 1492 for(;;) { 1493 if(isCompYesAndZeroCC(norm16)) { 1494 return TRUE; 1495 } else if(isMaybeOrNonZeroCC(norm16)) { 1496 return FALSE; 1497 } else if(isDecompNoAlgorithmic(norm16)) { 1498 c=mapAlgorithmic(c, norm16); 1499 norm16=getNorm16(c); 1500 } else { 1501 // c decomposes, get everything from the variable-length extra data 1502 const uint16_t *mapping=getMapping(norm16); 1503 uint16_t firstUnit=*mapping; 1504 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 1505 return FALSE; 1506 } 1507 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) { 1508 return FALSE; // non-zero leadCC 1509 } 1510 int32_t i=1; // skip over the firstUnit 1511 UChar32 c; 1512 U16_NEXT_UNSAFE(mapping, i, c); 1513 return isCompYesAndZeroCC(getNorm16(c)); 1514 } 1515 } 1516 } 1517 1518 UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const { 1519 for(;;) { 1520 uint16_t norm16=getNorm16(c); 1521 if(isInert(norm16)) { 1522 return TRUE; 1523 } else if(norm16<=minYesNo) { 1524 // Hangul: norm16==minYesNo 1525 // Hangul LVT has a boundary after it. 1526 // Hangul LV and non-inert yesYes characters combine forward. 1527 return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c); 1528 } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) { 1529 return FALSE; 1530 } else if(isDecompNoAlgorithmic(norm16)) { 1531 c=mapAlgorithmic(c, norm16); 1532 } else { 1533 // c decomposes, get everything from the variable-length extra data. 1534 // If testInert, then c must be a yesNo character which has lccc=0, 1535 // otherwise it could be a noNo. 1536 const uint16_t *mapping=getMapping(norm16); 1537 uint16_t firstUnit=*mapping; 1538 // TRUE if 1539 // not MAPPING_NO_COMP_BOUNDARY_AFTER 1540 // (which is set if 1541 // c is not deleted, and 1542 // it and its decomposition do not combine forward, and it has a starter) 1543 // and if FCC then trailCC<=1 1544 return 1545 (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 && 1546 (!onlyContiguous || firstUnit<=0x1ff); 1547 } 1548 } 1549 } 1550 1551 const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const { 1552 BackwardUTrie2StringIterator iter(normTrie, start, p); 1553 uint16_t norm16; 1554 do { 1555 norm16=iter.previous16(); 1556 } while(!hasCompBoundaryBefore(iter.codePoint, norm16)); 1557 // We could also test hasCompBoundaryAfter() and return iter.codePointLimit, 1558 // but that's probably not worth the extra cost. 1559 return iter.codePointStart; 1560 } 1561 1562 const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const { 1563 ForwardUTrie2StringIterator iter(normTrie, p, limit); 1564 uint16_t norm16; 1565 do { 1566 norm16=iter.next16(); 1567 } while(!hasCompBoundaryBefore(iter.codePoint, norm16)); 1568 return iter.codePointStart; 1569 } 1570 1571 // Note: normalizer2impl.cpp r30982 (2011-nov-27) 1572 // still had getFCDTrie() which built and cached an FCD trie. 1573 // That provided faster access to FCD data than getFCD16FromNormData() 1574 // but required synchronization and consumed some 10kB of heap memory 1575 // in any process that uses FCD (e.g., via collation). 1576 // tccc180[] and smallFCD[] are intended to help with any loss of performance, 1577 // at least for Latin & CJK. 1578 1579 // Gets the FCD value from the regular normalization data. 1580 uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const { 1581 // Only loops for 1:1 algorithmic mappings. 1582 for(;;) { 1583 uint16_t norm16=getNorm16(c); 1584 if(norm16<=minYesNo) { 1585 // no decomposition or Hangul syllable, all zeros 1586 return 0; 1587 } else if(norm16>=MIN_NORMAL_MAYBE_YES) { 1588 // combining mark 1589 norm16&=0xff; 1590 return norm16|(norm16<<8); 1591 } else if(norm16>=minMaybeYes) { 1592 return 0; 1593 } else if(isDecompNoAlgorithmic(norm16)) { 1594 c=mapAlgorithmic(c, norm16); 1595 } else { 1596 // c decomposes, get everything from the variable-length extra data 1597 const uint16_t *mapping=getMapping(norm16); 1598 uint16_t firstUnit=*mapping; 1599 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 1600 // A character that is deleted (maps to an empty string) must 1601 // get the worst-case lccc and tccc values because arbitrary 1602 // characters on both sides will become adjacent. 1603 return 0x1ff; 1604 } else { 1605 norm16=firstUnit>>8; // tccc 1606 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { 1607 norm16|=*(mapping-1)&0xff00; // lccc 1608 } 1609 return norm16; 1610 } 1611 } 1612 } 1613 } 1614 1615 // Dual functionality: 1616 // buffer!=NULL: normalize 1617 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 1618 const UChar * 1619 Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit, 1620 ReorderingBuffer *buffer, 1621 UErrorCode &errorCode) const { 1622 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. 1623 // Similar to the prevBoundary in the compose() implementation. 1624 const UChar *prevBoundary=src; 1625 int32_t prevFCD16=0; 1626 if(limit==NULL) { 1627 src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode); 1628 if(U_FAILURE(errorCode)) { 1629 return src; 1630 } 1631 if(prevBoundary<src) { 1632 prevBoundary=src; 1633 // We know that the previous character's lccc==0. 1634 // Fetching the fcd16 value was deferred for this below-U+0300 code point. 1635 prevFCD16=getFCD16(*(src-1)); 1636 if(prevFCD16>1) { 1637 --prevBoundary; 1638 } 1639 } 1640 limit=u_strchr(src, 0); 1641 } 1642 1643 // Note: In this function we use buffer->appendZeroCC() because we track 1644 // the lead and trail combining classes here, rather than leaving it to 1645 // the ReorderingBuffer. 1646 // The exception is the call to decomposeShort() which uses the buffer 1647 // in the normal way. 1648 1649 const UChar *prevSrc; 1650 UChar32 c=0; 1651 uint16_t fcd16=0; 1652 1653 for(;;) { 1654 // count code units with lccc==0 1655 for(prevSrc=src; src!=limit;) { 1656 if((c=*src)<MIN_CCC_LCCC_CP) { 1657 prevFCD16=~c; 1658 ++src; 1659 } else if(!singleLeadMightHaveNonZeroFCD16(c)) { 1660 prevFCD16=0; 1661 ++src; 1662 } else { 1663 if(U16_IS_SURROGATE(c)) { 1664 UChar c2; 1665 if(U16_IS_SURROGATE_LEAD(c)) { 1666 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 1667 c=U16_GET_SUPPLEMENTARY(c, c2); 1668 } 1669 } else /* trail surrogate */ { 1670 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { 1671 --src; 1672 c=U16_GET_SUPPLEMENTARY(c2, c); 1673 } 1674 } 1675 } 1676 if((fcd16=getFCD16FromNormData(c))<=0xff) { 1677 prevFCD16=fcd16; 1678 src+=U16_LENGTH(c); 1679 } else { 1680 break; 1681 } 1682 } 1683 } 1684 // copy these code units all at once 1685 if(src!=prevSrc) { 1686 if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) { 1687 break; 1688 } 1689 if(src==limit) { 1690 break; 1691 } 1692 prevBoundary=src; 1693 // We know that the previous character's lccc==0. 1694 if(prevFCD16<0) { 1695 // Fetching the fcd16 value was deferred for this below-U+0300 code point. 1696 UChar32 prev=~prevFCD16; 1697 prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev); 1698 if(prevFCD16>1) { 1699 --prevBoundary; 1700 } 1701 } else { 1702 const UChar *p=src-1; 1703 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) { 1704 --p; 1705 // Need to fetch the previous character's FCD value because 1706 // prevFCD16 was just for the trail surrogate code point. 1707 prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1])); 1708 // Still known to have lccc==0 because its lead surrogate unit had lccc==0. 1709 } 1710 if(prevFCD16>1) { 1711 prevBoundary=p; 1712 } 1713 } 1714 // The start of the current character (c). 1715 prevSrc=src; 1716 } else if(src==limit) { 1717 break; 1718 } 1719 1720 src+=U16_LENGTH(c); 1721 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. 1722 // Check for proper order, and decompose locally if necessary. 1723 if((prevFCD16&0xff)<=(fcd16>>8)) { 1724 // proper order: prev tccc <= current lccc 1725 if((fcd16&0xff)<=1) { 1726 prevBoundary=src; 1727 } 1728 if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) { 1729 break; 1730 } 1731 prevFCD16=fcd16; 1732 continue; 1733 } else if(buffer==NULL) { 1734 return prevBoundary; // quick check "no" 1735 } else { 1736 /* 1737 * Back out the part of the source that we copied or appended 1738 * already but is now going to be decomposed. 1739 * prevSrc is set to after what was copied/appended. 1740 */ 1741 buffer->removeSuffix((int32_t)(prevSrc-prevBoundary)); 1742 /* 1743 * Find the part of the source that needs to be decomposed, 1744 * up to the next safe boundary. 1745 */ 1746 src=findNextFCDBoundary(src, limit); 1747 /* 1748 * The source text does not fulfill the conditions for FCD. 1749 * Decompose and reorder a limited piece of the text. 1750 */ 1751 if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) { 1752 break; 1753 } 1754 prevBoundary=src; 1755 prevFCD16=0; 1756 } 1757 } 1758 return src; 1759 } 1760 1761 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit, 1762 UBool doMakeFCD, 1763 UnicodeString &safeMiddle, 1764 ReorderingBuffer &buffer, 1765 UErrorCode &errorCode) const { 1766 if(!buffer.isEmpty()) { 1767 const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit); 1768 if(src!=firstBoundaryInSrc) { 1769 const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(), 1770 buffer.getLimit()); 1771 int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest); 1772 UnicodeString middle(lastBoundaryInDest, destSuffixLength); 1773 buffer.removeSuffix(destSuffixLength); 1774 safeMiddle=middle; 1775 middle.append(src, (int32_t)(firstBoundaryInSrc-src)); 1776 const UChar *middleStart=middle.getBuffer(); 1777 makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode); 1778 if(U_FAILURE(errorCode)) { 1779 return; 1780 } 1781 src=firstBoundaryInSrc; 1782 } 1783 } 1784 if(doMakeFCD) { 1785 makeFCD(src, limit, &buffer, errorCode); 1786 } else { 1787 if(limit==NULL) { // appendZeroCC() needs limit!=NULL 1788 limit=u_strchr(src, 0); 1789 } 1790 buffer.appendZeroCC(src, limit, errorCode); 1791 } 1792 } 1793 1794 const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const { 1795 while(start<p && previousFCD16(start, p)>0xff) {} 1796 return p; 1797 } 1798 1799 const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const { 1800 while(p<limit) { 1801 const UChar *codePointStart=p; 1802 if(nextFCD16(p, limit)<=0xff) { 1803 return codePointStart; 1804 } 1805 } 1806 return p; 1807 } 1808 1809 // CanonicalIterator data -------------------------------------------------- *** 1810 1811 CanonIterData::CanonIterData(UErrorCode &errorCode) : 1812 trie(utrie2_open(0, 0, &errorCode)), 1813 canonStartSets(uprv_deleteUObject, NULL, errorCode) {} 1814 1815 CanonIterData::~CanonIterData() { 1816 utrie2_close(trie); 1817 } 1818 1819 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) { 1820 uint32_t canonValue=utrie2_get32(trie, decompLead); 1821 if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) { 1822 // origin is the first character whose decomposition starts with 1823 // the character for which we are setting the value. 1824 utrie2_set32(trie, decompLead, canonValue|origin, &errorCode); 1825 } else { 1826 // origin is not the first character, or it is U+0000. 1827 UnicodeSet *set; 1828 if((canonValue&CANON_HAS_SET)==0) { 1829 set=new UnicodeSet; 1830 if(set==NULL) { 1831 errorCode=U_MEMORY_ALLOCATION_ERROR; 1832 return; 1833 } 1834 UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK); 1835 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size(); 1836 utrie2_set32(trie, decompLead, canonValue, &errorCode); 1837 canonStartSets.addElement(set, errorCode); 1838 if(firstOrigin!=0) { 1839 set->add(firstOrigin); 1840 } 1841 } else { 1842 set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)]; 1843 } 1844 set->add(origin); 1845 } 1846 } 1847 1848 U_CDECL_BEGIN 1849 1850 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters. 1851 // context: the Normalizer2Impl 1852 static UBool U_CALLCONV 1853 enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { 1854 UErrorCode errorCode = U_ZERO_ERROR; 1855 if (value != 0) { 1856 Normalizer2Impl *impl = (Normalizer2Impl *)context; 1857 impl->makeCanonIterDataFromNorm16( 1858 start, end, (uint16_t)value, *impl->fCanonIterData, errorCode); 1859 } 1860 return U_SUCCESS(errorCode); 1861 } 1862 1863 1864 1865 // UInitOnce instantiation function for CanonIterData 1866 1867 static void U_CALLCONV 1868 initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) { 1869 U_ASSERT(impl->fCanonIterData == NULL); 1870 impl->fCanonIterData = new CanonIterData(errorCode); 1871 if (impl->fCanonIterData == NULL) { 1872 errorCode=U_MEMORY_ALLOCATION_ERROR; 1873 } 1874 if (U_SUCCESS(errorCode)) { 1875 utrie2_enum(impl->getNormTrie(), NULL, enumCIDRangeHandler, impl); 1876 utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode); 1877 } 1878 if (U_FAILURE(errorCode)) { 1879 delete impl->fCanonIterData; 1880 impl->fCanonIterData = NULL; 1881 } 1882 } 1883 1884 U_CDECL_END 1885 1886 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16, 1887 CanonIterData &newData, 1888 UErrorCode &errorCode) const { 1889 if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) { 1890 // Inert, or 2-way mapping (including Hangul syllable). 1891 // We do not write a canonStartSet for any yesNo character. 1892 // Composites from 2-way mappings are added at runtime from the 1893 // starter's compositions list, and the other characters in 1894 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are 1895 // "maybe" characters. 1896 return; 1897 } 1898 for(UChar32 c=start; c<=end; ++c) { 1899 uint32_t oldValue=utrie2_get32(newData.trie, c); 1900 uint32_t newValue=oldValue; 1901 if(norm16>=minMaybeYes) { 1902 // not a segment starter if it occurs in a decomposition or has cc!=0 1903 newValue|=CANON_NOT_SEGMENT_STARTER; 1904 if(norm16<MIN_NORMAL_MAYBE_YES) { 1905 newValue|=CANON_HAS_COMPOSITIONS; 1906 } 1907 } else if(norm16<minYesNo) { 1908 newValue|=CANON_HAS_COMPOSITIONS; 1909 } else { 1910 // c has a one-way decomposition 1911 UChar32 c2=c; 1912 uint16_t norm16_2=norm16; 1913 while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) { 1914 c2=mapAlgorithmic(c2, norm16_2); 1915 norm16_2=getNorm16(c2); 1916 } 1917 if(minYesNo<=norm16_2 && norm16_2<limitNoNo) { 1918 // c decomposes, get everything from the variable-length extra data 1919 const uint16_t *mapping=getMapping(norm16_2); 1920 uint16_t firstUnit=*mapping; 1921 int32_t length=firstUnit&MAPPING_LENGTH_MASK; 1922 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1923 if(c==c2 && (*(mapping-1)&0xff)!=0) { 1924 newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0 1925 } 1926 } 1927 // Skip empty mappings (no characters in the decomposition). 1928 if(length!=0) { 1929 ++mapping; // skip over the firstUnit 1930 // add c to first code point's start set 1931 int32_t i=0; 1932 U16_NEXT_UNSAFE(mapping, i, c2); 1933 newData.addToStartSet(c, c2, errorCode); 1934 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a 1935 // one-way mapping. A 2-way mapping is possible here after 1936 // intermediate algorithmic mapping. 1937 if(norm16_2>=minNoNo) { 1938 while(i<length) { 1939 U16_NEXT_UNSAFE(mapping, i, c2); 1940 uint32_t c2Value=utrie2_get32(newData.trie, c2); 1941 if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) { 1942 utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER, 1943 &errorCode); 1944 } 1945 } 1946 } 1947 } 1948 } else { 1949 // c decomposed to c2 algorithmically; c has cc==0 1950 newData.addToStartSet(c, c2, errorCode); 1951 } 1952 } 1953 if(newValue!=oldValue) { 1954 utrie2_set32(newData.trie, c, newValue, &errorCode); 1955 } 1956 } 1957 } 1958 1959 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const { 1960 // Logically const: Synchronized instantiation. 1961 Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this); 1962 umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode); 1963 return U_SUCCESS(errorCode); 1964 } 1965 1966 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const { 1967 return (int32_t)utrie2_get32(fCanonIterData->trie, c); 1968 } 1969 1970 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const { 1971 return *(const UnicodeSet *)fCanonIterData->canonStartSets[n]; 1972 } 1973 1974 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const { 1975 return getCanonValue(c)>=0; 1976 } 1977 1978 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const { 1979 int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER; 1980 if(canonValue==0) { 1981 return FALSE; 1982 } 1983 set.clear(); 1984 int32_t value=canonValue&CANON_VALUE_MASK; 1985 if((canonValue&CANON_HAS_SET)!=0) { 1986 set.addAll(getCanonStartSet(value)); 1987 } else if(value!=0) { 1988 set.add(value); 1989 } 1990 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) { 1991 uint16_t norm16=getNorm16(c); 1992 if(norm16==JAMO_L) { 1993 UChar32 syllable= 1994 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT); 1995 set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1); 1996 } else { 1997 addComposites(getCompositionsList(norm16), set); 1998 } 1999 } 2000 return TRUE; 2001 } 2002 2003 U_NAMESPACE_END 2004 2005 // Normalizer2 data swapping ----------------------------------------------- *** 2006 2007 U_NAMESPACE_USE 2008 2009 U_CAPI int32_t U_EXPORT2 2010 unorm2_swap(const UDataSwapper *ds, 2011 const void *inData, int32_t length, void *outData, 2012 UErrorCode *pErrorCode) { 2013 const UDataInfo *pInfo; 2014 int32_t headerSize; 2015 2016 const uint8_t *inBytes; 2017 uint8_t *outBytes; 2018 2019 const int32_t *inIndexes; 2020 int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1]; 2021 2022 int32_t i, offset, nextOffset, size; 2023 2024 /* udata_swapDataHeader checks the arguments */ 2025 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 2026 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 2027 return 0; 2028 } 2029 2030 /* check data format and format version */ 2031 pInfo=(const UDataInfo *)((const char *)inData+4); 2032 if(!( 2033 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ 2034 pInfo->dataFormat[1]==0x72 && 2035 pInfo->dataFormat[2]==0x6d && 2036 pInfo->dataFormat[3]==0x32 && 2037 (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2) 2038 )) { 2039 udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n", 2040 pInfo->dataFormat[0], pInfo->dataFormat[1], 2041 pInfo->dataFormat[2], pInfo->dataFormat[3], 2042 pInfo->formatVersion[0]); 2043 *pErrorCode=U_UNSUPPORTED_ERROR; 2044 return 0; 2045 } 2046 2047 inBytes=(const uint8_t *)inData+headerSize; 2048 outBytes=(uint8_t *)outData+headerSize; 2049 2050 inIndexes=(const int32_t *)inBytes; 2051 2052 if(length>=0) { 2053 length-=headerSize; 2054 if(length<(int32_t)sizeof(indexes)) { 2055 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n", 2056 length); 2057 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2058 return 0; 2059 } 2060 } 2061 2062 /* read the first few indexes */ 2063 for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) { 2064 indexes[i]=udata_readInt32(ds, inIndexes[i]); 2065 } 2066 2067 /* get the total length of the data */ 2068 size=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; 2069 2070 if(length>=0) { 2071 if(length<size) { 2072 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n", 2073 length); 2074 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2075 return 0; 2076 } 2077 2078 /* copy the data for inaccessible bytes */ 2079 if(inBytes!=outBytes) { 2080 uprv_memcpy(outBytes, inBytes, size); 2081 } 2082 2083 offset=0; 2084 2085 /* swap the int32_t indexes[] */ 2086 nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]; 2087 ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode); 2088 offset=nextOffset; 2089 2090 /* swap the UTrie2 */ 2091 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]; 2092 utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); 2093 offset=nextOffset; 2094 2095 /* swap the uint16_t extraData[] */ 2096 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]; 2097 ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); 2098 offset=nextOffset; 2099 2100 /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */ 2101 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1]; 2102 offset=nextOffset; 2103 2104 U_ASSERT(offset==size); 2105 } 2106 2107 return headerSize+size; 2108 } 2109 2110 #endif // !UCONFIG_NO_NORMALIZATION 2111