1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2009-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: normalizer2.cpp 11 * encoding: US-ASCII 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2009nov22 16 * created by: Markus W. Scherer 17 */ 18 19 #include "unicode/utypes.h" 20 21 #if !UCONFIG_NO_NORMALIZATION 22 23 #include "unicode/normalizer2.h" 24 #include "unicode/unistr.h" 25 #include "unicode/unorm.h" 26 #include "cstring.h" 27 #include "mutex.h" 28 #include "norm2allmodes.h" 29 #include "normalizer2impl.h" 30 #include "uassert.h" 31 #include "ucln_cmn.h" 32 33 using icu::Normalizer2Impl; 34 35 // NFC/NFD data machine-generated by gennorm2 --csource 36 #define INCLUDED_FROM_NORMALIZER2_CPP 37 #include "norm2_nfc_data.h" 38 39 U_NAMESPACE_BEGIN 40 41 // Public API dispatch via Normalizer2 subclasses -------------------------- *** 42 43 Normalizer2::~Normalizer2() {} 44 45 UBool 46 Normalizer2::getRawDecomposition(UChar32, UnicodeString &) const { 47 return FALSE; 48 } 49 50 UChar32 51 Normalizer2::composePair(UChar32, UChar32) const { 52 return U_SENTINEL; 53 } 54 55 uint8_t 56 Normalizer2::getCombiningClass(UChar32 /*c*/) const { 57 return 0; 58 } 59 60 // Normalizer2 implementation for the old UNORM_NONE. 61 class NoopNormalizer2 : public Normalizer2 { 62 virtual ~NoopNormalizer2(); 63 64 virtual UnicodeString & 65 normalize(const UnicodeString &src, 66 UnicodeString &dest, 67 UErrorCode &errorCode) const { 68 if(U_SUCCESS(errorCode)) { 69 if(&dest!=&src) { 70 dest=src; 71 } else { 72 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 73 } 74 } 75 return dest; 76 } 77 virtual UnicodeString & 78 normalizeSecondAndAppend(UnicodeString &first, 79 const UnicodeString &second, 80 UErrorCode &errorCode) const { 81 if(U_SUCCESS(errorCode)) { 82 if(&first!=&second) { 83 first.append(second); 84 } else { 85 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 86 } 87 } 88 return first; 89 } 90 virtual UnicodeString & 91 append(UnicodeString &first, 92 const UnicodeString &second, 93 UErrorCode &errorCode) const { 94 if(U_SUCCESS(errorCode)) { 95 if(&first!=&second) { 96 first.append(second); 97 } else { 98 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 99 } 100 } 101 return first; 102 } 103 virtual UBool 104 getDecomposition(UChar32, UnicodeString &) const { 105 return FALSE; 106 } 107 // No need to override the default getRawDecomposition(). 108 virtual UBool 109 isNormalized(const UnicodeString &, UErrorCode &) const { 110 return TRUE; 111 } 112 virtual UNormalizationCheckResult 113 quickCheck(const UnicodeString &, UErrorCode &) const { 114 return UNORM_YES; 115 } 116 virtual int32_t 117 spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const { 118 return s.length(); 119 } 120 virtual UBool hasBoundaryBefore(UChar32) const { return TRUE; } 121 virtual UBool hasBoundaryAfter(UChar32) const { return TRUE; } 122 virtual UBool isInert(UChar32) const { return TRUE; } 123 }; 124 125 NoopNormalizer2::~NoopNormalizer2() {} 126 127 Normalizer2WithImpl::~Normalizer2WithImpl() {} 128 129 DecomposeNormalizer2::~DecomposeNormalizer2() {} 130 131 ComposeNormalizer2::~ComposeNormalizer2() {} 132 133 FCDNormalizer2::~FCDNormalizer2() {} 134 135 // instance cache ---------------------------------------------------------- *** 136 137 Norm2AllModes::~Norm2AllModes() { 138 delete impl; 139 } 140 141 Norm2AllModes * 142 Norm2AllModes::createInstance(Normalizer2Impl *impl, UErrorCode &errorCode) { 143 if(U_FAILURE(errorCode)) { 144 delete impl; 145 return NULL; 146 } 147 Norm2AllModes *allModes=new Norm2AllModes(impl); 148 if(allModes==NULL) { 149 errorCode=U_MEMORY_ALLOCATION_ERROR; 150 delete impl; 151 return NULL; 152 } 153 return allModes; 154 } 155 156 Norm2AllModes * 157 Norm2AllModes::createNFCInstance(UErrorCode &errorCode) { 158 if(U_FAILURE(errorCode)) { 159 return NULL; 160 } 161 Normalizer2Impl *impl=new Normalizer2Impl; 162 if(impl==NULL) { 163 errorCode=U_MEMORY_ALLOCATION_ERROR; 164 return NULL; 165 } 166 impl->init(norm2_nfc_data_indexes, &norm2_nfc_data_trie, 167 norm2_nfc_data_extraData, norm2_nfc_data_smallFCD); 168 return createInstance(impl, errorCode); 169 } 170 171 U_CDECL_BEGIN 172 static UBool U_CALLCONV uprv_normalizer2_cleanup(); 173 U_CDECL_END 174 175 static Norm2AllModes *nfcSingleton; 176 static Normalizer2 *noopSingleton; 177 178 static icu::UInitOnce nfcInitOnce = U_INITONCE_INITIALIZER; 179 static icu::UInitOnce noopInitOnce = U_INITONCE_INITIALIZER; 180 181 // UInitOnce singleton initialization functions 182 static void U_CALLCONV initNFCSingleton(UErrorCode &errorCode) { 183 nfcSingleton=Norm2AllModes::createNFCInstance(errorCode); 184 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup); 185 } 186 187 static void U_CALLCONV initNoopSingleton(UErrorCode &errorCode) { 188 if(U_FAILURE(errorCode)) { 189 return; 190 } 191 noopSingleton=new NoopNormalizer2; 192 if(noopSingleton==NULL) { 193 errorCode=U_MEMORY_ALLOCATION_ERROR; 194 return; 195 } 196 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup); 197 } 198 199 U_CDECL_BEGIN 200 201 static UBool U_CALLCONV uprv_normalizer2_cleanup() { 202 delete nfcSingleton; 203 nfcSingleton = NULL; 204 delete noopSingleton; 205 noopSingleton = NULL; 206 nfcInitOnce.reset(); 207 noopInitOnce.reset(); 208 return TRUE; 209 } 210 211 U_CDECL_END 212 213 const Norm2AllModes * 214 Norm2AllModes::getNFCInstance(UErrorCode &errorCode) { 215 if(U_FAILURE(errorCode)) { return NULL; } 216 umtx_initOnce(nfcInitOnce, &initNFCSingleton, errorCode); 217 return nfcSingleton; 218 } 219 220 const Normalizer2 * 221 Normalizer2::getNFCInstance(UErrorCode &errorCode) { 222 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode); 223 return allModes!=NULL ? &allModes->comp : NULL; 224 } 225 226 const Normalizer2 * 227 Normalizer2::getNFDInstance(UErrorCode &errorCode) { 228 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode); 229 return allModes!=NULL ? &allModes->decomp : NULL; 230 } 231 232 const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) { 233 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode); 234 return allModes!=NULL ? &allModes->fcd : NULL; 235 } 236 237 const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) { 238 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode); 239 return allModes!=NULL ? &allModes->fcc : NULL; 240 } 241 242 const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) { 243 if(U_FAILURE(errorCode)) { return NULL; } 244 umtx_initOnce(noopInitOnce, &initNoopSingleton, errorCode); 245 return noopSingleton; 246 } 247 248 const Normalizer2Impl * 249 Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) { 250 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode); 251 return allModes!=NULL ? allModes->impl : NULL; 252 } 253 254 const Normalizer2Impl * 255 Normalizer2Factory::getImpl(const Normalizer2 *norm2) { 256 return &((Normalizer2WithImpl *)norm2)->impl; 257 } 258 259 U_NAMESPACE_END 260 261 // C API ------------------------------------------------------------------- *** 262 263 U_NAMESPACE_USE 264 265 U_CAPI const UNormalizer2 * U_EXPORT2 266 unorm2_getNFCInstance(UErrorCode *pErrorCode) { 267 return (const UNormalizer2 *)Normalizer2::getNFCInstance(*pErrorCode); 268 } 269 270 U_CAPI const UNormalizer2 * U_EXPORT2 271 unorm2_getNFDInstance(UErrorCode *pErrorCode) { 272 return (const UNormalizer2 *)Normalizer2::getNFDInstance(*pErrorCode); 273 } 274 275 U_CAPI void U_EXPORT2 276 unorm2_close(UNormalizer2 *norm2) { 277 delete (Normalizer2 *)norm2; 278 } 279 280 U_CAPI int32_t U_EXPORT2 281 unorm2_normalize(const UNormalizer2 *norm2, 282 const UChar *src, int32_t length, 283 UChar *dest, int32_t capacity, 284 UErrorCode *pErrorCode) { 285 if(U_FAILURE(*pErrorCode)) { 286 return 0; 287 } 288 if( (src==NULL ? length!=0 : length<-1) || 289 (dest==NULL ? capacity!=0 : capacity<0) || 290 (src==dest && src!=NULL) 291 ) { 292 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 293 return 0; 294 } 295 UnicodeString destString(dest, 0, capacity); 296 // length==0: Nothing to do, and n2wi->normalize(NULL, NULL, buffer, ...) would crash. 297 if(length!=0) { 298 const Normalizer2 *n2=(const Normalizer2 *)norm2; 299 const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2); 300 if(n2wi!=NULL) { 301 // Avoid duplicate argument checking and support NUL-terminated src. 302 ReorderingBuffer buffer(n2wi->impl, destString); 303 if(buffer.init(length, *pErrorCode)) { 304 n2wi->normalize(src, length>=0 ? src+length : NULL, buffer, *pErrorCode); 305 } 306 } else { 307 UnicodeString srcString(length<0, src, length); 308 n2->normalize(srcString, destString, *pErrorCode); 309 } 310 } 311 return destString.extract(dest, capacity, *pErrorCode); 312 } 313 314 static int32_t 315 normalizeSecondAndAppend(const UNormalizer2 *norm2, 316 UChar *first, int32_t firstLength, int32_t firstCapacity, 317 const UChar *second, int32_t secondLength, 318 UBool doNormalize, 319 UErrorCode *pErrorCode) { 320 if(U_FAILURE(*pErrorCode)) { 321 return 0; 322 } 323 if( (second==NULL ? secondLength!=0 : secondLength<-1) || 324 (first==NULL ? (firstCapacity!=0 || firstLength!=0) : 325 (firstCapacity<0 || firstLength<-1)) || 326 (first==second && first!=NULL) 327 ) { 328 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 329 return 0; 330 } 331 UnicodeString firstString(first, firstLength, firstCapacity); 332 firstLength=firstString.length(); // In case it was -1. 333 // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(NULL, NULL, buffer, ...) would crash. 334 if(secondLength!=0) { 335 const Normalizer2 *n2=(const Normalizer2 *)norm2; 336 const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2); 337 if(n2wi!=NULL) { 338 // Avoid duplicate argument checking and support NUL-terminated src. 339 UnicodeString safeMiddle; 340 { 341 ReorderingBuffer buffer(n2wi->impl, firstString); 342 if(buffer.init(firstLength+secondLength+1, *pErrorCode)) { // destCapacity>=-1 343 n2wi->normalizeAndAppend(second, secondLength>=0 ? second+secondLength : NULL, 344 doNormalize, safeMiddle, buffer, *pErrorCode); 345 } 346 } // The ReorderingBuffer destructor finalizes firstString. 347 if(U_FAILURE(*pErrorCode) || firstString.length()>firstCapacity) { 348 // Restore the modified suffix of the first string. 349 // This does not restore first[] array contents between firstLength and firstCapacity. 350 // (That might be uninitialized memory, as far as we know.) 351 if(first!=NULL) { /* don't dereference NULL */ 352 safeMiddle.extract(0, 0x7fffffff, first+firstLength-safeMiddle.length()); 353 if(firstLength<firstCapacity) { 354 first[firstLength]=0; // NUL-terminate in case it was originally. 355 } 356 } 357 } 358 } else { 359 UnicodeString secondString(secondLength<0, second, secondLength); 360 if(doNormalize) { 361 n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode); 362 } else { 363 n2->append(firstString, secondString, *pErrorCode); 364 } 365 } 366 } 367 return firstString.extract(first, firstCapacity, *pErrorCode); 368 } 369 370 U_CAPI int32_t U_EXPORT2 371 unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2, 372 UChar *first, int32_t firstLength, int32_t firstCapacity, 373 const UChar *second, int32_t secondLength, 374 UErrorCode *pErrorCode) { 375 return normalizeSecondAndAppend(norm2, 376 first, firstLength, firstCapacity, 377 second, secondLength, 378 TRUE, pErrorCode); 379 } 380 381 U_CAPI int32_t U_EXPORT2 382 unorm2_append(const UNormalizer2 *norm2, 383 UChar *first, int32_t firstLength, int32_t firstCapacity, 384 const UChar *second, int32_t secondLength, 385 UErrorCode *pErrorCode) { 386 return normalizeSecondAndAppend(norm2, 387 first, firstLength, firstCapacity, 388 second, secondLength, 389 FALSE, pErrorCode); 390 } 391 392 U_CAPI int32_t U_EXPORT2 393 unorm2_getDecomposition(const UNormalizer2 *norm2, 394 UChar32 c, UChar *decomposition, int32_t capacity, 395 UErrorCode *pErrorCode) { 396 if(U_FAILURE(*pErrorCode)) { 397 return 0; 398 } 399 if(decomposition==NULL ? capacity!=0 : capacity<0) { 400 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 401 return 0; 402 } 403 UnicodeString destString(decomposition, 0, capacity); 404 if(reinterpret_cast<const Normalizer2 *>(norm2)->getDecomposition(c, destString)) { 405 return destString.extract(decomposition, capacity, *pErrorCode); 406 } else { 407 return -1; 408 } 409 } 410 411 U_CAPI int32_t U_EXPORT2 412 unorm2_getRawDecomposition(const UNormalizer2 *norm2, 413 UChar32 c, UChar *decomposition, int32_t capacity, 414 UErrorCode *pErrorCode) { 415 if(U_FAILURE(*pErrorCode)) { 416 return 0; 417 } 418 if(decomposition==NULL ? capacity!=0 : capacity<0) { 419 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 420 return 0; 421 } 422 UnicodeString destString(decomposition, 0, capacity); 423 if(reinterpret_cast<const Normalizer2 *>(norm2)->getRawDecomposition(c, destString)) { 424 return destString.extract(decomposition, capacity, *pErrorCode); 425 } else { 426 return -1; 427 } 428 } 429 430 U_CAPI UChar32 U_EXPORT2 431 unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) { 432 return reinterpret_cast<const Normalizer2 *>(norm2)->composePair(a, b); 433 } 434 435 U_CAPI uint8_t U_EXPORT2 436 unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) { 437 return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c); 438 } 439 440 U_CAPI UBool U_EXPORT2 441 unorm2_isNormalized(const UNormalizer2 *norm2, 442 const UChar *s, int32_t length, 443 UErrorCode *pErrorCode) { 444 if(U_FAILURE(*pErrorCode)) { 445 return 0; 446 } 447 if((s==NULL && length!=0) || length<-1) { 448 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 449 return 0; 450 } 451 UnicodeString sString(length<0, s, length); 452 return ((const Normalizer2 *)norm2)->isNormalized(sString, *pErrorCode); 453 } 454 455 U_CAPI UNormalizationCheckResult U_EXPORT2 456 unorm2_quickCheck(const UNormalizer2 *norm2, 457 const UChar *s, int32_t length, 458 UErrorCode *pErrorCode) { 459 if(U_FAILURE(*pErrorCode)) { 460 return UNORM_NO; 461 } 462 if((s==NULL && length!=0) || length<-1) { 463 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 464 return UNORM_NO; 465 } 466 UnicodeString sString(length<0, s, length); 467 return ((const Normalizer2 *)norm2)->quickCheck(sString, *pErrorCode); 468 } 469 470 U_CAPI int32_t U_EXPORT2 471 unorm2_spanQuickCheckYes(const UNormalizer2 *norm2, 472 const UChar *s, int32_t length, 473 UErrorCode *pErrorCode) { 474 if(U_FAILURE(*pErrorCode)) { 475 return 0; 476 } 477 if((s==NULL && length!=0) || length<-1) { 478 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 479 return 0; 480 } 481 UnicodeString sString(length<0, s, length); 482 return ((const Normalizer2 *)norm2)->spanQuickCheckYes(sString, *pErrorCode); 483 } 484 485 U_CAPI UBool U_EXPORT2 486 unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) { 487 return ((const Normalizer2 *)norm2)->hasBoundaryBefore(c); 488 } 489 490 U_CAPI UBool U_EXPORT2 491 unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) { 492 return ((const Normalizer2 *)norm2)->hasBoundaryAfter(c); 493 } 494 495 U_CAPI UBool U_EXPORT2 496 unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) { 497 return ((const Normalizer2 *)norm2)->isInert(c); 498 } 499 500 // Some properties APIs ---------------------------------------------------- *** 501 502 U_CAPI uint8_t U_EXPORT2 503 u_getCombiningClass(UChar32 c) { 504 UErrorCode errorCode=U_ZERO_ERROR; 505 const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode); 506 if(U_SUCCESS(errorCode)) { 507 return nfd->getCombiningClass(c); 508 } else { 509 return 0; 510 } 511 } 512 513 U_CFUNC uint16_t 514 unorm_getFCD16(UChar32 c) { 515 UErrorCode errorCode=U_ZERO_ERROR; 516 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); 517 if(U_SUCCESS(errorCode)) { 518 return impl->getFCD16(c); 519 } else { 520 return 0; 521 } 522 } 523 524 #endif // !UCONFIG_NO_NORMALIZATION 525