1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2009-2014, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: normalizer2.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2009nov22 14 * created by: Markus W. Scherer 15 */ 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_NORMALIZATION 20 21 #include "unicode/normalizer2.h" 22 #include "unicode/unistr.h" 23 #include "unicode/unorm.h" 24 #include "cstring.h" 25 #include "mutex.h" 26 #include "norm2allmodes.h" 27 #include "normalizer2impl.h" 28 #include "uassert.h" 29 #include "ucln_cmn.h" 30 31 using icu::Normalizer2Impl; 32 33 // NFC/NFD data machine-generated by gennorm2 --csource 34 #include "norm2_nfc_data.h" 35 36 U_NAMESPACE_BEGIN 37 38 // Public API dispatch via Normalizer2 subclasses -------------------------- *** 39 40 Normalizer2::~Normalizer2() {} 41 42 UBool 43 Normalizer2::getRawDecomposition(UChar32, UnicodeString &) const { 44 return FALSE; 45 } 46 47 UChar32 48 Normalizer2::composePair(UChar32, UChar32) const { 49 return U_SENTINEL; 50 } 51 52 uint8_t 53 Normalizer2::getCombiningClass(UChar32 /*c*/) const { 54 return 0; 55 } 56 57 // Normalizer2 implementation for the old UNORM_NONE. 58 class NoopNormalizer2 : public Normalizer2 { 59 virtual ~NoopNormalizer2(); 60 61 virtual UnicodeString & 62 normalize(const UnicodeString &src, 63 UnicodeString &dest, 64 UErrorCode &errorCode) const { 65 if(U_SUCCESS(errorCode)) { 66 if(&dest!=&src) { 67 dest=src; 68 } else { 69 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 70 } 71 } 72 return dest; 73 } 74 virtual UnicodeString & 75 normalizeSecondAndAppend(UnicodeString &first, 76 const UnicodeString &second, 77 UErrorCode &errorCode) const { 78 if(U_SUCCESS(errorCode)) { 79 if(&first!=&second) { 80 first.append(second); 81 } else { 82 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 83 } 84 } 85 return first; 86 } 87 virtual UnicodeString & 88 append(UnicodeString &first, 89 const UnicodeString &second, 90 UErrorCode &errorCode) const { 91 if(U_SUCCESS(errorCode)) { 92 if(&first!=&second) { 93 first.append(second); 94 } else { 95 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 96 } 97 } 98 return first; 99 } 100 virtual UBool 101 getDecomposition(UChar32, UnicodeString &) const { 102 return FALSE; 103 } 104 // No need to override the default getRawDecomposition(). 105 virtual UBool 106 isNormalized(const UnicodeString &, UErrorCode &) const { 107 return TRUE; 108 } 109 virtual UNormalizationCheckResult 110 quickCheck(const UnicodeString &, UErrorCode &) const { 111 return UNORM_YES; 112 } 113 virtual int32_t 114 spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const { 115 return s.length(); 116 } 117 virtual UBool hasBoundaryBefore(UChar32) const { return TRUE; } 118 virtual UBool hasBoundaryAfter(UChar32) const { return TRUE; } 119 virtual UBool isInert(UChar32) const { return TRUE; } 120 }; 121 122 NoopNormalizer2::~NoopNormalizer2() {} 123 124 Normalizer2WithImpl::~Normalizer2WithImpl() {} 125 126 DecomposeNormalizer2::~DecomposeNormalizer2() {} 127 128 ComposeNormalizer2::~ComposeNormalizer2() {} 129 130 FCDNormalizer2::~FCDNormalizer2() {} 131 132 // instance cache ---------------------------------------------------------- *** 133 134 Norm2AllModes::~Norm2AllModes() { 135 delete impl; 136 } 137 138 Norm2AllModes * 139 Norm2AllModes::createInstance(Normalizer2Impl *impl, UErrorCode &errorCode) { 140 if(U_FAILURE(errorCode)) { 141 delete impl; 142 return NULL; 143 } 144 Norm2AllModes *allModes=new Norm2AllModes(impl); 145 if(allModes==NULL) { 146 errorCode=U_MEMORY_ALLOCATION_ERROR; 147 delete impl; 148 return NULL; 149 } 150 return allModes; 151 } 152 153 Norm2AllModes * 154 Norm2AllModes::createNFCInstance(UErrorCode &errorCode) { 155 if(U_FAILURE(errorCode)) { 156 return NULL; 157 } 158 Normalizer2Impl *impl=new Normalizer2Impl; 159 if(impl==NULL) { 160 errorCode=U_MEMORY_ALLOCATION_ERROR; 161 return NULL; 162 } 163 impl->init(norm2_nfc_data_indexes, &norm2_nfc_data_trie, 164 norm2_nfc_data_extraData, norm2_nfc_data_smallFCD); 165 return createInstance(impl, errorCode); 166 } 167 168 U_CDECL_BEGIN 169 static UBool U_CALLCONV uprv_normalizer2_cleanup(); 170 U_CDECL_END 171 172 static Norm2AllModes *nfcSingleton; 173 static Normalizer2 *noopSingleton; 174 175 static icu::UInitOnce nfcInitOnce = U_INITONCE_INITIALIZER; 176 static icu::UInitOnce noopInitOnce = U_INITONCE_INITIALIZER; 177 178 // UInitOnce singleton initialization functions 179 static void U_CALLCONV initNFCSingleton(UErrorCode &errorCode) { 180 nfcSingleton=Norm2AllModes::createNFCInstance(errorCode); 181 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup); 182 } 183 184 static void U_CALLCONV initNoopSingleton(UErrorCode &errorCode) { 185 if(U_FAILURE(errorCode)) { 186 return; 187 } 188 noopSingleton=new NoopNormalizer2; 189 if(noopSingleton==NULL) { 190 errorCode=U_MEMORY_ALLOCATION_ERROR; 191 return; 192 } 193 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup); 194 } 195 196 U_CDECL_BEGIN 197 198 static UBool U_CALLCONV uprv_normalizer2_cleanup() { 199 delete nfcSingleton; 200 nfcSingleton = NULL; 201 delete noopSingleton; 202 noopSingleton = NULL; 203 nfcInitOnce.reset(); 204 noopInitOnce.reset(); 205 return TRUE; 206 } 207 208 U_CDECL_END 209 210 const Norm2AllModes * 211 Norm2AllModes::getNFCInstance(UErrorCode &errorCode) { 212 if(U_FAILURE(errorCode)) { return NULL; } 213 umtx_initOnce(nfcInitOnce, &initNFCSingleton, errorCode); 214 return nfcSingleton; 215 } 216 217 const Normalizer2 * 218 Normalizer2::getNFCInstance(UErrorCode &errorCode) { 219 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode); 220 return allModes!=NULL ? &allModes->comp : NULL; 221 } 222 223 const Normalizer2 * 224 Normalizer2::getNFDInstance(UErrorCode &errorCode) { 225 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode); 226 return allModes!=NULL ? &allModes->decomp : NULL; 227 } 228 229 const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) { 230 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode); 231 return allModes!=NULL ? &allModes->fcd : NULL; 232 } 233 234 const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) { 235 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode); 236 return allModes!=NULL ? &allModes->fcc : NULL; 237 } 238 239 const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) { 240 if(U_FAILURE(errorCode)) { return NULL; } 241 umtx_initOnce(noopInitOnce, &initNoopSingleton, errorCode); 242 return noopSingleton; 243 } 244 245 const Normalizer2Impl * 246 Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) { 247 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode); 248 return allModes!=NULL ? allModes->impl : NULL; 249 } 250 251 const Normalizer2Impl * 252 Normalizer2Factory::getImpl(const Normalizer2 *norm2) { 253 return &((Normalizer2WithImpl *)norm2)->impl; 254 } 255 256 U_NAMESPACE_END 257 258 // C API ------------------------------------------------------------------- *** 259 260 U_NAMESPACE_USE 261 262 U_CAPI const UNormalizer2 * U_EXPORT2 263 unorm2_getNFCInstance(UErrorCode *pErrorCode) { 264 return (const UNormalizer2 *)Normalizer2::getNFCInstance(*pErrorCode); 265 } 266 267 U_CAPI const UNormalizer2 * U_EXPORT2 268 unorm2_getNFDInstance(UErrorCode *pErrorCode) { 269 return (const UNormalizer2 *)Normalizer2::getNFDInstance(*pErrorCode); 270 } 271 272 U_CAPI void U_EXPORT2 273 unorm2_close(UNormalizer2 *norm2) { 274 delete (Normalizer2 *)norm2; 275 } 276 277 U_CAPI int32_t U_EXPORT2 278 unorm2_normalize(const UNormalizer2 *norm2, 279 const UChar *src, int32_t length, 280 UChar *dest, int32_t capacity, 281 UErrorCode *pErrorCode) { 282 if(U_FAILURE(*pErrorCode)) { 283 return 0; 284 } 285 if( (src==NULL ? length!=0 : length<-1) || 286 (dest==NULL ? capacity!=0 : capacity<0) || 287 (src==dest && src!=NULL) 288 ) { 289 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 290 return 0; 291 } 292 UnicodeString destString(dest, 0, capacity); 293 // length==0: Nothing to do, and n2wi->normalize(NULL, NULL, buffer, ...) would crash. 294 if(length!=0) { 295 const Normalizer2 *n2=(const Normalizer2 *)norm2; 296 const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2); 297 if(n2wi!=NULL) { 298 // Avoid duplicate argument checking and support NUL-terminated src. 299 ReorderingBuffer buffer(n2wi->impl, destString); 300 if(buffer.init(length, *pErrorCode)) { 301 n2wi->normalize(src, length>=0 ? src+length : NULL, buffer, *pErrorCode); 302 } 303 } else { 304 UnicodeString srcString(length<0, src, length); 305 n2->normalize(srcString, destString, *pErrorCode); 306 } 307 } 308 return destString.extract(dest, capacity, *pErrorCode); 309 } 310 311 static int32_t 312 normalizeSecondAndAppend(const UNormalizer2 *norm2, 313 UChar *first, int32_t firstLength, int32_t firstCapacity, 314 const UChar *second, int32_t secondLength, 315 UBool doNormalize, 316 UErrorCode *pErrorCode) { 317 if(U_FAILURE(*pErrorCode)) { 318 return 0; 319 } 320 if( (second==NULL ? secondLength!=0 : secondLength<-1) || 321 (first==NULL ? (firstCapacity!=0 || firstLength!=0) : 322 (firstCapacity<0 || firstLength<-1)) || 323 (first==second && first!=NULL) 324 ) { 325 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 326 return 0; 327 } 328 UnicodeString firstString(first, firstLength, firstCapacity); 329 firstLength=firstString.length(); // In case it was -1. 330 // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(NULL, NULL, buffer, ...) would crash. 331 if(secondLength!=0) { 332 const Normalizer2 *n2=(const Normalizer2 *)norm2; 333 const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2); 334 if(n2wi!=NULL) { 335 // Avoid duplicate argument checking and support NUL-terminated src. 336 UnicodeString safeMiddle; 337 { 338 ReorderingBuffer buffer(n2wi->impl, firstString); 339 if(buffer.init(firstLength+secondLength+1, *pErrorCode)) { // destCapacity>=-1 340 n2wi->normalizeAndAppend(second, secondLength>=0 ? second+secondLength : NULL, 341 doNormalize, safeMiddle, buffer, *pErrorCode); 342 } 343 } // The ReorderingBuffer destructor finalizes firstString. 344 if(U_FAILURE(*pErrorCode) || firstString.length()>firstCapacity) { 345 // Restore the modified suffix of the first string. 346 // This does not restore first[] array contents between firstLength and firstCapacity. 347 // (That might be uninitialized memory, as far as we know.) 348 if(first!=NULL) { /* don't dereference NULL */ 349 safeMiddle.extract(0, 0x7fffffff, first+firstLength-safeMiddle.length()); 350 if(firstLength<firstCapacity) { 351 first[firstLength]=0; // NUL-terminate in case it was originally. 352 } 353 } 354 } 355 } else { 356 UnicodeString secondString(secondLength<0, second, secondLength); 357 if(doNormalize) { 358 n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode); 359 } else { 360 n2->append(firstString, secondString, *pErrorCode); 361 } 362 } 363 } 364 return firstString.extract(first, firstCapacity, *pErrorCode); 365 } 366 367 U_CAPI int32_t U_EXPORT2 368 unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2, 369 UChar *first, int32_t firstLength, int32_t firstCapacity, 370 const UChar *second, int32_t secondLength, 371 UErrorCode *pErrorCode) { 372 return normalizeSecondAndAppend(norm2, 373 first, firstLength, firstCapacity, 374 second, secondLength, 375 TRUE, pErrorCode); 376 } 377 378 U_CAPI int32_t U_EXPORT2 379 unorm2_append(const UNormalizer2 *norm2, 380 UChar *first, int32_t firstLength, int32_t firstCapacity, 381 const UChar *second, int32_t secondLength, 382 UErrorCode *pErrorCode) { 383 return normalizeSecondAndAppend(norm2, 384 first, firstLength, firstCapacity, 385 second, secondLength, 386 FALSE, pErrorCode); 387 } 388 389 U_CAPI int32_t U_EXPORT2 390 unorm2_getDecomposition(const UNormalizer2 *norm2, 391 UChar32 c, UChar *decomposition, int32_t capacity, 392 UErrorCode *pErrorCode) { 393 if(U_FAILURE(*pErrorCode)) { 394 return 0; 395 } 396 if(decomposition==NULL ? capacity!=0 : capacity<0) { 397 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 398 return 0; 399 } 400 UnicodeString destString(decomposition, 0, capacity); 401 if(reinterpret_cast<const Normalizer2 *>(norm2)->getDecomposition(c, destString)) { 402 return destString.extract(decomposition, capacity, *pErrorCode); 403 } else { 404 return -1; 405 } 406 } 407 408 U_CAPI int32_t U_EXPORT2 409 unorm2_getRawDecomposition(const UNormalizer2 *norm2, 410 UChar32 c, UChar *decomposition, int32_t capacity, 411 UErrorCode *pErrorCode) { 412 if(U_FAILURE(*pErrorCode)) { 413 return 0; 414 } 415 if(decomposition==NULL ? capacity!=0 : capacity<0) { 416 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 417 return 0; 418 } 419 UnicodeString destString(decomposition, 0, capacity); 420 if(reinterpret_cast<const Normalizer2 *>(norm2)->getRawDecomposition(c, destString)) { 421 return destString.extract(decomposition, capacity, *pErrorCode); 422 } else { 423 return -1; 424 } 425 } 426 427 U_CAPI UChar32 U_EXPORT2 428 unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) { 429 return reinterpret_cast<const Normalizer2 *>(norm2)->composePair(a, b); 430 } 431 432 U_CAPI uint8_t U_EXPORT2 433 unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) { 434 return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c); 435 } 436 437 U_CAPI UBool U_EXPORT2 438 unorm2_isNormalized(const UNormalizer2 *norm2, 439 const UChar *s, int32_t length, 440 UErrorCode *pErrorCode) { 441 if(U_FAILURE(*pErrorCode)) { 442 return 0; 443 } 444 if((s==NULL && length!=0) || length<-1) { 445 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 446 return 0; 447 } 448 UnicodeString sString(length<0, s, length); 449 return ((const Normalizer2 *)norm2)->isNormalized(sString, *pErrorCode); 450 } 451 452 U_CAPI UNormalizationCheckResult U_EXPORT2 453 unorm2_quickCheck(const UNormalizer2 *norm2, 454 const UChar *s, int32_t length, 455 UErrorCode *pErrorCode) { 456 if(U_FAILURE(*pErrorCode)) { 457 return UNORM_NO; 458 } 459 if((s==NULL && length!=0) || length<-1) { 460 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 461 return UNORM_NO; 462 } 463 UnicodeString sString(length<0, s, length); 464 return ((const Normalizer2 *)norm2)->quickCheck(sString, *pErrorCode); 465 } 466 467 U_CAPI int32_t U_EXPORT2 468 unorm2_spanQuickCheckYes(const UNormalizer2 *norm2, 469 const UChar *s, int32_t length, 470 UErrorCode *pErrorCode) { 471 if(U_FAILURE(*pErrorCode)) { 472 return 0; 473 } 474 if((s==NULL && length!=0) || length<-1) { 475 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 476 return 0; 477 } 478 UnicodeString sString(length<0, s, length); 479 return ((const Normalizer2 *)norm2)->spanQuickCheckYes(sString, *pErrorCode); 480 } 481 482 U_CAPI UBool U_EXPORT2 483 unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) { 484 return ((const Normalizer2 *)norm2)->hasBoundaryBefore(c); 485 } 486 487 U_CAPI UBool U_EXPORT2 488 unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) { 489 return ((const Normalizer2 *)norm2)->hasBoundaryAfter(c); 490 } 491 492 U_CAPI UBool U_EXPORT2 493 unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) { 494 return ((const Normalizer2 *)norm2)->isInert(c); 495 } 496 497 // Some properties APIs ---------------------------------------------------- *** 498 499 U_CAPI uint8_t U_EXPORT2 500 u_getCombiningClass(UChar32 c) { 501 UErrorCode errorCode=U_ZERO_ERROR; 502 const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode); 503 if(U_SUCCESS(errorCode)) { 504 return nfd->getCombiningClass(c); 505 } else { 506 return 0; 507 } 508 } 509 510 U_CFUNC uint16_t 511 unorm_getFCD16(UChar32 c) { 512 UErrorCode errorCode=U_ZERO_ERROR; 513 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); 514 if(U_SUCCESS(errorCode)) { 515 return impl->getFCD16(c); 516 } else { 517 return 0; 518 } 519 } 520 521 #endif // !UCONFIG_NO_NORMALIZATION 522