Home | History | Annotate | Download | only in common
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2009-2016, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  normalizer2.cpp
     11 *   encoding:   US-ASCII
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2009nov22
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 
     21 #if !UCONFIG_NO_NORMALIZATION
     22 
     23 #include "unicode/normalizer2.h"
     24 #include "unicode/unistr.h"
     25 #include "unicode/unorm.h"
     26 #include "cstring.h"
     27 #include "mutex.h"
     28 #include "norm2allmodes.h"
     29 #include "normalizer2impl.h"
     30 #include "uassert.h"
     31 #include "ucln_cmn.h"
     32 
     33 using icu::Normalizer2Impl;
     34 
     35 // NFC/NFD data machine-generated by gennorm2 --csource
     36 #define INCLUDED_FROM_NORMALIZER2_CPP
     37 #include "norm2_nfc_data.h"
     38 
     39 U_NAMESPACE_BEGIN
     40 
     41 // Public API dispatch via Normalizer2 subclasses -------------------------- ***
     42 
     43 Normalizer2::~Normalizer2() {}
     44 
     45 UBool
     46 Normalizer2::getRawDecomposition(UChar32, UnicodeString &) const {
     47     return FALSE;
     48 }
     49 
     50 UChar32
     51 Normalizer2::composePair(UChar32, UChar32) const {
     52     return U_SENTINEL;
     53 }
     54 
     55 uint8_t
     56 Normalizer2::getCombiningClass(UChar32 /*c*/) const {
     57     return 0;
     58 }
     59 
     60 // Normalizer2 implementation for the old UNORM_NONE.
     61 class NoopNormalizer2 : public Normalizer2 {
     62     virtual ~NoopNormalizer2();
     63 
     64     virtual UnicodeString &
     65     normalize(const UnicodeString &src,
     66               UnicodeString &dest,
     67               UErrorCode &errorCode) const {
     68         if(U_SUCCESS(errorCode)) {
     69             if(&dest!=&src) {
     70                 dest=src;
     71             } else {
     72                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
     73             }
     74         }
     75         return dest;
     76     }
     77     virtual UnicodeString &
     78     normalizeSecondAndAppend(UnicodeString &first,
     79                              const UnicodeString &second,
     80                              UErrorCode &errorCode) const {
     81         if(U_SUCCESS(errorCode)) {
     82             if(&first!=&second) {
     83                 first.append(second);
     84             } else {
     85                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
     86             }
     87         }
     88         return first;
     89     }
     90     virtual UnicodeString &
     91     append(UnicodeString &first,
     92            const UnicodeString &second,
     93            UErrorCode &errorCode) const {
     94         if(U_SUCCESS(errorCode)) {
     95             if(&first!=&second) {
     96                 first.append(second);
     97             } else {
     98                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
     99             }
    100         }
    101         return first;
    102     }
    103     virtual UBool
    104     getDecomposition(UChar32, UnicodeString &) const {
    105         return FALSE;
    106     }
    107     // No need to override the default getRawDecomposition().
    108     virtual UBool
    109     isNormalized(const UnicodeString &, UErrorCode &) const {
    110         return TRUE;
    111     }
    112     virtual UNormalizationCheckResult
    113     quickCheck(const UnicodeString &, UErrorCode &) const {
    114         return UNORM_YES;
    115     }
    116     virtual int32_t
    117     spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const {
    118         return s.length();
    119     }
    120     virtual UBool hasBoundaryBefore(UChar32) const { return TRUE; }
    121     virtual UBool hasBoundaryAfter(UChar32) const { return TRUE; }
    122     virtual UBool isInert(UChar32) const { return TRUE; }
    123 };
    124 
    125 NoopNormalizer2::~NoopNormalizer2() {}
    126 
    127 Normalizer2WithImpl::~Normalizer2WithImpl() {}
    128 
    129 DecomposeNormalizer2::~DecomposeNormalizer2() {}
    130 
    131 ComposeNormalizer2::~ComposeNormalizer2() {}
    132 
    133 FCDNormalizer2::~FCDNormalizer2() {}
    134 
    135 // instance cache ---------------------------------------------------------- ***
    136 
    137 Norm2AllModes::~Norm2AllModes() {
    138     delete impl;
    139 }
    140 
    141 Norm2AllModes *
    142 Norm2AllModes::createInstance(Normalizer2Impl *impl, UErrorCode &errorCode) {
    143     if(U_FAILURE(errorCode)) {
    144         delete impl;
    145         return NULL;
    146     }
    147     Norm2AllModes *allModes=new Norm2AllModes(impl);
    148     if(allModes==NULL) {
    149         errorCode=U_MEMORY_ALLOCATION_ERROR;
    150         delete impl;
    151         return NULL;
    152     }
    153     return allModes;
    154 }
    155 
    156 Norm2AllModes *
    157 Norm2AllModes::createNFCInstance(UErrorCode &errorCode) {
    158     if(U_FAILURE(errorCode)) {
    159         return NULL;
    160     }
    161     Normalizer2Impl *impl=new Normalizer2Impl;
    162     if(impl==NULL) {
    163         errorCode=U_MEMORY_ALLOCATION_ERROR;
    164         return NULL;
    165     }
    166     impl->init(norm2_nfc_data_indexes, &norm2_nfc_data_trie,
    167                norm2_nfc_data_extraData, norm2_nfc_data_smallFCD);
    168     return createInstance(impl, errorCode);
    169 }
    170 
// Forward declaration (inside U_CDECL_BEGIN/U_CDECL_END): the init functions
// below pass this function to ucln_common_registerCleanup() before it is defined.
U_CDECL_BEGIN
static UBool U_CALLCONV uprv_normalizer2_cleanup();
U_CDECL_END

// Singleton instances; assigned by initNFCSingleton()/initNoopSingleton()
// and deleted by uprv_normalizer2_cleanup().
static Norm2AllModes *nfcSingleton;
static Normalizer2   *noopSingleton;

// Guards passed to umtx_initOnce() for the two singletons above;
// reset again by uprv_normalizer2_cleanup().
static icu::UInitOnce nfcInitOnce = U_INITONCE_INITIALIZER;
static icu::UInitOnce noopInitOnce = U_INITONCE_INITIALIZER;
    180 
    181 // UInitOnce singleton initialization functions
    182 static void U_CALLCONV initNFCSingleton(UErrorCode &errorCode) {
    183     nfcSingleton=Norm2AllModes::createNFCInstance(errorCode);
    184     ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
    185 }
    186 
    187 static void U_CALLCONV initNoopSingleton(UErrorCode &errorCode) {
    188     if(U_FAILURE(errorCode)) {
    189         return;
    190     }
    191     noopSingleton=new NoopNormalizer2;
    192     if(noopSingleton==NULL) {
    193         errorCode=U_MEMORY_ALLOCATION_ERROR;
    194         return;
    195     }
    196     ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
    197 }
    198 
    199 U_CDECL_BEGIN
    200 
    201 static UBool U_CALLCONV uprv_normalizer2_cleanup() {
    202     delete nfcSingleton;
    203     nfcSingleton = NULL;
    204     delete noopSingleton;
    205     noopSingleton = NULL;
    206     nfcInitOnce.reset();
    207     noopInitOnce.reset();
    208     return TRUE;
    209 }
    210 
    211 U_CDECL_END
    212 
    213 const Norm2AllModes *
    214 Norm2AllModes::getNFCInstance(UErrorCode &errorCode) {
    215     if(U_FAILURE(errorCode)) { return NULL; }
    216     umtx_initOnce(nfcInitOnce, &initNFCSingleton, errorCode);
    217     return nfcSingleton;
    218 }
    219 
    220 const Normalizer2 *
    221 Normalizer2::getNFCInstance(UErrorCode &errorCode) {
    222     const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
    223     return allModes!=NULL ? &allModes->comp : NULL;
    224 }
    225 
    226 const Normalizer2 *
    227 Normalizer2::getNFDInstance(UErrorCode &errorCode) {
    228     const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
    229     return allModes!=NULL ? &allModes->decomp : NULL;
    230 }
    231 
    232 const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
    233     const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
    234     return allModes!=NULL ? &allModes->fcd : NULL;
    235 }
    236 
    237 const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
    238     const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
    239     return allModes!=NULL ? &allModes->fcc : NULL;
    240 }
    241 
    242 const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) {
    243     if(U_FAILURE(errorCode)) { return NULL; }
    244     umtx_initOnce(noopInitOnce, &initNoopSingleton, errorCode);
    245     return noopSingleton;
    246 }
    247 
    248 const Normalizer2Impl *
    249 Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
    250     const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
    251     return allModes!=NULL ? allModes->impl : NULL;
    252 }
    253 
    254 const Normalizer2Impl *
    255 Normalizer2Factory::getImpl(const Normalizer2 *norm2) {
    256     return &((Normalizer2WithImpl *)norm2)->impl;
    257 }
    258 
    259 U_NAMESPACE_END
    260 
    261 // C API ------------------------------------------------------------------- ***
    262 
    263 U_NAMESPACE_USE
    264 
    265 U_CAPI const UNormalizer2 * U_EXPORT2
    266 unorm2_getNFCInstance(UErrorCode *pErrorCode) {
    267     return (const UNormalizer2 *)Normalizer2::getNFCInstance(*pErrorCode);
    268 }
    269 
    270 U_CAPI const UNormalizer2 * U_EXPORT2
    271 unorm2_getNFDInstance(UErrorCode *pErrorCode) {
    272     return (const UNormalizer2 *)Normalizer2::getNFDInstance(*pErrorCode);
    273 }
    274 
    275 U_CAPI void U_EXPORT2
    276 unorm2_close(UNormalizer2 *norm2) {
    277     delete (Normalizer2 *)norm2;
    278 }
    279 
    280 U_CAPI int32_t U_EXPORT2
    281 unorm2_normalize(const UNormalizer2 *norm2,
    282                  const UChar *src, int32_t length,
    283                  UChar *dest, int32_t capacity,
    284                  UErrorCode *pErrorCode) {
    285     if(U_FAILURE(*pErrorCode)) {
    286         return 0;
    287     }
    288     if( (src==NULL ? length!=0 : length<-1) ||
    289         (dest==NULL ? capacity!=0 : capacity<0) ||
    290         (src==dest && src!=NULL)
    291     ) {
    292         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    293         return 0;
    294     }
    295     UnicodeString destString(dest, 0, capacity);
    296     // length==0: Nothing to do, and n2wi->normalize(NULL, NULL, buffer, ...) would crash.
    297     if(length!=0) {
    298         const Normalizer2 *n2=(const Normalizer2 *)norm2;
    299         const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
    300         if(n2wi!=NULL) {
    301             // Avoid duplicate argument checking and support NUL-terminated src.
    302             ReorderingBuffer buffer(n2wi->impl, destString);
    303             if(buffer.init(length, *pErrorCode)) {
    304                 n2wi->normalize(src, length>=0 ? src+length : NULL, buffer, *pErrorCode);
    305             }
    306         } else {
    307             UnicodeString srcString(length<0, src, length);
    308             n2->normalize(srcString, destString, *pErrorCode);
    309         }
    310     }
    311     return destString.extract(dest, capacity, *pErrorCode);
    312 }
    313 
    314 static int32_t
    315 normalizeSecondAndAppend(const UNormalizer2 *norm2,
    316                          UChar *first, int32_t firstLength, int32_t firstCapacity,
    317                          const UChar *second, int32_t secondLength,
    318                          UBool doNormalize,
    319                          UErrorCode *pErrorCode) {
    320     if(U_FAILURE(*pErrorCode)) {
    321         return 0;
    322     }
    323     if( (second==NULL ? secondLength!=0 : secondLength<-1) ||
    324         (first==NULL ? (firstCapacity!=0 || firstLength!=0) :
    325                        (firstCapacity<0 || firstLength<-1)) ||
    326         (first==second && first!=NULL)
    327     ) {
    328         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    329         return 0;
    330     }
    331     UnicodeString firstString(first, firstLength, firstCapacity);
    332     firstLength=firstString.length();  // In case it was -1.
    333     // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(NULL, NULL, buffer, ...) would crash.
    334     if(secondLength!=0) {
    335         const Normalizer2 *n2=(const Normalizer2 *)norm2;
    336         const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
    337         if(n2wi!=NULL) {
    338             // Avoid duplicate argument checking and support NUL-terminated src.
    339             UnicodeString safeMiddle;
    340             {
    341                 ReorderingBuffer buffer(n2wi->impl, firstString);
    342                 if(buffer.init(firstLength+secondLength+1, *pErrorCode)) {  // destCapacity>=-1
    343                     n2wi->normalizeAndAppend(second, secondLength>=0 ? second+secondLength : NULL,
    344                                              doNormalize, safeMiddle, buffer, *pErrorCode);
    345                 }
    346             }  // The ReorderingBuffer destructor finalizes firstString.
    347             if(U_FAILURE(*pErrorCode) || firstString.length()>firstCapacity) {
    348                 // Restore the modified suffix of the first string.
    349                 // This does not restore first[] array contents between firstLength and firstCapacity.
    350                 // (That might be uninitialized memory, as far as we know.)
    351                 if(first!=NULL) { /* don't dereference NULL */
    352                   safeMiddle.extract(0, 0x7fffffff, first+firstLength-safeMiddle.length());
    353                   if(firstLength<firstCapacity) {
    354                     first[firstLength]=0;  // NUL-terminate in case it was originally.
    355                   }
    356                 }
    357             }
    358         } else {
    359             UnicodeString secondString(secondLength<0, second, secondLength);
    360             if(doNormalize) {
    361                 n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode);
    362             } else {
    363                 n2->append(firstString, secondString, *pErrorCode);
    364             }
    365         }
    366     }
    367     return firstString.extract(first, firstCapacity, *pErrorCode);
    368 }
    369 
    370 U_CAPI int32_t U_EXPORT2
    371 unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
    372                                 UChar *first, int32_t firstLength, int32_t firstCapacity,
    373                                 const UChar *second, int32_t secondLength,
    374                                 UErrorCode *pErrorCode) {
    375     return normalizeSecondAndAppend(norm2,
    376                                     first, firstLength, firstCapacity,
    377                                     second, secondLength,
    378                                     TRUE, pErrorCode);
    379 }
    380 
    381 U_CAPI int32_t U_EXPORT2
    382 unorm2_append(const UNormalizer2 *norm2,
    383               UChar *first, int32_t firstLength, int32_t firstCapacity,
    384               const UChar *second, int32_t secondLength,
    385               UErrorCode *pErrorCode) {
    386     return normalizeSecondAndAppend(norm2,
    387                                     first, firstLength, firstCapacity,
    388                                     second, secondLength,
    389                                     FALSE, pErrorCode);
    390 }
    391 
    392 U_CAPI int32_t U_EXPORT2
    393 unorm2_getDecomposition(const UNormalizer2 *norm2,
    394                         UChar32 c, UChar *decomposition, int32_t capacity,
    395                         UErrorCode *pErrorCode) {
    396     if(U_FAILURE(*pErrorCode)) {
    397         return 0;
    398     }
    399     if(decomposition==NULL ? capacity!=0 : capacity<0) {
    400         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    401         return 0;
    402     }
    403     UnicodeString destString(decomposition, 0, capacity);
    404     if(reinterpret_cast<const Normalizer2 *>(norm2)->getDecomposition(c, destString)) {
    405         return destString.extract(decomposition, capacity, *pErrorCode);
    406     } else {
    407         return -1;
    408     }
    409 }
    410 
    411 U_CAPI int32_t U_EXPORT2
    412 unorm2_getRawDecomposition(const UNormalizer2 *norm2,
    413                            UChar32 c, UChar *decomposition, int32_t capacity,
    414                            UErrorCode *pErrorCode) {
    415     if(U_FAILURE(*pErrorCode)) {
    416         return 0;
    417     }
    418     if(decomposition==NULL ? capacity!=0 : capacity<0) {
    419         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    420         return 0;
    421     }
    422     UnicodeString destString(decomposition, 0, capacity);
    423     if(reinterpret_cast<const Normalizer2 *>(norm2)->getRawDecomposition(c, destString)) {
    424         return destString.extract(decomposition, capacity, *pErrorCode);
    425     } else {
    426         return -1;
    427     }
    428 }
    429 
    430 U_CAPI UChar32 U_EXPORT2
    431 unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) {
    432     return reinterpret_cast<const Normalizer2 *>(norm2)->composePair(a, b);
    433 }
    434 
    435 U_CAPI uint8_t U_EXPORT2
    436 unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
    437     return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c);
    438 }
    439 
    440 U_CAPI UBool U_EXPORT2
    441 unorm2_isNormalized(const UNormalizer2 *norm2,
    442                     const UChar *s, int32_t length,
    443                     UErrorCode *pErrorCode) {
    444     if(U_FAILURE(*pErrorCode)) {
    445         return 0;
    446     }
    447     if((s==NULL && length!=0) || length<-1) {
    448         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    449         return 0;
    450     }
    451     UnicodeString sString(length<0, s, length);
    452     return ((const Normalizer2 *)norm2)->isNormalized(sString, *pErrorCode);
    453 }
    454 
    455 U_CAPI UNormalizationCheckResult U_EXPORT2
    456 unorm2_quickCheck(const UNormalizer2 *norm2,
    457                   const UChar *s, int32_t length,
    458                   UErrorCode *pErrorCode) {
    459     if(U_FAILURE(*pErrorCode)) {
    460         return UNORM_NO;
    461     }
    462     if((s==NULL && length!=0) || length<-1) {
    463         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    464         return UNORM_NO;
    465     }
    466     UnicodeString sString(length<0, s, length);
    467     return ((const Normalizer2 *)norm2)->quickCheck(sString, *pErrorCode);
    468 }
    469 
    470 U_CAPI int32_t U_EXPORT2
    471 unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
    472                          const UChar *s, int32_t length,
    473                          UErrorCode *pErrorCode) {
    474     if(U_FAILURE(*pErrorCode)) {
    475         return 0;
    476     }
    477     if((s==NULL && length!=0) || length<-1) {
    478         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    479         return 0;
    480     }
    481     UnicodeString sString(length<0, s, length);
    482     return ((const Normalizer2 *)norm2)->spanQuickCheckYes(sString, *pErrorCode);
    483 }
    484 
    485 U_CAPI UBool U_EXPORT2
    486 unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) {
    487     return ((const Normalizer2 *)norm2)->hasBoundaryBefore(c);
    488 }
    489 
    490 U_CAPI UBool U_EXPORT2
    491 unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) {
    492     return ((const Normalizer2 *)norm2)->hasBoundaryAfter(c);
    493 }
    494 
    495 U_CAPI UBool U_EXPORT2
    496 unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) {
    497     return ((const Normalizer2 *)norm2)->isInert(c);
    498 }
    499 
    500 // Some properties APIs ---------------------------------------------------- ***
    501 
    502 U_CAPI uint8_t U_EXPORT2
    503 u_getCombiningClass(UChar32 c) {
    504     UErrorCode errorCode=U_ZERO_ERROR;
    505     const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
    506     if(U_SUCCESS(errorCode)) {
    507         return nfd->getCombiningClass(c);
    508     } else {
    509         return 0;
    510     }
    511 }
    512 
    513 U_CFUNC uint16_t
    514 unorm_getFCD16(UChar32 c) {
    515     UErrorCode errorCode=U_ZERO_ERROR;
    516     const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
    517     if(U_SUCCESS(errorCode)) {
    518         return impl->getFCD16(c);
    519     } else {
    520         return 0;
    521     }
    522 }
    523 
    524 #endif  // !UCONFIG_NO_NORMALIZATION
    525