Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2009-2014, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  normalizer2.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2009nov22
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_NORMALIZATION
     20 
     21 #include "unicode/normalizer2.h"
     22 #include "unicode/unistr.h"
     23 #include "unicode/unorm.h"
     24 #include "cstring.h"
     25 #include "mutex.h"
     26 #include "norm2allmodes.h"
     27 #include "normalizer2impl.h"
     28 #include "uassert.h"
     29 #include "ucln_cmn.h"
     30 
     31 using icu::Normalizer2Impl;
     32 
     33 // NFC/NFD data machine-generated by gennorm2 --csource
     34 #include "norm2_nfc_data.h"
     35 
     36 U_NAMESPACE_BEGIN
     37 
     38 // Public API dispatch via Normalizer2 subclasses -------------------------- ***
     39 
     40 Normalizer2::~Normalizer2() {}
     41 
     42 UBool
     43 Normalizer2::getRawDecomposition(UChar32, UnicodeString &) const {
     44     return FALSE;
     45 }
     46 
     47 UChar32
     48 Normalizer2::composePair(UChar32, UChar32) const {
     49     return U_SENTINEL;
     50 }
     51 
     52 uint8_t
     53 Normalizer2::getCombiningClass(UChar32 /*c*/) const {
     54     return 0;
     55 }
     56 
     57 // Normalizer2 implementation for the old UNORM_NONE.
     58 class NoopNormalizer2 : public Normalizer2 {
     59     virtual ~NoopNormalizer2();
     60 
     61     virtual UnicodeString &
     62     normalize(const UnicodeString &src,
     63               UnicodeString &dest,
     64               UErrorCode &errorCode) const {
     65         if(U_SUCCESS(errorCode)) {
     66             if(&dest!=&src) {
     67                 dest=src;
     68             } else {
     69                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
     70             }
     71         }
     72         return dest;
     73     }
     74     virtual UnicodeString &
     75     normalizeSecondAndAppend(UnicodeString &first,
     76                              const UnicodeString &second,
     77                              UErrorCode &errorCode) const {
     78         if(U_SUCCESS(errorCode)) {
     79             if(&first!=&second) {
     80                 first.append(second);
     81             } else {
     82                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
     83             }
     84         }
     85         return first;
     86     }
     87     virtual UnicodeString &
     88     append(UnicodeString &first,
     89            const UnicodeString &second,
     90            UErrorCode &errorCode) const {
     91         if(U_SUCCESS(errorCode)) {
     92             if(&first!=&second) {
     93                 first.append(second);
     94             } else {
     95                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
     96             }
     97         }
     98         return first;
     99     }
    100     virtual UBool
    101     getDecomposition(UChar32, UnicodeString &) const {
    102         return FALSE;
    103     }
    104     // No need to override the default getRawDecomposition().
    105     virtual UBool
    106     isNormalized(const UnicodeString &, UErrorCode &) const {
    107         return TRUE;
    108     }
    109     virtual UNormalizationCheckResult
    110     quickCheck(const UnicodeString &, UErrorCode &) const {
    111         return UNORM_YES;
    112     }
    113     virtual int32_t
    114     spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const {
    115         return s.length();
    116     }
    117     virtual UBool hasBoundaryBefore(UChar32) const { return TRUE; }
    118     virtual UBool hasBoundaryAfter(UChar32) const { return TRUE; }
    119     virtual UBool isInert(UChar32) const { return TRUE; }
    120 };
    121 
    122 NoopNormalizer2::~NoopNormalizer2() {}
    123 
    124 Normalizer2WithImpl::~Normalizer2WithImpl() {}
    125 
    126 DecomposeNormalizer2::~DecomposeNormalizer2() {}
    127 
    128 ComposeNormalizer2::~ComposeNormalizer2() {}
    129 
    130 FCDNormalizer2::~FCDNormalizer2() {}
    131 
    132 // instance cache ---------------------------------------------------------- ***
    133 
    134 Norm2AllModes::~Norm2AllModes() {
    135     delete impl;
    136 }
    137 
    138 Norm2AllModes *
    139 Norm2AllModes::createInstance(Normalizer2Impl *impl, UErrorCode &errorCode) {
    140     if(U_FAILURE(errorCode)) {
    141         delete impl;
    142         return NULL;
    143     }
    144     Norm2AllModes *allModes=new Norm2AllModes(impl);
    145     if(allModes==NULL) {
    146         errorCode=U_MEMORY_ALLOCATION_ERROR;
    147         delete impl;
    148         return NULL;
    149     }
    150     return allModes;
    151 }
    152 
    153 Norm2AllModes *
    154 Norm2AllModes::createNFCInstance(UErrorCode &errorCode) {
    155     if(U_FAILURE(errorCode)) {
    156         return NULL;
    157     }
    158     Normalizer2Impl *impl=new Normalizer2Impl;
    159     if(impl==NULL) {
    160         errorCode=U_MEMORY_ALLOCATION_ERROR;
    161         return NULL;
    162     }
    163     impl->init(norm2_nfc_data_indexes, &norm2_nfc_data_trie,
    164                norm2_nfc_data_extraData, norm2_nfc_data_smallFCD);
    165     return createInstance(impl, errorCode);
    166 }
    167 
    168 U_CDECL_BEGIN
    169 static UBool U_CALLCONV uprv_normalizer2_cleanup();
    170 U_CDECL_END
    171 
    172 static Norm2AllModes *nfcSingleton;
    173 static Normalizer2   *noopSingleton;
    174 
    175 static icu::UInitOnce nfcInitOnce = U_INITONCE_INITIALIZER;
    176 static icu::UInitOnce noopInitOnce = U_INITONCE_INITIALIZER;
    177 
    178 // UInitOnce singleton initialization functions
    179 static void U_CALLCONV initNFCSingleton(UErrorCode &errorCode) {
    180     nfcSingleton=Norm2AllModes::createNFCInstance(errorCode);
    181     ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
    182 }
    183 
    184 static void U_CALLCONV initNoopSingleton(UErrorCode &errorCode) {
    185     if(U_FAILURE(errorCode)) {
    186         return;
    187     }
    188     noopSingleton=new NoopNormalizer2;
    189     if(noopSingleton==NULL) {
    190         errorCode=U_MEMORY_ALLOCATION_ERROR;
    191         return;
    192     }
    193     ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
    194 }
    195 
    196 U_CDECL_BEGIN
    197 
    198 static UBool U_CALLCONV uprv_normalizer2_cleanup() {
    199     delete nfcSingleton;
    200     nfcSingleton = NULL;
    201     delete noopSingleton;
    202     noopSingleton = NULL;
    203     nfcInitOnce.reset();
    204     noopInitOnce.reset();
    205     return TRUE;
    206 }
    207 
    208 U_CDECL_END
    209 
    210 const Norm2AllModes *
    211 Norm2AllModes::getNFCInstance(UErrorCode &errorCode) {
    212     if(U_FAILURE(errorCode)) { return NULL; }
    213     umtx_initOnce(nfcInitOnce, &initNFCSingleton, errorCode);
    214     return nfcSingleton;
    215 }
    216 
    217 const Normalizer2 *
    218 Normalizer2::getNFCInstance(UErrorCode &errorCode) {
    219     const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
    220     return allModes!=NULL ? &allModes->comp : NULL;
    221 }
    222 
    223 const Normalizer2 *
    224 Normalizer2::getNFDInstance(UErrorCode &errorCode) {
    225     const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
    226     return allModes!=NULL ? &allModes->decomp : NULL;
    227 }
    228 
    229 const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
    230     const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
    231     return allModes!=NULL ? &allModes->fcd : NULL;
    232 }
    233 
    234 const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
    235     const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
    236     return allModes!=NULL ? &allModes->fcc : NULL;
    237 }
    238 
    239 const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) {
    240     if(U_FAILURE(errorCode)) { return NULL; }
    241     umtx_initOnce(noopInitOnce, &initNoopSingleton, errorCode);
    242     return noopSingleton;
    243 }
    244 
    245 const Normalizer2Impl *
    246 Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
    247     const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
    248     return allModes!=NULL ? allModes->impl : NULL;
    249 }
    250 
    251 const Normalizer2Impl *
    252 Normalizer2Factory::getImpl(const Normalizer2 *norm2) {
    253     return &((Normalizer2WithImpl *)norm2)->impl;
    254 }
    255 
    256 U_NAMESPACE_END
    257 
    258 // C API ------------------------------------------------------------------- ***
    259 
    260 U_NAMESPACE_USE
    261 
    262 U_CAPI const UNormalizer2 * U_EXPORT2
    263 unorm2_getNFCInstance(UErrorCode *pErrorCode) {
    264     return (const UNormalizer2 *)Normalizer2::getNFCInstance(*pErrorCode);
    265 }
    266 
    267 U_CAPI const UNormalizer2 * U_EXPORT2
    268 unorm2_getNFDInstance(UErrorCode *pErrorCode) {
    269     return (const UNormalizer2 *)Normalizer2::getNFDInstance(*pErrorCode);
    270 }
    271 
    272 U_CAPI void U_EXPORT2
    273 unorm2_close(UNormalizer2 *norm2) {
    274     delete (Normalizer2 *)norm2;
    275 }
    276 
    277 U_CAPI int32_t U_EXPORT2
    278 unorm2_normalize(const UNormalizer2 *norm2,
    279                  const UChar *src, int32_t length,
    280                  UChar *dest, int32_t capacity,
    281                  UErrorCode *pErrorCode) {
    282     if(U_FAILURE(*pErrorCode)) {
    283         return 0;
    284     }
    285     if( (src==NULL ? length!=0 : length<-1) ||
    286         (dest==NULL ? capacity!=0 : capacity<0) ||
    287         (src==dest && src!=NULL)
    288     ) {
    289         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    290         return 0;
    291     }
    292     UnicodeString destString(dest, 0, capacity);
    293     // length==0: Nothing to do, and n2wi->normalize(NULL, NULL, buffer, ...) would crash.
    294     if(length!=0) {
    295         const Normalizer2 *n2=(const Normalizer2 *)norm2;
    296         const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
    297         if(n2wi!=NULL) {
    298             // Avoid duplicate argument checking and support NUL-terminated src.
    299             ReorderingBuffer buffer(n2wi->impl, destString);
    300             if(buffer.init(length, *pErrorCode)) {
    301                 n2wi->normalize(src, length>=0 ? src+length : NULL, buffer, *pErrorCode);
    302             }
    303         } else {
    304             UnicodeString srcString(length<0, src, length);
    305             n2->normalize(srcString, destString, *pErrorCode);
    306         }
    307     }
    308     return destString.extract(dest, capacity, *pErrorCode);
    309 }
    310 
    311 static int32_t
    312 normalizeSecondAndAppend(const UNormalizer2 *norm2,
    313                          UChar *first, int32_t firstLength, int32_t firstCapacity,
    314                          const UChar *second, int32_t secondLength,
    315                          UBool doNormalize,
    316                          UErrorCode *pErrorCode) {
    317     if(U_FAILURE(*pErrorCode)) {
    318         return 0;
    319     }
    320     if( (second==NULL ? secondLength!=0 : secondLength<-1) ||
    321         (first==NULL ? (firstCapacity!=0 || firstLength!=0) :
    322                        (firstCapacity<0 || firstLength<-1)) ||
    323         (first==second && first!=NULL)
    324     ) {
    325         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    326         return 0;
    327     }
    328     UnicodeString firstString(first, firstLength, firstCapacity);
    329     firstLength=firstString.length();  // In case it was -1.
    330     // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(NULL, NULL, buffer, ...) would crash.
    331     if(secondLength!=0) {
    332         const Normalizer2 *n2=(const Normalizer2 *)norm2;
    333         const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
    334         if(n2wi!=NULL) {
    335             // Avoid duplicate argument checking and support NUL-terminated src.
    336             UnicodeString safeMiddle;
    337             {
    338                 ReorderingBuffer buffer(n2wi->impl, firstString);
    339                 if(buffer.init(firstLength+secondLength+1, *pErrorCode)) {  // destCapacity>=-1
    340                     n2wi->normalizeAndAppend(second, secondLength>=0 ? second+secondLength : NULL,
    341                                              doNormalize, safeMiddle, buffer, *pErrorCode);
    342                 }
    343             }  // The ReorderingBuffer destructor finalizes firstString.
    344             if(U_FAILURE(*pErrorCode) || firstString.length()>firstCapacity) {
    345                 // Restore the modified suffix of the first string.
    346                 // This does not restore first[] array contents between firstLength and firstCapacity.
    347                 // (That might be uninitialized memory, as far as we know.)
    348                 if(first!=NULL) { /* don't dereference NULL */
    349                   safeMiddle.extract(0, 0x7fffffff, first+firstLength-safeMiddle.length());
    350                   if(firstLength<firstCapacity) {
    351                     first[firstLength]=0;  // NUL-terminate in case it was originally.
    352                   }
    353                 }
    354             }
    355         } else {
    356             UnicodeString secondString(secondLength<0, second, secondLength);
    357             if(doNormalize) {
    358                 n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode);
    359             } else {
    360                 n2->append(firstString, secondString, *pErrorCode);
    361             }
    362         }
    363     }
    364     return firstString.extract(first, firstCapacity, *pErrorCode);
    365 }
    366 
    367 U_CAPI int32_t U_EXPORT2
    368 unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
    369                                 UChar *first, int32_t firstLength, int32_t firstCapacity,
    370                                 const UChar *second, int32_t secondLength,
    371                                 UErrorCode *pErrorCode) {
    372     return normalizeSecondAndAppend(norm2,
    373                                     first, firstLength, firstCapacity,
    374                                     second, secondLength,
    375                                     TRUE, pErrorCode);
    376 }
    377 
    378 U_CAPI int32_t U_EXPORT2
    379 unorm2_append(const UNormalizer2 *norm2,
    380               UChar *first, int32_t firstLength, int32_t firstCapacity,
    381               const UChar *second, int32_t secondLength,
    382               UErrorCode *pErrorCode) {
    383     return normalizeSecondAndAppend(norm2,
    384                                     first, firstLength, firstCapacity,
    385                                     second, secondLength,
    386                                     FALSE, pErrorCode);
    387 }
    388 
    389 U_CAPI int32_t U_EXPORT2
    390 unorm2_getDecomposition(const UNormalizer2 *norm2,
    391                         UChar32 c, UChar *decomposition, int32_t capacity,
    392                         UErrorCode *pErrorCode) {
    393     if(U_FAILURE(*pErrorCode)) {
    394         return 0;
    395     }
    396     if(decomposition==NULL ? capacity!=0 : capacity<0) {
    397         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    398         return 0;
    399     }
    400     UnicodeString destString(decomposition, 0, capacity);
    401     if(reinterpret_cast<const Normalizer2 *>(norm2)->getDecomposition(c, destString)) {
    402         return destString.extract(decomposition, capacity, *pErrorCode);
    403     } else {
    404         return -1;
    405     }
    406 }
    407 
    408 U_CAPI int32_t U_EXPORT2
    409 unorm2_getRawDecomposition(const UNormalizer2 *norm2,
    410                            UChar32 c, UChar *decomposition, int32_t capacity,
    411                            UErrorCode *pErrorCode) {
    412     if(U_FAILURE(*pErrorCode)) {
    413         return 0;
    414     }
    415     if(decomposition==NULL ? capacity!=0 : capacity<0) {
    416         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    417         return 0;
    418     }
    419     UnicodeString destString(decomposition, 0, capacity);
    420     if(reinterpret_cast<const Normalizer2 *>(norm2)->getRawDecomposition(c, destString)) {
    421         return destString.extract(decomposition, capacity, *pErrorCode);
    422     } else {
    423         return -1;
    424     }
    425 }
    426 
    427 U_CAPI UChar32 U_EXPORT2
    428 unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) {
    429     return reinterpret_cast<const Normalizer2 *>(norm2)->composePair(a, b);
    430 }
    431 
    432 U_CAPI uint8_t U_EXPORT2
    433 unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
    434     return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c);
    435 }
    436 
    437 U_CAPI UBool U_EXPORT2
    438 unorm2_isNormalized(const UNormalizer2 *norm2,
    439                     const UChar *s, int32_t length,
    440                     UErrorCode *pErrorCode) {
    441     if(U_FAILURE(*pErrorCode)) {
    442         return 0;
    443     }
    444     if((s==NULL && length!=0) || length<-1) {
    445         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    446         return 0;
    447     }
    448     UnicodeString sString(length<0, s, length);
    449     return ((const Normalizer2 *)norm2)->isNormalized(sString, *pErrorCode);
    450 }
    451 
    452 U_CAPI UNormalizationCheckResult U_EXPORT2
    453 unorm2_quickCheck(const UNormalizer2 *norm2,
    454                   const UChar *s, int32_t length,
    455                   UErrorCode *pErrorCode) {
    456     if(U_FAILURE(*pErrorCode)) {
    457         return UNORM_NO;
    458     }
    459     if((s==NULL && length!=0) || length<-1) {
    460         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    461         return UNORM_NO;
    462     }
    463     UnicodeString sString(length<0, s, length);
    464     return ((const Normalizer2 *)norm2)->quickCheck(sString, *pErrorCode);
    465 }
    466 
    467 U_CAPI int32_t U_EXPORT2
    468 unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
    469                          const UChar *s, int32_t length,
    470                          UErrorCode *pErrorCode) {
    471     if(U_FAILURE(*pErrorCode)) {
    472         return 0;
    473     }
    474     if((s==NULL && length!=0) || length<-1) {
    475         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    476         return 0;
    477     }
    478     UnicodeString sString(length<0, s, length);
    479     return ((const Normalizer2 *)norm2)->spanQuickCheckYes(sString, *pErrorCode);
    480 }
    481 
    482 U_CAPI UBool U_EXPORT2
    483 unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) {
    484     return ((const Normalizer2 *)norm2)->hasBoundaryBefore(c);
    485 }
    486 
    487 U_CAPI UBool U_EXPORT2
    488 unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) {
    489     return ((const Normalizer2 *)norm2)->hasBoundaryAfter(c);
    490 }
    491 
    492 U_CAPI UBool U_EXPORT2
    493 unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) {
    494     return ((const Normalizer2 *)norm2)->isInert(c);
    495 }
    496 
    497 // Some properties APIs ---------------------------------------------------- ***
    498 
    499 U_CAPI uint8_t U_EXPORT2
    500 u_getCombiningClass(UChar32 c) {
    501     UErrorCode errorCode=U_ZERO_ERROR;
    502     const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
    503     if(U_SUCCESS(errorCode)) {
    504         return nfd->getCombiningClass(c);
    505     } else {
    506         return 0;
    507     }
    508 }
    509 
    510 U_CFUNC uint16_t
    511 unorm_getFCD16(UChar32 c) {
    512     UErrorCode errorCode=U_ZERO_ERROR;
    513     const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
    514     if(U_SUCCESS(errorCode)) {
    515         return impl->getFCD16(c);
    516     } else {
    517         return 0;
    518     }
    519 }
    520 
    521 #endif  // !UCONFIG_NO_NORMALIZATION
    522