Home | History | Annotate | Download | only in common
      1 //  2018 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 // characterproperties.cpp
      5 // created: 2018sep03 Markus W. Scherer
      6 
      7 #include "unicode/utypes.h"
      8 #include "unicode/localpointer.h"
      9 #include "unicode/uchar.h"
     10 #include "unicode/ucpmap.h"
     11 #include "unicode/ucptrie.h"
     12 #include "unicode/umutablecptrie.h"
     13 #include "unicode/uniset.h"
     14 #include "unicode/uscript.h"
     15 #include "unicode/uset.h"
     16 #include "cmemory.h"
     17 #include "mutex.h"
     18 #include "normalizer2impl.h"
     19 #include "uassert.h"
     20 #include "ubidi_props.h"
     21 #include "ucase.h"
     22 #include "ucln_cmn.h"
     23 #include "umutex.h"
     24 #include "uprops.h"
     25 
     26 using icu::LocalPointer;
     27 using icu::Normalizer2Factory;
     28 using icu::Normalizer2Impl;
     29 using icu::UInitOnce;
     30 using icu::UnicodeSet;
     31 
     32 namespace {
     33 
// Forward declaration; the definition follows the USetAdder callbacks below.
// It is handed to ucln_common_registerCleanup() by the init functions.
UBool U_CALLCONV characterproperties_cleanup();

// gInclusions[] has one slot per property source (indexed directly by the
// UPropertySource value), followed by one slot per integer-valued property
// (indexed by UPROPS_SRC_COUNT + prop - UCHAR_INT_START).
constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + UCHAR_INT_LIMIT - UCHAR_INT_START;

// A lazily built inclusions set paired with the init-once flag that is passed
// to umtx_initOnce() when the set is first requested.
struct Inclusion {
    UnicodeSet  *fSet;       // owned; deleted in characterproperties_cleanup()
    UInitOnce    fInitOnce;  // reset in characterproperties_cleanup()
};
Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()

// Cache for u_getBinaryPropertySet(), indexed by the binary property value.
UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};

// Cache for u_getIntPropertyMap(), indexed by (property - UCHAR_INT_START).
UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};

// Held by u_getBinaryPropertySet() and u_getIntPropertyMap() while they
// read and lazily fill sets[] and maps[].
UMutex cpMutex = U_MUTEX_INITIALIZER;
     49 
     50 //----------------------------------------------------------------
     51 // Inclusions list
     52 //----------------------------------------------------------------
     53 
     54 // USetAdder implementation
     55 // Does not use uset.h to reduce code dependencies
     56 void U_CALLCONV
     57 _set_add(USet *set, UChar32 c) {
     58     ((UnicodeSet *)set)->add(c);
     59 }
     60 
     61 void U_CALLCONV
     62 _set_addRange(USet *set, UChar32 start, UChar32 end) {
     63     ((UnicodeSet *)set)->add(start, end);
     64 }
     65 
     66 void U_CALLCONV
     67 _set_addString(USet *set, const UChar *str, int32_t length) {
     68     ((UnicodeSet *)set)->add(icu::UnicodeString((UBool)(length<0), str, length));
     69 }
     70 
     71 UBool U_CALLCONV characterproperties_cleanup() {
     72     for (Inclusion &in: gInclusions) {
     73         delete in.fSet;
     74         in.fSet = nullptr;
     75         in.fInitOnce.reset();
     76     }
     77     for (int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
     78         delete sets[i];
     79         sets[i] = nullptr;
     80     }
     81     for (int32_t i = 0; i < UPRV_LENGTHOF(maps); ++i) {
     82         ucptrie_close(reinterpret_cast<UCPTrie *>(maps[i]));
     83         maps[i] = nullptr;
     84     }
     85     return TRUE;
     86 }
     87 
     88 void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
     89     // This function is invoked only via umtx_initOnce().
     90     U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
     91     if (src == UPROPS_SRC_NONE) {
     92         errorCode = U_INTERNAL_PROGRAM_ERROR;
     93         return;
     94     }
     95     U_ASSERT(gInclusions[src].fSet == nullptr);
     96 
     97     LocalPointer<UnicodeSet> incl(new UnicodeSet());
     98     if (incl.isNull()) {
     99         errorCode = U_MEMORY_ALLOCATION_ERROR;
    100         return;
    101     }
    102     USetAdder sa = {
    103         (USet *)incl.getAlias(),
    104         _set_add,
    105         _set_addRange,
    106         _set_addString,
    107         nullptr, // don't need remove()
    108         nullptr // don't need removeRange()
    109     };
    110 
    111     switch(src) {
    112     case UPROPS_SRC_CHAR:
    113         uchar_addPropertyStarts(&sa, &errorCode);
    114         break;
    115     case UPROPS_SRC_PROPSVEC:
    116         upropsvec_addPropertyStarts(&sa, &errorCode);
    117         break;
    118     case UPROPS_SRC_CHAR_AND_PROPSVEC:
    119         uchar_addPropertyStarts(&sa, &errorCode);
    120         upropsvec_addPropertyStarts(&sa, &errorCode);
    121         break;
    122 #if !UCONFIG_NO_NORMALIZATION
    123     case UPROPS_SRC_CASE_AND_NORM: {
    124         const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
    125         if(U_SUCCESS(errorCode)) {
    126             impl->addPropertyStarts(&sa, errorCode);
    127         }
    128         ucase_addPropertyStarts(&sa, &errorCode);
    129         break;
    130     }
    131     case UPROPS_SRC_NFC: {
    132         const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
    133         if(U_SUCCESS(errorCode)) {
    134             impl->addPropertyStarts(&sa, errorCode);
    135         }
    136         break;
    137     }
    138     case UPROPS_SRC_NFKC: {
    139         const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(errorCode);
    140         if(U_SUCCESS(errorCode)) {
    141             impl->addPropertyStarts(&sa, errorCode);
    142         }
    143         break;
    144     }
    145     case UPROPS_SRC_NFKC_CF: {
    146         const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(errorCode);
    147         if(U_SUCCESS(errorCode)) {
    148             impl->addPropertyStarts(&sa, errorCode);
    149         }
    150         break;
    151     }
    152     case UPROPS_SRC_NFC_CANON_ITER: {
    153         const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
    154         if(U_SUCCESS(errorCode)) {
    155             impl->addCanonIterPropertyStarts(&sa, errorCode);
    156         }
    157         break;
    158     }
    159 #endif
    160     case UPROPS_SRC_CASE:
    161         ucase_addPropertyStarts(&sa, &errorCode);
    162         break;
    163     case UPROPS_SRC_BIDI:
    164         ubidi_addPropertyStarts(&sa, &errorCode);
    165         break;
    166     case UPROPS_SRC_INPC:
    167     case UPROPS_SRC_INSC:
    168     case UPROPS_SRC_VO:
    169         uprops_addPropertyStarts((UPropertySource)src, &sa, &errorCode);
    170         break;
    171     default:
    172         errorCode = U_INTERNAL_PROGRAM_ERROR;
    173         break;
    174     }
    175 
    176     if (U_FAILURE(errorCode)) {
    177         return;
    178     }
    179     if (incl->isBogus()) {
    180         errorCode = U_MEMORY_ALLOCATION_ERROR;
    181         return;
    182     }
    183     // Compact for caching.
    184     incl->compact();
    185     gInclusions[src].fSet = incl.orphan();
    186     ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
    187 }
    188 
    189 const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
    190     if (U_FAILURE(errorCode)) { return nullptr; }
    191     if (src < 0 || UPROPS_SRC_COUNT <= src) {
    192         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    193         return nullptr;
    194     }
    195     Inclusion &i = gInclusions[src];
    196     umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode);
    197     return i.fSet;
    198 }
    199 
    200 void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
    201     // This function is invoked only via umtx_initOnce().
    202     U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
    203     int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START;
    204     U_ASSERT(gInclusions[inclIndex].fSet == nullptr);
    205     UPropertySource src = uprops_getSource(prop);
    206     const UnicodeSet *incl = getInclusionsForSource(src, errorCode);
    207     if (U_FAILURE(errorCode)) {
    208         return;
    209     }
    210 
    211     LocalPointer<UnicodeSet> intPropIncl(new UnicodeSet(0, 0));
    212     if (intPropIncl.isNull()) {
    213         errorCode = U_MEMORY_ALLOCATION_ERROR;
    214         return;
    215     }
    216     int32_t numRanges = incl->getRangeCount();
    217     int32_t prevValue = 0;
    218     for (int32_t i = 0; i < numRanges; ++i) {
    219         UChar32 rangeEnd = incl->getRangeEnd(i);
    220         for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) {
    221             // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
    222             int32_t value = u_getIntPropertyValue(c, prop);
    223             if (value != prevValue) {
    224                 intPropIncl->add(c);
    225                 prevValue = value;
    226             }
    227         }
    228     }
    229 
    230     if (intPropIncl->isBogus()) {
    231         errorCode = U_MEMORY_ALLOCATION_ERROR;
    232         return;
    233     }
    234     // Compact for caching.
    235     intPropIncl->compact();
    236     gInclusions[inclIndex].fSet = intPropIncl.orphan();
    237     ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
    238 }
    239 
    240 }  // namespace
    241 
    242 U_NAMESPACE_BEGIN
    243 
    244 const UnicodeSet *CharacterProperties::getInclusionsForProperty(
    245         UProperty prop, UErrorCode &errorCode) {
    246     if (U_FAILURE(errorCode)) { return nullptr; }
    247     if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
    248         int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START;
    249         Inclusion &i = gInclusions[inclIndex];
    250         umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode);
    251         return i.fSet;
    252     } else {
    253         UPropertySource src = uprops_getSource(prop);
    254         return getInclusionsForSource(src, errorCode);
    255     }
    256 }
    257 
    258 U_NAMESPACE_END
    259 
    260 namespace {
    261 
    262 UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
    263     if (U_FAILURE(errorCode)) { return nullptr; }
    264     LocalPointer<UnicodeSet> set(new UnicodeSet());
    265     if (set.isNull()) {
    266         errorCode = U_MEMORY_ALLOCATION_ERROR;
    267         return nullptr;
    268     }
    269     const UnicodeSet *inclusions =
    270         icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
    271     if (U_FAILURE(errorCode)) { return nullptr; }
    272     int32_t numRanges = inclusions->getRangeCount();
    273     UChar32 startHasProperty = -1;
    274 
    275     for (int32_t i = 0; i < numRanges; ++i) {
    276         UChar32 rangeEnd = inclusions->getRangeEnd(i);
    277         for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
    278             // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
    279             if (u_hasBinaryProperty(c, property)) {
    280                 if (startHasProperty < 0) {
    281                     // Transition from false to true.
    282                     startHasProperty = c;
    283                 }
    284             } else if (startHasProperty >= 0) {
    285                 // Transition from true to false.
    286                 set->add(startHasProperty, c - 1);
    287                 startHasProperty = -1;
    288             }
    289         }
    290     }
    291     if (startHasProperty >= 0) {
    292         set->add(startHasProperty, 0x10FFFF);
    293     }
    294     set->freeze();
    295     return set.orphan();
    296 }
    297 
    298 UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
    299     if (U_FAILURE(errorCode)) { return nullptr; }
    300     uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
    301     icu::LocalUMutableCPTriePointer mutableTrie(
    302         umutablecptrie_open(nullValue, nullValue, &errorCode));
    303     const UnicodeSet *inclusions =
    304         icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
    305     if (U_FAILURE(errorCode)) { return nullptr; }
    306     int32_t numRanges = inclusions->getRangeCount();
    307     UChar32 start = 0;
    308     uint32_t value = nullValue;
    309 
    310     for (int32_t i = 0; i < numRanges; ++i) {
    311         UChar32 rangeEnd = inclusions->getRangeEnd(i);
    312         for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
    313             // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
    314             uint32_t nextValue = u_getIntPropertyValue(c, property);
    315             if (value != nextValue) {
    316                 if (value != nullValue) {
    317                     umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
    318                 }
    319                 start = c;
    320                 value = nextValue;
    321             }
    322         }
    323     }
    324     if (value != 0) {
    325         umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
    326     }
    327 
    328     UCPTrieType type;
    329     if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
    330         type = UCPTRIE_TYPE_FAST;
    331     } else {
    332         type = UCPTRIE_TYPE_SMALL;
    333     }
    334     UCPTrieValueWidth valueWidth;
    335     // TODO: UCharacterProperty.IntProperty
    336     int32_t max = u_getIntPropertyMaxValue(property);
    337     if (max <= 0xff) {
    338         valueWidth = UCPTRIE_VALUE_BITS_8;
    339     } else if (max <= 0xffff) {
    340         valueWidth = UCPTRIE_VALUE_BITS_16;
    341     } else {
    342         valueWidth = UCPTRIE_VALUE_BITS_32;
    343     }
    344     return reinterpret_cast<UCPMap *>(
    345         umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
    346 }
    347 
    348 }  // namespace
    349 
    350 U_NAMESPACE_USE
    351 
    352 U_CAPI const USet * U_EXPORT2
    353 u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
    354     if (U_FAILURE(*pErrorCode)) { return nullptr; }
    355     if (property < 0 || UCHAR_BINARY_LIMIT <= property) {
    356         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    357         return nullptr;
    358     }
    359     Mutex m(&cpMutex);
    360     UnicodeSet *set = sets[property];
    361     if (set == nullptr) {
    362         sets[property] = set = makeSet(property, *pErrorCode);
    363     }
    364     if (U_FAILURE(*pErrorCode)) { return nullptr; }
    365     return set->toUSet();
    366 }
    367 
    368 U_CAPI const UCPMap * U_EXPORT2
    369 u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
    370     if (U_FAILURE(*pErrorCode)) { return nullptr; }
    371     if (property < UCHAR_INT_START || UCHAR_INT_LIMIT <= property) {
    372         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    373         return nullptr;
    374     }
    375     Mutex m(&cpMutex);
    376     UCPMap *map = maps[property - UCHAR_INT_START];
    377     if (map == nullptr) {
    378         maps[property - UCHAR_INT_START] = map = makeMap(property, *pErrorCode);
    379     }
    380     return map;
    381 }
    382