Home | History | Annotate | Download | only in common
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2009-2012, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  filterednormalizer2.cpp
     11 *   encoding:   US-ASCII
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2009dec10
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 
     21 #if !UCONFIG_NO_NORMALIZATION
     22 
     23 #include "unicode/normalizer2.h"
     24 #include "unicode/uniset.h"
     25 #include "unicode/unistr.h"
     26 #include "unicode/unorm.h"
     27 #include "cpputils.h"
     28 
     29 U_NAMESPACE_BEGIN
     30 
     31 FilteredNormalizer2::~FilteredNormalizer2() {}
     32 
     33 UnicodeString &
     34 FilteredNormalizer2::normalize(const UnicodeString &src,
     35                                UnicodeString &dest,
     36                                UErrorCode &errorCode) const {
     37     uprv_checkCanGetBuffer(src, errorCode);
     38     if(U_FAILURE(errorCode)) {
     39         dest.setToBogus();
     40         return dest;
     41     }
     42     if(&dest==&src) {
     43         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
     44         return dest;
     45     }
     46     dest.remove();
     47     return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
     48 }
     49 
     50 // Internal: No argument checking, and appends to dest.
     51 // Pass as input spanCondition the one that is likely to yield a non-zero
     52 // span length at the start of src.
     53 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
     54 // USET_SPAN_SIMPLE should be passed in for the start of src
     55 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
     56 // an in-filter prefix.
     57 UnicodeString &
     58 FilteredNormalizer2::normalize(const UnicodeString &src,
     59                                UnicodeString &dest,
     60                                USetSpanCondition spanCondition,
     61                                UErrorCode &errorCode) const {
     62     UnicodeString tempDest;  // Don't throw away destination buffer between iterations.
     63     for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
     64         int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
     65         int32_t spanLength=spanLimit-prevSpanLimit;
     66         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
     67             if(spanLength!=0) {
     68                 dest.append(src, prevSpanLimit, spanLength);
     69             }
     70             spanCondition=USET_SPAN_SIMPLE;
     71         } else {
     72             if(spanLength!=0) {
     73                 // Not norm2.normalizeSecondAndAppend() because we do not want
     74                 // to modify the non-filter part of dest.
     75                 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
     76                                             tempDest, errorCode));
     77                 if(U_FAILURE(errorCode)) {
     78                     break;
     79                 }
     80             }
     81             spanCondition=USET_SPAN_NOT_CONTAINED;
     82         }
     83         prevSpanLimit=spanLimit;
     84     }
     85     return dest;
     86 }
     87 
     88 UnicodeString &
     89 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
     90                                               const UnicodeString &second,
     91                                               UErrorCode &errorCode) const {
     92     return normalizeSecondAndAppend(first, second, TRUE, errorCode);
     93 }
     94 
     95 UnicodeString &
     96 FilteredNormalizer2::append(UnicodeString &first,
     97                             const UnicodeString &second,
     98                             UErrorCode &errorCode) const {
     99     return normalizeSecondAndAppend(first, second, FALSE, errorCode);
    100 }
    101 
    102 UnicodeString &
    103 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
    104                                               const UnicodeString &second,
    105                                               UBool doNormalize,
    106                                               UErrorCode &errorCode) const {
    107     uprv_checkCanGetBuffer(first, errorCode);
    108     uprv_checkCanGetBuffer(second, errorCode);
    109     if(U_FAILURE(errorCode)) {
    110         return first;
    111     }
    112     if(&first==&second) {
    113         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    114         return first;
    115     }
    116     if(first.isEmpty()) {
    117         if(doNormalize) {
    118             return normalize(second, first, errorCode);
    119         } else {
    120             return first=second;
    121         }
    122     }
    123     // merge the in-filter suffix of the first string with the in-filter prefix of the second
    124     int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
    125     if(prefixLimit!=0) {
    126         UnicodeString prefix(second.tempSubString(0, prefixLimit));
    127         int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
    128         if(suffixStart==0) {
    129             if(doNormalize) {
    130                 norm2.normalizeSecondAndAppend(first, prefix, errorCode);
    131             } else {
    132                 norm2.append(first, prefix, errorCode);
    133             }
    134         } else {
    135             UnicodeString middle(first, suffixStart, INT32_MAX);
    136             if(doNormalize) {
    137                 norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
    138             } else {
    139                 norm2.append(middle, prefix, errorCode);
    140             }
    141             first.replace(suffixStart, INT32_MAX, middle);
    142         }
    143     }
    144     if(prefixLimit<second.length()) {
    145         UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
    146         if(doNormalize) {
    147             normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
    148         } else {
    149             first.append(rest);
    150         }
    151     }
    152     return first;
    153 }
    154 
    155 UBool
    156 FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
    157     return set.contains(c) && norm2.getDecomposition(c, decomposition);
    158 }
    159 
    160 UBool
    161 FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
    162     return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
    163 }
    164 
    165 UChar32
    166 FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
    167     return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL;
    168 }
    169 
    170 uint8_t
    171 FilteredNormalizer2::getCombiningClass(UChar32 c) const {
    172     return set.contains(c) ? norm2.getCombiningClass(c) : 0;
    173 }
    174 
    175 UBool
    176 FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
    177     uprv_checkCanGetBuffer(s, errorCode);
    178     if(U_FAILURE(errorCode)) {
    179         return FALSE;
    180     }
    181     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
    182     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
    183         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
    184         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    185             spanCondition=USET_SPAN_SIMPLE;
    186         } else {
    187             if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
    188                 U_FAILURE(errorCode)
    189             ) {
    190                 return FALSE;
    191             }
    192             spanCondition=USET_SPAN_NOT_CONTAINED;
    193         }
    194         prevSpanLimit=spanLimit;
    195     }
    196     return TRUE;
    197 }
    198 
    199 UNormalizationCheckResult
    200 FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
    201     uprv_checkCanGetBuffer(s, errorCode);
    202     if(U_FAILURE(errorCode)) {
    203         return UNORM_MAYBE;
    204     }
    205     UNormalizationCheckResult result=UNORM_YES;
    206     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
    207     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
    208         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
    209         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    210             spanCondition=USET_SPAN_SIMPLE;
    211         } else {
    212             UNormalizationCheckResult qcResult=
    213                 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
    214             if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
    215                 return qcResult;
    216             } else if(qcResult==UNORM_MAYBE) {
    217                 result=qcResult;
    218             }
    219             spanCondition=USET_SPAN_NOT_CONTAINED;
    220         }
    221         prevSpanLimit=spanLimit;
    222     }
    223     return result;
    224 }
    225 
    226 int32_t
    227 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
    228     uprv_checkCanGetBuffer(s, errorCode);
    229     if(U_FAILURE(errorCode)) {
    230         return 0;
    231     }
    232     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
    233     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
    234         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
    235         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    236             spanCondition=USET_SPAN_SIMPLE;
    237         } else {
    238             int32_t yesLimit=
    239                 prevSpanLimit+
    240                 norm2.spanQuickCheckYes(
    241                     s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
    242             if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
    243                 return yesLimit;
    244             }
    245             spanCondition=USET_SPAN_NOT_CONTAINED;
    246         }
    247         prevSpanLimit=spanLimit;
    248     }
    249     return s.length();
    250 }
    251 
    252 UBool
    253 FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
    254     return !set.contains(c) || norm2.hasBoundaryBefore(c);
    255 }
    256 
    257 UBool
    258 FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
    259     return !set.contains(c) || norm2.hasBoundaryAfter(c);
    260 }
    261 
    262 UBool
    263 FilteredNormalizer2::isInert(UChar32 c) const {
    264     return !set.contains(c) || norm2.isInert(c);
    265 }
    266 
    267 U_NAMESPACE_END
    268 
    269 // C API ------------------------------------------------------------------- ***
    270 
    271 U_NAMESPACE_USE
    272 
    273 U_CAPI UNormalizer2 * U_EXPORT2
    274 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
    275     if(U_FAILURE(*pErrorCode)) {
    276         return NULL;
    277     }
    278     if(filterSet==NULL) {
    279         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    280         return NULL;
    281     }
    282     Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
    283                                              *UnicodeSet::fromUSet(filterSet));
    284     if(fn2==NULL) {
    285         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    286     }
    287     return (UNormalizer2 *)fn2;
    288 }
    289 
    290 #endif  // !UCONFIG_NO_NORMALIZATION
    291