Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2009-2010, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  filterednormalizer2.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2009dec10
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_NORMALIZATION
     20 
     21 #include "unicode/normalizer2.h"
     22 #include "unicode/uniset.h"
     23 #include "unicode/unistr.h"
     24 #include "unicode/unorm.h"
     25 #include "cpputils.h"
     26 
     27 U_NAMESPACE_BEGIN
     28 
     29 UnicodeString &
     30 FilteredNormalizer2::normalize(const UnicodeString &src,
     31                                UnicodeString &dest,
     32                                UErrorCode &errorCode) const {
     33     uprv_checkCanGetBuffer(src, errorCode);
     34     if(U_FAILURE(errorCode)) {
     35         dest.setToBogus();
     36         return dest;
     37     }
     38     if(&dest==&src) {
     39         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
     40         return dest;
     41     }
     42     dest.remove();
     43     return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
     44 }
     45 
     46 // Internal: No argument checking, and appends to dest.
     47 // Pass as input spanCondition the one that is likely to yield a non-zero
     48 // span length at the start of src.
     49 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
     50 // USET_SPAN_SIMPLE should be passed in for the start of src
     51 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
     52 // an in-filter prefix.
     53 UnicodeString &
     54 FilteredNormalizer2::normalize(const UnicodeString &src,
     55                                UnicodeString &dest,
     56                                USetSpanCondition spanCondition,
     57                                UErrorCode &errorCode) const {
     58     UnicodeString tempDest;  // Don't throw away destination buffer between iterations.
     59     for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
     60         int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
     61         int32_t spanLength=spanLimit-prevSpanLimit;
     62         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
     63             if(spanLength!=0) {
     64                 dest.append(src, prevSpanLimit, spanLength);
     65             }
     66             spanCondition=USET_SPAN_SIMPLE;
     67         } else {
     68             if(spanLength!=0) {
     69                 // Not norm2.normalizeSecondAndAppend() because we do not want
     70                 // to modify the non-filter part of dest.
     71                 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
     72                                             tempDest, errorCode));
     73                 if(U_FAILURE(errorCode)) {
     74                     break;
     75                 }
     76             }
     77             spanCondition=USET_SPAN_NOT_CONTAINED;
     78         }
     79         prevSpanLimit=spanLimit;
     80     }
     81     return dest;
     82 }
     83 
     84 UnicodeString &
     85 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
     86                                               const UnicodeString &second,
     87                                               UErrorCode &errorCode) const {
     88     return normalizeSecondAndAppend(first, second, TRUE, errorCode);
     89 }
     90 
     91 UnicodeString &
     92 FilteredNormalizer2::append(UnicodeString &first,
     93                             const UnicodeString &second,
     94                             UErrorCode &errorCode) const {
     95     return normalizeSecondAndAppend(first, second, FALSE, errorCode);
     96 }
     97 
     98 UnicodeString &
     99 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
    100                                               const UnicodeString &second,
    101                                               UBool doNormalize,
    102                                               UErrorCode &errorCode) const {
    103     uprv_checkCanGetBuffer(first, errorCode);
    104     uprv_checkCanGetBuffer(second, errorCode);
    105     if(U_FAILURE(errorCode)) {
    106         return first;
    107     }
    108     if(&first==&second) {
    109         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    110         return first;
    111     }
    112     if(first.isEmpty()) {
    113         if(doNormalize) {
    114             return normalize(second, first, errorCode);
    115         } else {
    116             return first=second;
    117         }
    118     }
    119     // merge the in-filter suffix of the first string with the in-filter prefix of the second
    120     int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
    121     if(prefixLimit!=0) {
    122         UnicodeString prefix(second.tempSubString(0, prefixLimit));
    123         int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
    124         if(suffixStart==0) {
    125             if(doNormalize) {
    126                 norm2.normalizeSecondAndAppend(first, prefix, errorCode);
    127             } else {
    128                 norm2.append(first, prefix, errorCode);
    129             }
    130         } else {
    131             UnicodeString middle(first, suffixStart, INT32_MAX);
    132             if(doNormalize) {
    133                 norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
    134             } else {
    135                 norm2.append(middle, prefix, errorCode);
    136             }
    137             first.replace(suffixStart, INT32_MAX, middle);
    138         }
    139     }
    140     if(prefixLimit<second.length()) {
    141         UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
    142         if(doNormalize) {
    143             normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
    144         } else {
    145             first.append(rest);
    146         }
    147     }
    148     return first;
    149 }
    150 
    151 UBool
    152 FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
    153     return set.contains(c) && norm2.getDecomposition(c, decomposition);
    154 }
    155 
    156 UBool
    157 FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
    158     uprv_checkCanGetBuffer(s, errorCode);
    159     if(U_FAILURE(errorCode)) {
    160         return FALSE;
    161     }
    162     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
    163     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
    164         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
    165         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    166             spanCondition=USET_SPAN_SIMPLE;
    167         } else {
    168             if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
    169                 U_FAILURE(errorCode)
    170             ) {
    171                 return FALSE;
    172             }
    173             spanCondition=USET_SPAN_NOT_CONTAINED;
    174         }
    175         prevSpanLimit=spanLimit;
    176     }
    177     return TRUE;
    178 }
    179 
    180 UNormalizationCheckResult
    181 FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
    182     uprv_checkCanGetBuffer(s, errorCode);
    183     if(U_FAILURE(errorCode)) {
    184         return UNORM_MAYBE;
    185     }
    186     UNormalizationCheckResult result=UNORM_YES;
    187     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
    188     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
    189         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
    190         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    191             spanCondition=USET_SPAN_SIMPLE;
    192         } else {
    193             UNormalizationCheckResult qcResult=
    194                 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
    195             if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
    196                 return qcResult;
    197             } else if(qcResult==UNORM_MAYBE) {
    198                 result=qcResult;
    199             }
    200             spanCondition=USET_SPAN_NOT_CONTAINED;
    201         }
    202         prevSpanLimit=spanLimit;
    203     }
    204     return result;
    205 }
    206 
    207 int32_t
    208 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
    209     uprv_checkCanGetBuffer(s, errorCode);
    210     if(U_FAILURE(errorCode)) {
    211         return 0;
    212     }
    213     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
    214     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
    215         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
    216         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    217             spanCondition=USET_SPAN_SIMPLE;
    218         } else {
    219             int32_t yesLimit=
    220                 prevSpanLimit+
    221                 norm2.spanQuickCheckYes(
    222                     s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
    223             if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
    224                 return yesLimit;
    225             }
    226             spanCondition=USET_SPAN_NOT_CONTAINED;
    227         }
    228         prevSpanLimit=spanLimit;
    229     }
    230     return s.length();
    231 }
    232 
    233 UBool
    234 FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
    235     return !set.contains(c) || norm2.hasBoundaryBefore(c);
    236 }
    237 
    238 UBool
    239 FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
    240     return !set.contains(c) || norm2.hasBoundaryAfter(c);
    241 }
    242 
    243 UBool
    244 FilteredNormalizer2::isInert(UChar32 c) const {
    245     return !set.contains(c) || norm2.isInert(c);
    246 }
    247 
    248 U_NAMESPACE_END
    249 
    250 // C API ------------------------------------------------------------------- ***
    251 
    252 U_NAMESPACE_USE
    253 
    254 U_DRAFT UNormalizer2 * U_EXPORT2
    255 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
    256     if(U_FAILURE(*pErrorCode)) {
    257         return NULL;
    258     }
    259     if(filterSet==NULL) {
    260         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    261         return NULL;
    262     }
    263     Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
    264                                              *UnicodeSet::fromUSet(filterSet));
    265     if(fn2==NULL) {
    266         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    267     }
    268     return (UNormalizer2 *)fn2;
    269 }
    270 
    271 #endif  // !UCONFIG_NO_NORMALIZATION
    272