Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2009-2012, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  filterednormalizer2.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2009dec10
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_NORMALIZATION
     20 
     21 #include "unicode/normalizer2.h"
     22 #include "unicode/uniset.h"
     23 #include "unicode/unistr.h"
     24 #include "unicode/unorm.h"
     25 #include "cpputils.h"
     26 
     27 U_NAMESPACE_BEGIN
     28 
     29 FilteredNormalizer2::~FilteredNormalizer2() {}
     30 
     31 UnicodeString &
     32 FilteredNormalizer2::normalize(const UnicodeString &src,
     33                                UnicodeString &dest,
     34                                UErrorCode &errorCode) const {
     35     uprv_checkCanGetBuffer(src, errorCode);
     36     if(U_FAILURE(errorCode)) {
     37         dest.setToBogus();
     38         return dest;
     39     }
     40     if(&dest==&src) {
     41         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
     42         return dest;
     43     }
     44     dest.remove();
     45     return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
     46 }
     47 
     48 // Internal: No argument checking, and appends to dest.
     49 // Pass as input spanCondition the one that is likely to yield a non-zero
     50 // span length at the start of src.
     51 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
     52 // USET_SPAN_SIMPLE should be passed in for the start of src
     53 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
     54 // an in-filter prefix.
     55 UnicodeString &
     56 FilteredNormalizer2::normalize(const UnicodeString &src,
     57                                UnicodeString &dest,
     58                                USetSpanCondition spanCondition,
     59                                UErrorCode &errorCode) const {
     60     UnicodeString tempDest;  // Don't throw away destination buffer between iterations.
     61     for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
     62         int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
     63         int32_t spanLength=spanLimit-prevSpanLimit;
     64         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
     65             if(spanLength!=0) {
     66                 dest.append(src, prevSpanLimit, spanLength);
     67             }
     68             spanCondition=USET_SPAN_SIMPLE;
     69         } else {
     70             if(spanLength!=0) {
     71                 // Not norm2.normalizeSecondAndAppend() because we do not want
     72                 // to modify the non-filter part of dest.
     73                 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
     74                                             tempDest, errorCode));
     75                 if(U_FAILURE(errorCode)) {
     76                     break;
     77                 }
     78             }
     79             spanCondition=USET_SPAN_NOT_CONTAINED;
     80         }
     81         prevSpanLimit=spanLimit;
     82     }
     83     return dest;
     84 }
     85 
     86 UnicodeString &
     87 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
     88                                               const UnicodeString &second,
     89                                               UErrorCode &errorCode) const {
     90     return normalizeSecondAndAppend(first, second, TRUE, errorCode);
     91 }
     92 
     93 UnicodeString &
     94 FilteredNormalizer2::append(UnicodeString &first,
     95                             const UnicodeString &second,
     96                             UErrorCode &errorCode) const {
     97     return normalizeSecondAndAppend(first, second, FALSE, errorCode);
     98 }
     99 
    100 UnicodeString &
    101 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
    102                                               const UnicodeString &second,
    103                                               UBool doNormalize,
    104                                               UErrorCode &errorCode) const {
    105     uprv_checkCanGetBuffer(first, errorCode);
    106     uprv_checkCanGetBuffer(second, errorCode);
    107     if(U_FAILURE(errorCode)) {
    108         return first;
    109     }
    110     if(&first==&second) {
    111         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    112         return first;
    113     }
    114     if(first.isEmpty()) {
    115         if(doNormalize) {
    116             return normalize(second, first, errorCode);
    117         } else {
    118             return first=second;
    119         }
    120     }
    121     // merge the in-filter suffix of the first string with the in-filter prefix of the second
    122     int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
    123     if(prefixLimit!=0) {
    124         UnicodeString prefix(second.tempSubString(0, prefixLimit));
    125         int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
    126         if(suffixStart==0) {
    127             if(doNormalize) {
    128                 norm2.normalizeSecondAndAppend(first, prefix, errorCode);
    129             } else {
    130                 norm2.append(first, prefix, errorCode);
    131             }
    132         } else {
    133             UnicodeString middle(first, suffixStart, INT32_MAX);
    134             if(doNormalize) {
    135                 norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
    136             } else {
    137                 norm2.append(middle, prefix, errorCode);
    138             }
    139             first.replace(suffixStart, INT32_MAX, middle);
    140         }
    141     }
    142     if(prefixLimit<second.length()) {
    143         UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
    144         if(doNormalize) {
    145             normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
    146         } else {
    147             first.append(rest);
    148         }
    149     }
    150     return first;
    151 }
    152 
    153 UBool
    154 FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
    155     return set.contains(c) && norm2.getDecomposition(c, decomposition);
    156 }
    157 
    158 UBool
    159 FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
    160     return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
    161 }
    162 
    163 UChar32
    164 FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
    165     return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL;
    166 }
    167 
    168 uint8_t
    169 FilteredNormalizer2::getCombiningClass(UChar32 c) const {
    170     return set.contains(c) ? norm2.getCombiningClass(c) : 0;
    171 }
    172 
    173 UBool
    174 FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
    175     uprv_checkCanGetBuffer(s, errorCode);
    176     if(U_FAILURE(errorCode)) {
    177         return FALSE;
    178     }
    179     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
    180     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
    181         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
    182         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    183             spanCondition=USET_SPAN_SIMPLE;
    184         } else {
    185             if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
    186                 U_FAILURE(errorCode)
    187             ) {
    188                 return FALSE;
    189             }
    190             spanCondition=USET_SPAN_NOT_CONTAINED;
    191         }
    192         prevSpanLimit=spanLimit;
    193     }
    194     return TRUE;
    195 }
    196 
    197 UNormalizationCheckResult
    198 FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
    199     uprv_checkCanGetBuffer(s, errorCode);
    200     if(U_FAILURE(errorCode)) {
    201         return UNORM_MAYBE;
    202     }
    203     UNormalizationCheckResult result=UNORM_YES;
    204     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
    205     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
    206         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
    207         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    208             spanCondition=USET_SPAN_SIMPLE;
    209         } else {
    210             UNormalizationCheckResult qcResult=
    211                 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
    212             if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
    213                 return qcResult;
    214             } else if(qcResult==UNORM_MAYBE) {
    215                 result=qcResult;
    216             }
    217             spanCondition=USET_SPAN_NOT_CONTAINED;
    218         }
    219         prevSpanLimit=spanLimit;
    220     }
    221     return result;
    222 }
    223 
    224 int32_t
    225 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
    226     uprv_checkCanGetBuffer(s, errorCode);
    227     if(U_FAILURE(errorCode)) {
    228         return 0;
    229     }
    230     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
    231     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
    232         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
    233         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    234             spanCondition=USET_SPAN_SIMPLE;
    235         } else {
    236             int32_t yesLimit=
    237                 prevSpanLimit+
    238                 norm2.spanQuickCheckYes(
    239                     s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
    240             if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
    241                 return yesLimit;
    242             }
    243             spanCondition=USET_SPAN_NOT_CONTAINED;
    244         }
    245         prevSpanLimit=spanLimit;
    246     }
    247     return s.length();
    248 }
    249 
    250 UBool
    251 FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
    252     return !set.contains(c) || norm2.hasBoundaryBefore(c);
    253 }
    254 
    255 UBool
    256 FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
    257     return !set.contains(c) || norm2.hasBoundaryAfter(c);
    258 }
    259 
    260 UBool
    261 FilteredNormalizer2::isInert(UChar32 c) const {
    262     return !set.contains(c) || norm2.isInert(c);
    263 }
    264 
    265 U_NAMESPACE_END
    266 
    267 // C API ------------------------------------------------------------------- ***
    268 
    269 U_NAMESPACE_USE
    270 
    271 U_CAPI UNormalizer2 * U_EXPORT2
    272 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
    273     if(U_FAILURE(*pErrorCode)) {
    274         return NULL;
    275     }
    276     if(filterSet==NULL) {
    277         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    278         return NULL;
    279     }
    280     Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
    281                                              *UnicodeSet::fromUSet(filterSet));
    282     if(fn2==NULL) {
    283         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    284     }
    285     return (UNormalizer2 *)fn2;
    286 }
    287 
    288 #endif  // !UCONFIG_NO_NORMALIZATION
    289