Home | History | Annotate | Download | only in common
// © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2009-2012, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  filterednormalizer2.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2009dec10
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 
     21 #if !UCONFIG_NO_NORMALIZATION
     22 
     23 #include "unicode/edits.h"
     24 #include "unicode/normalizer2.h"
     25 #include "unicode/stringoptions.h"
     26 #include "unicode/uniset.h"
     27 #include "unicode/unistr.h"
     28 #include "unicode/unorm.h"
     29 #include "cpputils.h"
     30 
     31 U_NAMESPACE_BEGIN
     32 
     33 FilteredNormalizer2::~FilteredNormalizer2() {}
     34 
     35 UnicodeString &
     36 FilteredNormalizer2::normalize(const UnicodeString &src,
     37                                UnicodeString &dest,
     38                                UErrorCode &errorCode) const {
     39     uprv_checkCanGetBuffer(src, errorCode);
     40     if(U_FAILURE(errorCode)) {
     41         dest.setToBogus();
     42         return dest;
     43     }
     44     if(&dest==&src) {
     45         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
     46         return dest;
     47     }
     48     dest.remove();
     49     return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
     50 }
     51 
     52 // Internal: No argument checking, and appends to dest.
     53 // Pass as input spanCondition the one that is likely to yield a non-zero
     54 // span length at the start of src.
     55 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
     56 // USET_SPAN_SIMPLE should be passed in for the start of src
     57 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
     58 // an in-filter prefix.
     59 UnicodeString &
     60 FilteredNormalizer2::normalize(const UnicodeString &src,
     61                                UnicodeString &dest,
     62                                USetSpanCondition spanCondition,
     63                                UErrorCode &errorCode) const {
     64     UnicodeString tempDest;  // Don't throw away destination buffer between iterations.
     65     for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
     66         int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
     67         int32_t spanLength=spanLimit-prevSpanLimit;
     68         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
     69             if(spanLength!=0) {
     70                 dest.append(src, prevSpanLimit, spanLength);
     71             }
     72             spanCondition=USET_SPAN_SIMPLE;
     73         } else {
     74             if(spanLength!=0) {
     75                 // Not norm2.normalizeSecondAndAppend() because we do not want
     76                 // to modify the non-filter part of dest.
     77                 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
     78                                             tempDest, errorCode));
     79                 if(U_FAILURE(errorCode)) {
     80                     break;
     81                 }
     82             }
     83             spanCondition=USET_SPAN_NOT_CONTAINED;
     84         }
     85         prevSpanLimit=spanLimit;
     86     }
     87     return dest;
     88 }
     89 
     90 void
     91 FilteredNormalizer2::normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
     92                                    Edits *edits, UErrorCode &errorCode) const {
     93     if (U_FAILURE(errorCode)) {
     94         return;
     95     }
     96     if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
     97         edits->reset();
     98     }
     99     options |= U_EDITS_NO_RESET;  // Do not reset for each span.
    100     normalizeUTF8(options, src.data(), src.length(), sink, edits, USET_SPAN_SIMPLE, errorCode);
    101 }
    102 
    103 void
    104 FilteredNormalizer2::normalizeUTF8(uint32_t options, const char *src, int32_t length,
    105                                    ByteSink &sink, Edits *edits,
    106                                    USetSpanCondition spanCondition,
    107                                    UErrorCode &errorCode) const {
    108     while (length > 0) {
    109         int32_t spanLength = set.spanUTF8(src, length, spanCondition);
    110         if (spanCondition == USET_SPAN_NOT_CONTAINED) {
    111             if (spanLength != 0) {
    112                 if (edits != nullptr) {
    113                     edits->addUnchanged(spanLength);
    114                 }
    115                 if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
    116                     sink.Append(src, spanLength);
    117                 }
    118             }
    119             spanCondition = USET_SPAN_SIMPLE;
    120         } else {
    121             if (spanLength != 0) {
    122                 // Not norm2.normalizeSecondAndAppend() because we do not want
    123                 // to modify the non-filter part of dest.
    124                 norm2.normalizeUTF8(options, StringPiece(src, spanLength), sink, edits, errorCode);
    125                 if (U_FAILURE(errorCode)) {
    126                     break;
    127                 }
    128             }
    129             spanCondition = USET_SPAN_NOT_CONTAINED;
    130         }
    131         src += spanLength;
    132         length -= spanLength;
    133     }
    134 }
    135 
    136 UnicodeString &
    137 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
    138                                               const UnicodeString &second,
    139                                               UErrorCode &errorCode) const {
    140     return normalizeSecondAndAppend(first, second, TRUE, errorCode);
    141 }
    142 
    143 UnicodeString &
    144 FilteredNormalizer2::append(UnicodeString &first,
    145                             const UnicodeString &second,
    146                             UErrorCode &errorCode) const {
    147     return normalizeSecondAndAppend(first, second, FALSE, errorCode);
    148 }
    149 
    150 UnicodeString &
    151 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
    152                                               const UnicodeString &second,
    153                                               UBool doNormalize,
    154                                               UErrorCode &errorCode) const {
    155     uprv_checkCanGetBuffer(first, errorCode);
    156     uprv_checkCanGetBuffer(second, errorCode);
    157     if(U_FAILURE(errorCode)) {
    158         return first;
    159     }
    160     if(&first==&second) {
    161         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    162         return first;
    163     }
    164     if(first.isEmpty()) {
    165         if(doNormalize) {
    166             return normalize(second, first, errorCode);
    167         } else {
    168             return first=second;
    169         }
    170     }
    171     // merge the in-filter suffix of the first string with the in-filter prefix of the second
    172     int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
    173     if(prefixLimit!=0) {
    174         UnicodeString prefix(second.tempSubString(0, prefixLimit));
    175         int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
    176         if(suffixStart==0) {
    177             if(doNormalize) {
    178                 norm2.normalizeSecondAndAppend(first, prefix, errorCode);
    179             } else {
    180                 norm2.append(first, prefix, errorCode);
    181             }
    182         } else {
    183             UnicodeString middle(first, suffixStart, INT32_MAX);
    184             if(doNormalize) {
    185                 norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
    186             } else {
    187                 norm2.append(middle, prefix, errorCode);
    188             }
    189             first.replace(suffixStart, INT32_MAX, middle);
    190         }
    191     }
    192     if(prefixLimit<second.length()) {
    193         UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
    194         if(doNormalize) {
    195             normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
    196         } else {
    197             first.append(rest);
    198         }
    199     }
    200     return first;
    201 }
    202 
    203 UBool
    204 FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
    205     return set.contains(c) && norm2.getDecomposition(c, decomposition);
    206 }
    207 
    208 UBool
    209 FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
    210     return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
    211 }
    212 
    213 UChar32
    214 FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
    215     return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL;
    216 }
    217 
    218 uint8_t
    219 FilteredNormalizer2::getCombiningClass(UChar32 c) const {
    220     return set.contains(c) ? norm2.getCombiningClass(c) : 0;
    221 }
    222 
    223 UBool
    224 FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
    225     uprv_checkCanGetBuffer(s, errorCode);
    226     if(U_FAILURE(errorCode)) {
    227         return FALSE;
    228     }
    229     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
    230     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
    231         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
    232         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    233             spanCondition=USET_SPAN_SIMPLE;
    234         } else {
    235             if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
    236                 U_FAILURE(errorCode)
    237             ) {
    238                 return FALSE;
    239             }
    240             spanCondition=USET_SPAN_NOT_CONTAINED;
    241         }
    242         prevSpanLimit=spanLimit;
    243     }
    244     return TRUE;
    245 }
    246 
    247 UBool
    248 FilteredNormalizer2::isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const {
    249     if(U_FAILURE(errorCode)) {
    250         return FALSE;
    251     }
    252     const char *s = sp.data();
    253     int32_t length = sp.length();
    254     USetSpanCondition spanCondition = USET_SPAN_SIMPLE;
    255     while (length > 0) {
    256         int32_t spanLength = set.spanUTF8(s, length, spanCondition);
    257         if (spanCondition == USET_SPAN_NOT_CONTAINED) {
    258             spanCondition = USET_SPAN_SIMPLE;
    259         } else {
    260             if (!norm2.isNormalizedUTF8(StringPiece(s, spanLength), errorCode) ||
    261                     U_FAILURE(errorCode)) {
    262                 return FALSE;
    263             }
    264             spanCondition = USET_SPAN_NOT_CONTAINED;
    265         }
    266         s += spanLength;
    267         length -= spanLength;
    268     }
    269     return TRUE;
    270 }
    271 
    272 UNormalizationCheckResult
    273 FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
    274     uprv_checkCanGetBuffer(s, errorCode);
    275     if(U_FAILURE(errorCode)) {
    276         return UNORM_MAYBE;
    277     }
    278     UNormalizationCheckResult result=UNORM_YES;
    279     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
    280     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
    281         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
    282         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    283             spanCondition=USET_SPAN_SIMPLE;
    284         } else {
    285             UNormalizationCheckResult qcResult=
    286                 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
    287             if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
    288                 return qcResult;
    289             } else if(qcResult==UNORM_MAYBE) {
    290                 result=qcResult;
    291             }
    292             spanCondition=USET_SPAN_NOT_CONTAINED;
    293         }
    294         prevSpanLimit=spanLimit;
    295     }
    296     return result;
    297 }
    298 
    299 int32_t
    300 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
    301     uprv_checkCanGetBuffer(s, errorCode);
    302     if(U_FAILURE(errorCode)) {
    303         return 0;
    304     }
    305     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
    306     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
    307         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
    308         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    309             spanCondition=USET_SPAN_SIMPLE;
    310         } else {
    311             int32_t yesLimit=
    312                 prevSpanLimit+
    313                 norm2.spanQuickCheckYes(
    314                     s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
    315             if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
    316                 return yesLimit;
    317             }
    318             spanCondition=USET_SPAN_NOT_CONTAINED;
    319         }
    320         prevSpanLimit=spanLimit;
    321     }
    322     return s.length();
    323 }
    324 
    325 UBool
    326 FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
    327     return !set.contains(c) || norm2.hasBoundaryBefore(c);
    328 }
    329 
    330 UBool
    331 FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
    332     return !set.contains(c) || norm2.hasBoundaryAfter(c);
    333 }
    334 
    335 UBool
    336 FilteredNormalizer2::isInert(UChar32 c) const {
    337     return !set.contains(c) || norm2.isInert(c);
    338 }
    339 
    340 U_NAMESPACE_END
    341 
    342 // C API ------------------------------------------------------------------- ***
    343 
    344 U_NAMESPACE_USE
    345 
    346 U_CAPI UNormalizer2 * U_EXPORT2
    347 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
    348     if(U_FAILURE(*pErrorCode)) {
    349         return NULL;
    350     }
    351     if(filterSet==NULL) {
    352         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    353         return NULL;
    354     }
    355     Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
    356                                              *UnicodeSet::fromUSet(filterSet));
    357     if(fn2==NULL) {
    358         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    359     }
    360     return (UNormalizer2 *)fn2;
    361 }
    362 
    363 #endif  // !UCONFIG_NO_NORMALIZATION
    364