1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2009-2012, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: filterednormalizer2.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2009dec10 16 * created by: Markus W. Scherer 17 */ 18 19 #include "unicode/utypes.h" 20 21 #if !UCONFIG_NO_NORMALIZATION 22 23 #include "unicode/edits.h" 24 #include "unicode/normalizer2.h" 25 #include "unicode/stringoptions.h" 26 #include "unicode/uniset.h" 27 #include "unicode/unistr.h" 28 #include "unicode/unorm.h" 29 #include "cpputils.h" 30 31 U_NAMESPACE_BEGIN 32 33 FilteredNormalizer2::~FilteredNormalizer2() {} 34 35 UnicodeString & 36 FilteredNormalizer2::normalize(const UnicodeString &src, 37 UnicodeString &dest, 38 UErrorCode &errorCode) const { 39 uprv_checkCanGetBuffer(src, errorCode); 40 if(U_FAILURE(errorCode)) { 41 dest.setToBogus(); 42 return dest; 43 } 44 if(&dest==&src) { 45 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 46 return dest; 47 } 48 dest.remove(); 49 return normalize(src, dest, USET_SPAN_SIMPLE, errorCode); 50 } 51 52 // Internal: No argument checking, and appends to dest. 53 // Pass as input spanCondition the one that is likely to yield a non-zero 54 // span length at the start of src. 55 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2, 56 // USET_SPAN_SIMPLE should be passed in for the start of src 57 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after 58 // an in-filter prefix. 59 UnicodeString & 60 FilteredNormalizer2::normalize(const UnicodeString &src, 61 UnicodeString &dest, 62 USetSpanCondition spanCondition, 63 UErrorCode &errorCode) const { 64 UnicodeString tempDest; // Don't throw away destination buffer between iterations. 65 for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) { 66 int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition); 67 int32_t spanLength=spanLimit-prevSpanLimit; 68 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 69 if(spanLength!=0) { 70 dest.append(src, prevSpanLimit, spanLength); 71 } 72 spanCondition=USET_SPAN_SIMPLE; 73 } else { 74 if(spanLength!=0) { 75 // Not norm2.normalizeSecondAndAppend() because we do not want 76 // to modify the non-filter part of dest. 77 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit), 78 tempDest, errorCode)); 79 if(U_FAILURE(errorCode)) { 80 break; 81 } 82 } 83 spanCondition=USET_SPAN_NOT_CONTAINED; 84 } 85 prevSpanLimit=spanLimit; 86 } 87 return dest; 88 } 89 90 void 91 FilteredNormalizer2::normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, 92 Edits *edits, UErrorCode &errorCode) const { 93 if (U_FAILURE(errorCode)) { 94 return; 95 } 96 if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) { 97 edits->reset(); 98 } 99 options |= U_EDITS_NO_RESET; // Do not reset for each span. 100 normalizeUTF8(options, src.data(), src.length(), sink, edits, USET_SPAN_SIMPLE, errorCode); 101 } 102 103 void 104 FilteredNormalizer2::normalizeUTF8(uint32_t options, const char *src, int32_t length, 105 ByteSink &sink, Edits *edits, 106 USetSpanCondition spanCondition, 107 UErrorCode &errorCode) const { 108 while (length > 0) { 109 int32_t spanLength = set.spanUTF8(src, length, spanCondition); 110 if (spanCondition == USET_SPAN_NOT_CONTAINED) { 111 if (spanLength != 0) { 112 if (edits != nullptr) { 113 edits->addUnchanged(spanLength); 114 } 115 if ((options & U_OMIT_UNCHANGED_TEXT) == 0) { 116 sink.Append(src, spanLength); 117 } 118 } 119 spanCondition = USET_SPAN_SIMPLE; 120 } else { 121 if (spanLength != 0) { 122 // Not norm2.normalizeSecondAndAppend() because we do not want 123 // to modify the non-filter part of dest. 124 norm2.normalizeUTF8(options, StringPiece(src, spanLength), sink, edits, errorCode); 125 if (U_FAILURE(errorCode)) { 126 break; 127 } 128 } 129 spanCondition = USET_SPAN_NOT_CONTAINED; 130 } 131 src += spanLength; 132 length -= spanLength; 133 } 134 } 135 136 UnicodeString & 137 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, 138 const UnicodeString &second, 139 UErrorCode &errorCode) const { 140 return normalizeSecondAndAppend(first, second, TRUE, errorCode); 141 } 142 143 UnicodeString & 144 FilteredNormalizer2::append(UnicodeString &first, 145 const UnicodeString &second, 146 UErrorCode &errorCode) const { 147 return normalizeSecondAndAppend(first, second, FALSE, errorCode); 148 } 149 150 UnicodeString & 151 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, 152 const UnicodeString &second, 153 UBool doNormalize, 154 UErrorCode &errorCode) const { 155 uprv_checkCanGetBuffer(first, errorCode); 156 uprv_checkCanGetBuffer(second, errorCode); 157 if(U_FAILURE(errorCode)) { 158 return first; 159 } 160 if(&first==&second) { 161 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 162 return first; 163 } 164 if(first.isEmpty()) { 165 if(doNormalize) { 166 return normalize(second, first, errorCode); 167 } else { 168 return first=second; 169 } 170 } 171 // merge the in-filter suffix of the first string with the in-filter prefix of the second 172 int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE); 173 if(prefixLimit!=0) { 174 UnicodeString prefix(second.tempSubString(0, prefixLimit)); 175 int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE); 176 if(suffixStart==0) { 177 if(doNormalize) { 178 norm2.normalizeSecondAndAppend(first, prefix, errorCode); 179 } else { 180 norm2.append(first, prefix, errorCode); 181 } 182 } else { 183 UnicodeString middle(first, suffixStart, INT32_MAX); 184 if(doNormalize) { 185 norm2.normalizeSecondAndAppend(middle, prefix, errorCode); 186 } else { 187 norm2.append(middle, prefix, errorCode); 188 } 189 first.replace(suffixStart, INT32_MAX, middle); 190 } 191 } 192 if(prefixLimit<second.length()) { 193 UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX)); 194 if(doNormalize) { 195 normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode); 196 } else { 197 first.append(rest); 198 } 199 } 200 return first; 201 } 202 203 UBool 204 FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const { 205 return set.contains(c) && norm2.getDecomposition(c, decomposition); 206 } 207 208 UBool 209 FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const { 210 return set.contains(c) && norm2.getRawDecomposition(c, decomposition); 211 } 212 213 UChar32 214 FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const { 215 return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL; 216 } 217 218 uint8_t 219 FilteredNormalizer2::getCombiningClass(UChar32 c) const { 220 return set.contains(c) ? norm2.getCombiningClass(c) : 0; 221 } 222 223 UBool 224 FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const { 225 uprv_checkCanGetBuffer(s, errorCode); 226 if(U_FAILURE(errorCode)) { 227 return FALSE; 228 } 229 USetSpanCondition spanCondition=USET_SPAN_SIMPLE; 230 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { 231 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); 232 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 233 spanCondition=USET_SPAN_SIMPLE; 234 } else { 235 if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) || 236 U_FAILURE(errorCode) 237 ) { 238 return FALSE; 239 } 240 spanCondition=USET_SPAN_NOT_CONTAINED; 241 } 242 prevSpanLimit=spanLimit; 243 } 244 return TRUE; 245 } 246 247 UBool 248 FilteredNormalizer2::isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const { 249 if(U_FAILURE(errorCode)) { 250 return FALSE; 251 } 252 const char *s = sp.data(); 253 int32_t length = sp.length(); 254 USetSpanCondition spanCondition = USET_SPAN_SIMPLE; 255 while (length > 0) { 256 int32_t spanLength = set.spanUTF8(s, length, spanCondition); 257 if (spanCondition == USET_SPAN_NOT_CONTAINED) { 258 spanCondition = USET_SPAN_SIMPLE; 259 } else { 260 if (!norm2.isNormalizedUTF8(StringPiece(s, spanLength), errorCode) || 261 U_FAILURE(errorCode)) { 262 return FALSE; 263 } 264 spanCondition = USET_SPAN_NOT_CONTAINED; 265 } 266 s += spanLength; 267 length -= spanLength; 268 } 269 return TRUE; 270 } 271 272 UNormalizationCheckResult 273 FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const { 274 uprv_checkCanGetBuffer(s, errorCode); 275 if(U_FAILURE(errorCode)) { 276 return UNORM_MAYBE; 277 } 278 UNormalizationCheckResult result=UNORM_YES; 279 USetSpanCondition spanCondition=USET_SPAN_SIMPLE; 280 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { 281 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); 282 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 283 spanCondition=USET_SPAN_SIMPLE; 284 } else { 285 UNormalizationCheckResult qcResult= 286 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode); 287 if(U_FAILURE(errorCode) || qcResult==UNORM_NO) { 288 return qcResult; 289 } else if(qcResult==UNORM_MAYBE) { 290 result=qcResult; 291 } 292 spanCondition=USET_SPAN_NOT_CONTAINED; 293 } 294 prevSpanLimit=spanLimit; 295 } 296 return result; 297 } 298 299 int32_t 300 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const { 301 uprv_checkCanGetBuffer(s, errorCode); 302 if(U_FAILURE(errorCode)) { 303 return 0; 304 } 305 USetSpanCondition spanCondition=USET_SPAN_SIMPLE; 306 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { 307 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); 308 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 309 spanCondition=USET_SPAN_SIMPLE; 310 } else { 311 int32_t yesLimit= 312 prevSpanLimit+ 313 norm2.spanQuickCheckYes( 314 s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode); 315 if(U_FAILURE(errorCode) || yesLimit<spanLimit) { 316 return yesLimit; 317 } 318 spanCondition=USET_SPAN_NOT_CONTAINED; 319 } 320 prevSpanLimit=spanLimit; 321 } 322 return s.length(); 323 } 324 325 UBool 326 FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const { 327 return !set.contains(c) || norm2.hasBoundaryBefore(c); 328 } 329 330 UBool 331 FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const { 332 return !set.contains(c) || norm2.hasBoundaryAfter(c); 333 } 334 335 UBool 336 FilteredNormalizer2::isInert(UChar32 c) const { 337 return !set.contains(c) || norm2.isInert(c); 338 } 339 340 U_NAMESPACE_END 341 342 // C API ------------------------------------------------------------------- *** 343 344 U_NAMESPACE_USE 345 346 U_CAPI UNormalizer2 * U_EXPORT2 347 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) { 348 if(U_FAILURE(*pErrorCode)) { 349 return NULL; 350 } 351 if(filterSet==NULL) { 352 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 353 return NULL; 354 } 355 Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2, 356 *UnicodeSet::fromUSet(filterSet)); 357 if(fn2==NULL) { 358 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 359 } 360 return (UNormalizer2 *)fn2; 361 } 362 363 #endif // !UCONFIG_NO_NORMALIZATION 364