1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2009-2012, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: filterednormalizer2.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2009dec10 14 * created by: Markus W. Scherer 15 */ 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_NORMALIZATION 20 21 #include "unicode/normalizer2.h" 22 #include "unicode/uniset.h" 23 #include "unicode/unistr.h" 24 #include "unicode/unorm.h" 25 #include "cpputils.h" 26 27 U_NAMESPACE_BEGIN 28 29 FilteredNormalizer2::~FilteredNormalizer2() {} 30 31 UnicodeString & 32 FilteredNormalizer2::normalize(const UnicodeString &src, 33 UnicodeString &dest, 34 UErrorCode &errorCode) const { 35 uprv_checkCanGetBuffer(src, errorCode); 36 if(U_FAILURE(errorCode)) { 37 dest.setToBogus(); 38 return dest; 39 } 40 if(&dest==&src) { 41 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 42 return dest; 43 } 44 dest.remove(); 45 return normalize(src, dest, USET_SPAN_SIMPLE, errorCode); 46 } 47 48 // Internal: No argument checking, and appends to dest. 49 // Pass as input spanCondition the one that is likely to yield a non-zero 50 // span length at the start of src. 51 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2, 52 // USET_SPAN_SIMPLE should be passed in for the start of src 53 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after 54 // an in-filter prefix. 55 UnicodeString & 56 FilteredNormalizer2::normalize(const UnicodeString &src, 57 UnicodeString &dest, 58 USetSpanCondition spanCondition, 59 UErrorCode &errorCode) const { 60 UnicodeString tempDest; // Don't throw away destination buffer between iterations. 61 for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) { 62 int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition); 63 int32_t spanLength=spanLimit-prevSpanLimit; 64 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 65 if(spanLength!=0) { 66 dest.append(src, prevSpanLimit, spanLength); 67 } 68 spanCondition=USET_SPAN_SIMPLE; 69 } else { 70 if(spanLength!=0) { 71 // Not norm2.normalizeSecondAndAppend() because we do not want 72 // to modify the non-filter part of dest. 73 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit), 74 tempDest, errorCode)); 75 if(U_FAILURE(errorCode)) { 76 break; 77 } 78 } 79 spanCondition=USET_SPAN_NOT_CONTAINED; 80 } 81 prevSpanLimit=spanLimit; 82 } 83 return dest; 84 } 85 86 UnicodeString & 87 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, 88 const UnicodeString &second, 89 UErrorCode &errorCode) const { 90 return normalizeSecondAndAppend(first, second, TRUE, errorCode); 91 } 92 93 UnicodeString & 94 FilteredNormalizer2::append(UnicodeString &first, 95 const UnicodeString &second, 96 UErrorCode &errorCode) const { 97 return normalizeSecondAndAppend(first, second, FALSE, errorCode); 98 } 99 100 UnicodeString & 101 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, 102 const UnicodeString &second, 103 UBool doNormalize, 104 UErrorCode &errorCode) const { 105 uprv_checkCanGetBuffer(first, errorCode); 106 uprv_checkCanGetBuffer(second, errorCode); 107 if(U_FAILURE(errorCode)) { 108 return first; 109 } 110 if(&first==&second) { 111 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 112 return first; 113 } 114 if(first.isEmpty()) { 115 if(doNormalize) { 116 return normalize(second, first, errorCode); 117 } else { 118 return first=second; 119 } 120 } 121 // merge the in-filter suffix of the first string with the in-filter prefix of the second 122 int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE); 123 if(prefixLimit!=0) { 124 UnicodeString prefix(second.tempSubString(0, prefixLimit)); 125 int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE); 126 if(suffixStart==0) { 127 if(doNormalize) { 128 norm2.normalizeSecondAndAppend(first, prefix, errorCode); 129 } else { 130 norm2.append(first, prefix, errorCode); 131 } 132 } else { 133 UnicodeString middle(first, suffixStart, INT32_MAX); 134 if(doNormalize) { 135 norm2.normalizeSecondAndAppend(middle, prefix, errorCode); 136 } else { 137 norm2.append(middle, prefix, errorCode); 138 } 139 first.replace(suffixStart, INT32_MAX, middle); 140 } 141 } 142 if(prefixLimit<second.length()) { 143 UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX)); 144 if(doNormalize) { 145 normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode); 146 } else { 147 first.append(rest); 148 } 149 } 150 return first; 151 } 152 153 UBool 154 FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const { 155 return set.contains(c) && norm2.getDecomposition(c, decomposition); 156 } 157 158 UBool 159 FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const { 160 return set.contains(c) && norm2.getRawDecomposition(c, decomposition); 161 } 162 163 UChar32 164 FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const { 165 return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL; 166 } 167 168 uint8_t 169 FilteredNormalizer2::getCombiningClass(UChar32 c) const { 170 return set.contains(c) ? norm2.getCombiningClass(c) : 0; 171 } 172 173 UBool 174 FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const { 175 uprv_checkCanGetBuffer(s, errorCode); 176 if(U_FAILURE(errorCode)) { 177 return FALSE; 178 } 179 USetSpanCondition spanCondition=USET_SPAN_SIMPLE; 180 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { 181 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); 182 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 183 spanCondition=USET_SPAN_SIMPLE; 184 } else { 185 if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) || 186 U_FAILURE(errorCode) 187 ) { 188 return FALSE; 189 } 190 spanCondition=USET_SPAN_NOT_CONTAINED; 191 } 192 prevSpanLimit=spanLimit; 193 } 194 return TRUE; 195 } 196 197 UNormalizationCheckResult 198 FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const { 199 uprv_checkCanGetBuffer(s, errorCode); 200 if(U_FAILURE(errorCode)) { 201 return UNORM_MAYBE; 202 } 203 UNormalizationCheckResult result=UNORM_YES; 204 USetSpanCondition spanCondition=USET_SPAN_SIMPLE; 205 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { 206 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); 207 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 208 spanCondition=USET_SPAN_SIMPLE; 209 } else { 210 UNormalizationCheckResult qcResult= 211 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode); 212 if(U_FAILURE(errorCode) || qcResult==UNORM_NO) { 213 return qcResult; 214 } else if(qcResult==UNORM_MAYBE) { 215 result=qcResult; 216 } 217 spanCondition=USET_SPAN_NOT_CONTAINED; 218 } 219 prevSpanLimit=spanLimit; 220 } 221 return result; 222 } 223 224 int32_t 225 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const { 226 uprv_checkCanGetBuffer(s, errorCode); 227 if(U_FAILURE(errorCode)) { 228 return 0; 229 } 230 USetSpanCondition spanCondition=USET_SPAN_SIMPLE; 231 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { 232 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); 233 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 234 spanCondition=USET_SPAN_SIMPLE; 235 } else { 236 int32_t yesLimit= 237 prevSpanLimit+ 238 norm2.spanQuickCheckYes( 239 s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode); 240 if(U_FAILURE(errorCode) || yesLimit<spanLimit) { 241 return yesLimit; 242 } 243 spanCondition=USET_SPAN_NOT_CONTAINED; 244 } 245 prevSpanLimit=spanLimit; 246 } 247 return s.length(); 248 } 249 250 UBool 251 FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const { 252 return !set.contains(c) || norm2.hasBoundaryBefore(c); 253 } 254 255 UBool 256 FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const { 257 return !set.contains(c) || norm2.hasBoundaryAfter(c); 258 } 259 260 UBool 261 FilteredNormalizer2::isInert(UChar32 c) const { 262 return !set.contains(c) || norm2.isInert(c); 263 } 264 265 U_NAMESPACE_END 266 267 // C API ------------------------------------------------------------------- *** 268 269 U_NAMESPACE_USE 270 271 U_CAPI UNormalizer2 * U_EXPORT2 272 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) { 273 if(U_FAILURE(*pErrorCode)) { 274 return NULL; 275 } 276 if(filterSet==NULL) { 277 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 278 return NULL; 279 } 280 Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2, 281 *UnicodeSet::fromUSet(filterSet)); 282 if(fn2==NULL) { 283 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 284 } 285 return (UNormalizer2 *)fn2; 286 } 287 288 #endif // !UCONFIG_NO_NORMALIZATION 289