1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2009-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: filterednormalizer2.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2009dec10 14 * created by: Markus W. Scherer 15 */ 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_NORMALIZATION 20 21 #include "unicode/normalizer2.h" 22 #include "unicode/uniset.h" 23 #include "unicode/unistr.h" 24 #include "unicode/unorm.h" 25 #include "cpputils.h" 26 27 U_NAMESPACE_BEGIN 28 29 UnicodeString & 30 FilteredNormalizer2::normalize(const UnicodeString &src, 31 UnicodeString &dest, 32 UErrorCode &errorCode) const { 33 uprv_checkCanGetBuffer(src, errorCode); 34 if(U_FAILURE(errorCode)) { 35 dest.setToBogus(); 36 return dest; 37 } 38 if(&dest==&src) { 39 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 40 return dest; 41 } 42 dest.remove(); 43 return normalize(src, dest, USET_SPAN_SIMPLE, errorCode); 44 } 45 46 // Internal: No argument checking, and appends to dest. 47 // Pass as input spanCondition the one that is likely to yield a non-zero 48 // span length at the start of src. 49 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2, 50 // USET_SPAN_SIMPLE should be passed in for the start of src 51 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after 52 // an in-filter prefix. 53 UnicodeString & 54 FilteredNormalizer2::normalize(const UnicodeString &src, 55 UnicodeString &dest, 56 USetSpanCondition spanCondition, 57 UErrorCode &errorCode) const { 58 UnicodeString tempDest; // Don't throw away destination buffer between iterations. 59 for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) { 60 int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition); 61 int32_t spanLength=spanLimit-prevSpanLimit; 62 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 63 if(spanLength!=0) { 64 dest.append(src, prevSpanLimit, spanLength); 65 } 66 spanCondition=USET_SPAN_SIMPLE; 67 } else { 68 if(spanLength!=0) { 69 // Not norm2.normalizeSecondAndAppend() because we do not want 70 // to modify the non-filter part of dest. 71 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit), 72 tempDest, errorCode)); 73 if(U_FAILURE(errorCode)) { 74 break; 75 } 76 } 77 spanCondition=USET_SPAN_NOT_CONTAINED; 78 } 79 prevSpanLimit=spanLimit; 80 } 81 return dest; 82 } 83 84 UnicodeString & 85 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, 86 const UnicodeString &second, 87 UErrorCode &errorCode) const { 88 return normalizeSecondAndAppend(first, second, TRUE, errorCode); 89 } 90 91 UnicodeString & 92 FilteredNormalizer2::append(UnicodeString &first, 93 const UnicodeString &second, 94 UErrorCode &errorCode) const { 95 return normalizeSecondAndAppend(first, second, FALSE, errorCode); 96 } 97 98 UnicodeString & 99 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, 100 const UnicodeString &second, 101 UBool doNormalize, 102 UErrorCode &errorCode) const { 103 uprv_checkCanGetBuffer(first, errorCode); 104 uprv_checkCanGetBuffer(second, errorCode); 105 if(U_FAILURE(errorCode)) { 106 return first; 107 } 108 if(&first==&second) { 109 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 110 return first; 111 } 112 if(first.isEmpty()) { 113 if(doNormalize) { 114 return normalize(second, first, errorCode); 115 } else { 116 return first=second; 117 } 118 } 119 // merge the in-filter suffix of the first string with the in-filter prefix of the second 120 int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE); 121 if(prefixLimit!=0) { 122 UnicodeString prefix(second.tempSubString(0, prefixLimit)); 123 int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE); 124 if(suffixStart==0) { 125 if(doNormalize) { 126 norm2.normalizeSecondAndAppend(first, prefix, errorCode); 127 } else { 128 norm2.append(first, prefix, errorCode); 129 } 130 } else { 131 UnicodeString middle(first, suffixStart, INT32_MAX); 132 if(doNormalize) { 133 norm2.normalizeSecondAndAppend(middle, prefix, errorCode); 134 } else { 135 norm2.append(middle, prefix, errorCode); 136 } 137 first.replace(suffixStart, INT32_MAX, middle); 138 } 139 } 140 if(prefixLimit<second.length()) { 141 UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX)); 142 if(doNormalize) { 143 normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode); 144 } else { 145 first.append(rest); 146 } 147 } 148 return first; 149 } 150 151 UBool 152 FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const { 153 return set.contains(c) && norm2.getDecomposition(c, decomposition); 154 } 155 156 UBool 157 FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const { 158 uprv_checkCanGetBuffer(s, errorCode); 159 if(U_FAILURE(errorCode)) { 160 return FALSE; 161 } 162 USetSpanCondition spanCondition=USET_SPAN_SIMPLE; 163 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { 164 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); 165 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 166 spanCondition=USET_SPAN_SIMPLE; 167 } else { 168 if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) || 169 U_FAILURE(errorCode) 170 ) { 171 return FALSE; 172 } 173 spanCondition=USET_SPAN_NOT_CONTAINED; 174 } 175 prevSpanLimit=spanLimit; 176 } 177 return TRUE; 178 } 179 180 UNormalizationCheckResult 181 FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const { 182 uprv_checkCanGetBuffer(s, errorCode); 183 if(U_FAILURE(errorCode)) { 184 return UNORM_MAYBE; 185 } 186 UNormalizationCheckResult result=UNORM_YES; 187 USetSpanCondition spanCondition=USET_SPAN_SIMPLE; 188 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { 189 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); 190 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 191 spanCondition=USET_SPAN_SIMPLE; 192 } else { 193 UNormalizationCheckResult qcResult= 194 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode); 195 if(U_FAILURE(errorCode) || qcResult==UNORM_NO) { 196 return qcResult; 197 } else if(qcResult==UNORM_MAYBE) { 198 result=qcResult; 199 } 200 spanCondition=USET_SPAN_NOT_CONTAINED; 201 } 202 prevSpanLimit=spanLimit; 203 } 204 return result; 205 } 206 207 int32_t 208 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const { 209 uprv_checkCanGetBuffer(s, errorCode); 210 if(U_FAILURE(errorCode)) { 211 return 0; 212 } 213 USetSpanCondition spanCondition=USET_SPAN_SIMPLE; 214 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { 215 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); 216 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 217 spanCondition=USET_SPAN_SIMPLE; 218 } else { 219 int32_t yesLimit= 220 prevSpanLimit+ 221 norm2.spanQuickCheckYes( 222 s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode); 223 if(U_FAILURE(errorCode) || yesLimit<spanLimit) { 224 return yesLimit; 225 } 226 spanCondition=USET_SPAN_NOT_CONTAINED; 227 } 228 prevSpanLimit=spanLimit; 229 } 230 return s.length(); 231 } 232 233 UBool 234 FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const { 235 return !set.contains(c) || norm2.hasBoundaryBefore(c); 236 } 237 238 UBool 239 FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const { 240 return !set.contains(c) || norm2.hasBoundaryAfter(c); 241 } 242 243 UBool 244 FilteredNormalizer2::isInert(UChar32 c) const { 245 return !set.contains(c) || norm2.isInert(c); 246 } 247 248 U_NAMESPACE_END 249 250 // C API ------------------------------------------------------------------- *** 251 252 U_NAMESPACE_USE 253 254 U_DRAFT UNormalizer2 * U_EXPORT2 255 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) { 256 if(U_FAILURE(*pErrorCode)) { 257 return NULL; 258 } 259 if(filterSet==NULL) { 260 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 261 return NULL; 262 } 263 Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2, 264 *UnicodeSet::fromUSet(filterSet)); 265 if(fn2==NULL) { 266 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 267 } 268 return (UNormalizer2 *)fn2; 269 } 270 271 #endif // !UCONFIG_NO_NORMALIZATION 272