1 /* 2 ****************************************************************************** 3 * Copyright (c) 1996-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ****************************************************************************** 6 * File unorm.cpp 7 * 8 * Created by: Vladimir Weinstein 12052000 9 * 10 * Modification history : 11 * 12 * Date Name Description 13 * 02/01/01 synwee Added normalization quickcheck enum and method. 14 * 02/12/01 synwee Commented out quickcheck util api has been approved 15 * Added private method for doing FCD checks 16 * 02/23/01 synwee Modified quickcheck and checkFCE to run through 17 * string for codepoints < 0x300 for the normalization 18 * mode NFC. 19 * 05/25/01+ Markus Scherer total rewrite, implement all normalization here 20 * instead of just wrappers around normlzr.cpp, 21 * load unorm.dat, support Unicode 3.1 with 22 * supplementary code points, etc. 23 * 2009-nov..2010-jan Markus Scherer total rewrite, new Normalizer2 API & code 24 */ 25 26 #include "unicode/utypes.h" 27 28 #if !UCONFIG_NO_NORMALIZATION 29 30 #include "unicode/udata.h" 31 #include "unicode/ustring.h" 32 #include "unicode/uiter.h" 33 #include "unicode/unorm.h" 34 #include "unicode/unorm2.h" 35 #include "normalizer2impl.h" 36 #include "unormimp.h" 37 #include "uprops.h" 38 #include "ustr_imp.h" 39 40 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 41 42 U_NAMESPACE_USE 43 44 /* quick check functions ---------------------------------------------------- */ 45 46 U_CAPI UNormalizationCheckResult U_EXPORT2 47 unorm_quickCheck(const UChar *src, 48 int32_t srcLength, 49 UNormalizationMode mode, 50 UErrorCode *pErrorCode) { 51 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 52 return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); 53 } 54 55 U_CAPI UNormalizationCheckResult U_EXPORT2 56 unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength, 57 UNormalizationMode mode, int32_t options, 58 UErrorCode *pErrorCode) { 59 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 60 if(options&UNORM_UNICODE_3_2) { 61 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); 62 return unorm2_quickCheck( 63 reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), 64 src, srcLength, pErrorCode); 65 } else { 66 return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); 67 } 68 } 69 70 U_CAPI UBool U_EXPORT2 71 unorm_isNormalized(const UChar *src, int32_t srcLength, 72 UNormalizationMode mode, 73 UErrorCode *pErrorCode) { 74 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 75 return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); 76 } 77 78 U_CAPI UBool U_EXPORT2 79 unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength, 80 UNormalizationMode mode, int32_t options, 81 UErrorCode *pErrorCode) { 82 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 83 if(options&UNORM_UNICODE_3_2) { 84 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); 85 return unorm2_isNormalized( 86 reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), 87 src, srcLength, pErrorCode); 88 } else { 89 return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); 90 } 91 } 92 93 /* normalize() API ---------------------------------------------------------- */ 94 95 /** Public API for normalizing. */ 96 U_CAPI int32_t U_EXPORT2 97 unorm_normalize(const UChar *src, int32_t srcLength, 98 UNormalizationMode mode, int32_t options, 99 UChar *dest, int32_t destCapacity, 100 UErrorCode *pErrorCode) { 101 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 102 if(options&UNORM_UNICODE_3_2) { 103 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); 104 return unorm2_normalize( 105 reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), 106 src, srcLength, dest, destCapacity, pErrorCode); 107 } else { 108 return unorm2_normalize((const UNormalizer2 *)n2, 109 src, srcLength, dest, destCapacity, pErrorCode); 110 } 111 } 112 113 114 /* iteration functions ------------------------------------------------------ */ 115 116 static int32_t 117 _iterate(UCharIterator *src, UBool forward, 118 UChar *dest, int32_t destCapacity, 119 const Normalizer2 *n2, 120 UBool doNormalize, UBool *pNeededToNormalize, 121 UErrorCode *pErrorCode) { 122 if(U_FAILURE(*pErrorCode)) { 123 return 0; 124 } 125 if(destCapacity<0 || (dest==NULL && destCapacity>0) || src==NULL) { 126 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 127 return 0; 128 } 129 130 if(pNeededToNormalize!=NULL) { 131 *pNeededToNormalize=FALSE; 132 } 133 if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) { 134 return u_terminateUChars(dest, destCapacity, 0, pErrorCode); 135 } 136 137 UnicodeString buffer; 138 UChar32 c; 139 if(forward) { 140 /* get one character and ignore its properties */ 141 buffer.append(uiter_next32(src)); 142 /* get all following characters until we see a boundary */ 143 while((c=uiter_next32(src))>=0) { 144 if(n2->hasBoundaryBefore(c)) { 145 /* back out the latest movement to stop at the boundary */ 146 src->move(src, -U16_LENGTH(c), UITER_CURRENT); 147 break; 148 } else { 149 buffer.append(c); 150 } 151 } 152 } else { 153 while((c=uiter_previous32(src))>=0) { 154 /* always write this character to the front of the buffer */ 155 buffer.insert(0, c); 156 /* stop if this just-copied character is a boundary */ 157 if(n2->hasBoundaryBefore(c)) { 158 break; 159 } 160 } 161 } 162 163 UnicodeString destString(dest, 0, destCapacity); 164 if(buffer.length()>0 && doNormalize) { 165 n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode); 166 if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) { 167 *pNeededToNormalize= destString!=buffer; 168 } 169 return destString.length(); 170 } else { 171 /* just copy the source characters */ 172 return buffer.extract(dest, destCapacity, *pErrorCode); 173 } 174 } 175 176 static int32_t 177 unorm_iterate(UCharIterator *src, UBool forward, 178 UChar *dest, int32_t destCapacity, 179 UNormalizationMode mode, int32_t options, 180 UBool doNormalize, UBool *pNeededToNormalize, 181 UErrorCode *pErrorCode) { 182 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 183 if(options&UNORM_UNICODE_3_2) { 184 const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode); 185 if(U_FAILURE(*pErrorCode)) { 186 return 0; 187 } 188 FilteredNormalizer2 fn2(*n2, *uni32); 189 return _iterate(src, forward, dest, destCapacity, 190 &fn2, doNormalize, pNeededToNormalize, pErrorCode); 191 } 192 return _iterate(src, forward, dest, destCapacity, 193 n2, doNormalize, pNeededToNormalize, pErrorCode); 194 } 195 196 U_CAPI int32_t U_EXPORT2 197 unorm_previous(UCharIterator *src, 198 UChar *dest, int32_t destCapacity, 199 UNormalizationMode mode, int32_t options, 200 UBool doNormalize, UBool *pNeededToNormalize, 201 UErrorCode *pErrorCode) { 202 return unorm_iterate(src, FALSE, 203 dest, destCapacity, 204 mode, options, 205 doNormalize, pNeededToNormalize, 206 pErrorCode); 207 } 208 209 U_CAPI int32_t U_EXPORT2 210 unorm_next(UCharIterator *src, 211 UChar *dest, int32_t destCapacity, 212 UNormalizationMode mode, int32_t options, 213 UBool doNormalize, UBool *pNeededToNormalize, 214 UErrorCode *pErrorCode) { 215 return unorm_iterate(src, TRUE, 216 dest, destCapacity, 217 mode, options, 218 doNormalize, pNeededToNormalize, 219 pErrorCode); 220 } 221 222 /* Concatenation of normalized strings -------------------------------------- */ 223 224 static int32_t 225 _concatenate(const UChar *left, int32_t leftLength, 226 const UChar *right, int32_t rightLength, 227 UChar *dest, int32_t destCapacity, 228 const Normalizer2 *n2, 229 UErrorCode *pErrorCode) { 230 if(U_FAILURE(*pErrorCode)) { 231 return 0; 232 } 233 if(destCapacity<0 || (dest==NULL && destCapacity>0) || 234 left==NULL || leftLength<-1 || right==NULL || rightLength<-1) { 235 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 236 return 0; 237 } 238 239 /* check for overlapping right and destination */ 240 if( dest!=NULL && 241 ((right>=dest && right<(dest+destCapacity)) || 242 (rightLength>0 && dest>=right && dest<(right+rightLength))) 243 ) { 244 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 245 return 0; 246 } 247 248 /* allow left==dest */ 249 UnicodeString destString; 250 if(left==dest) { 251 destString.setTo(dest, leftLength, destCapacity); 252 } else { 253 destString.setTo(dest, 0, destCapacity); 254 destString.append(left, leftLength); 255 } 256 return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode). 257 extract(dest, destCapacity, *pErrorCode); 258 } 259 260 U_CAPI int32_t U_EXPORT2 261 unorm_concatenate(const UChar *left, int32_t leftLength, 262 const UChar *right, int32_t rightLength, 263 UChar *dest, int32_t destCapacity, 264 UNormalizationMode mode, int32_t options, 265 UErrorCode *pErrorCode) { 266 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 267 if(options&UNORM_UNICODE_3_2) { 268 const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode); 269 if(U_FAILURE(*pErrorCode)) { 270 return 0; 271 } 272 FilteredNormalizer2 fn2(*n2, *uni32); 273 return _concatenate(left, leftLength, right, rightLength, 274 dest, destCapacity, &fn2, pErrorCode); 275 } 276 return _concatenate(left, leftLength, right, rightLength, 277 dest, destCapacity, n2, pErrorCode); 278 } 279 280 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 281