1 /* 2 ****************************************************************************** 3 * Copyright (c) 1996-2014, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ****************************************************************************** 6 * File unorm.cpp 7 * 8 * Created by: Vladimir Weinstein 12052000 9 * 10 * Modification history : 11 * 12 * Date Name Description 13 * 02/01/01 synwee Added normalization quickcheck enum and method. 14 * 02/12/01 synwee Commented out quickcheck util api has been approved 15 * Added private method for doing FCD checks 16 * 02/23/01 synwee Modified quickcheck and checkFCE to run through 17 * string for codepoints < 0x300 for the normalization 18 * mode NFC. 19 * 05/25/01+ Markus Scherer total rewrite, implement all normalization here 20 * instead of just wrappers around normlzr.cpp, 21 * load unorm.dat, support Unicode 3.1 with 22 * supplementary code points, etc. 23 * 2009-nov..2010-jan Markus Scherer total rewrite, new Normalizer2 API & code 24 */ 25 26 #include "unicode/utypes.h" 27 28 #if !UCONFIG_NO_NORMALIZATION 29 30 #include "unicode/udata.h" 31 #include "unicode/ustring.h" 32 #include "unicode/uiter.h" 33 #include "unicode/unorm.h" 34 #include "unicode/unorm2.h" 35 #include "normalizer2impl.h" 36 #include "unormimp.h" 37 #include "uprops.h" 38 #include "ustr_imp.h" 39 40 U_NAMESPACE_USE 41 42 /* quick check functions ---------------------------------------------------- */ 43 44 U_CAPI UNormalizationCheckResult U_EXPORT2 45 unorm_quickCheck(const UChar *src, 46 int32_t srcLength, 47 UNormalizationMode mode, 48 UErrorCode *pErrorCode) { 49 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 50 return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); 51 } 52 53 U_CAPI UNormalizationCheckResult U_EXPORT2 54 unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength, 55 UNormalizationMode mode, int32_t options, 56 UErrorCode *pErrorCode) { 57 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 58 if(options&UNORM_UNICODE_3_2) { 59 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); 60 return unorm2_quickCheck( 61 reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), 62 src, srcLength, pErrorCode); 63 } else { 64 return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); 65 } 66 } 67 68 U_CAPI UBool U_EXPORT2 69 unorm_isNormalized(const UChar *src, int32_t srcLength, 70 UNormalizationMode mode, 71 UErrorCode *pErrorCode) { 72 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 73 return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); 74 } 75 76 U_CAPI UBool U_EXPORT2 77 unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength, 78 UNormalizationMode mode, int32_t options, 79 UErrorCode *pErrorCode) { 80 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 81 if(options&UNORM_UNICODE_3_2) { 82 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); 83 return unorm2_isNormalized( 84 reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), 85 src, srcLength, pErrorCode); 86 } else { 87 return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); 88 } 89 } 90 91 /* normalize() API ---------------------------------------------------------- */ 92 93 /** Public API for normalizing. */ 94 U_CAPI int32_t U_EXPORT2 95 unorm_normalize(const UChar *src, int32_t srcLength, 96 UNormalizationMode mode, int32_t options, 97 UChar *dest, int32_t destCapacity, 98 UErrorCode *pErrorCode) { 99 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 100 if(options&UNORM_UNICODE_3_2) { 101 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); 102 return unorm2_normalize( 103 reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), 104 src, srcLength, dest, destCapacity, pErrorCode); 105 } else { 106 return unorm2_normalize((const UNormalizer2 *)n2, 107 src, srcLength, dest, destCapacity, pErrorCode); 108 } 109 } 110 111 112 /* iteration functions ------------------------------------------------------ */ 113 114 static int32_t 115 _iterate(UCharIterator *src, UBool forward, 116 UChar *dest, int32_t destCapacity, 117 const Normalizer2 *n2, 118 UBool doNormalize, UBool *pNeededToNormalize, 119 UErrorCode *pErrorCode) { 120 if(U_FAILURE(*pErrorCode)) { 121 return 0; 122 } 123 if(destCapacity<0 || (dest==NULL && destCapacity>0) || src==NULL) { 124 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 125 return 0; 126 } 127 128 if(pNeededToNormalize!=NULL) { 129 *pNeededToNormalize=FALSE; 130 } 131 if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) { 132 return u_terminateUChars(dest, destCapacity, 0, pErrorCode); 133 } 134 135 UnicodeString buffer; 136 UChar32 c; 137 if(forward) { 138 /* get one character and ignore its properties */ 139 buffer.append(uiter_next32(src)); 140 /* get all following characters until we see a boundary */ 141 while((c=uiter_next32(src))>=0) { 142 if(n2->hasBoundaryBefore(c)) { 143 /* back out the latest movement to stop at the boundary */ 144 src->move(src, -U16_LENGTH(c), UITER_CURRENT); 145 break; 146 } else { 147 buffer.append(c); 148 } 149 } 150 } else { 151 while((c=uiter_previous32(src))>=0) { 152 /* always write this character to the front of the buffer */ 153 buffer.insert(0, c); 154 /* stop if this just-copied character is a boundary */ 155 if(n2->hasBoundaryBefore(c)) { 156 break; 157 } 158 } 159 } 160 161 UnicodeString destString(dest, 0, destCapacity); 162 if(buffer.length()>0 && doNormalize) { 163 n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode); 164 if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) { 165 *pNeededToNormalize= destString!=buffer; 166 } 167 return destString.length(); 168 } else { 169 /* just copy the source characters */ 170 return buffer.extract(dest, destCapacity, *pErrorCode); 171 } 172 } 173 174 static int32_t 175 unorm_iterate(UCharIterator *src, UBool forward, 176 UChar *dest, int32_t destCapacity, 177 UNormalizationMode mode, int32_t options, 178 UBool doNormalize, UBool *pNeededToNormalize, 179 UErrorCode *pErrorCode) { 180 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 181 if(options&UNORM_UNICODE_3_2) { 182 const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode); 183 if(U_FAILURE(*pErrorCode)) { 184 return 0; 185 } 186 FilteredNormalizer2 fn2(*n2, *uni32); 187 return _iterate(src, forward, dest, destCapacity, 188 &fn2, doNormalize, pNeededToNormalize, pErrorCode); 189 } 190 return _iterate(src, forward, dest, destCapacity, 191 n2, doNormalize, pNeededToNormalize, pErrorCode); 192 } 193 194 U_CAPI int32_t U_EXPORT2 195 unorm_previous(UCharIterator *src, 196 UChar *dest, int32_t destCapacity, 197 UNormalizationMode mode, int32_t options, 198 UBool doNormalize, UBool *pNeededToNormalize, 199 UErrorCode *pErrorCode) { 200 return unorm_iterate(src, FALSE, 201 dest, destCapacity, 202 mode, options, 203 doNormalize, pNeededToNormalize, 204 pErrorCode); 205 } 206 207 U_CAPI int32_t U_EXPORT2 208 unorm_next(UCharIterator *src, 209 UChar *dest, int32_t destCapacity, 210 UNormalizationMode mode, int32_t options, 211 UBool doNormalize, UBool *pNeededToNormalize, 212 UErrorCode *pErrorCode) { 213 return unorm_iterate(src, TRUE, 214 dest, destCapacity, 215 mode, options, 216 doNormalize, pNeededToNormalize, 217 pErrorCode); 218 } 219 220 /* Concatenation of normalized strings -------------------------------------- */ 221 222 static int32_t 223 _concatenate(const UChar *left, int32_t leftLength, 224 const UChar *right, int32_t rightLength, 225 UChar *dest, int32_t destCapacity, 226 const Normalizer2 *n2, 227 UErrorCode *pErrorCode) { 228 if(U_FAILURE(*pErrorCode)) { 229 return 0; 230 } 231 if(destCapacity<0 || (dest==NULL && destCapacity>0) || 232 left==NULL || leftLength<-1 || right==NULL || rightLength<-1) { 233 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 234 return 0; 235 } 236 237 /* check for overlapping right and destination */ 238 if( dest!=NULL && 239 ((right>=dest && right<(dest+destCapacity)) || 240 (rightLength>0 && dest>=right && dest<(right+rightLength))) 241 ) { 242 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 243 return 0; 244 } 245 246 /* allow left==dest */ 247 UnicodeString destString; 248 if(left==dest) { 249 destString.setTo(dest, leftLength, destCapacity); 250 } else { 251 destString.setTo(dest, 0, destCapacity); 252 destString.append(left, leftLength); 253 } 254 return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode). 255 extract(dest, destCapacity, *pErrorCode); 256 } 257 258 U_CAPI int32_t U_EXPORT2 259 unorm_concatenate(const UChar *left, int32_t leftLength, 260 const UChar *right, int32_t rightLength, 261 UChar *dest, int32_t destCapacity, 262 UNormalizationMode mode, int32_t options, 263 UErrorCode *pErrorCode) { 264 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 265 if(options&UNORM_UNICODE_3_2) { 266 const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode); 267 if(U_FAILURE(*pErrorCode)) { 268 return 0; 269 } 270 FilteredNormalizer2 fn2(*n2, *uni32); 271 return _concatenate(left, leftLength, right, rightLength, 272 dest, destCapacity, &fn2, pErrorCode); 273 } 274 return _concatenate(left, leftLength, right, rightLength, 275 dest, destCapacity, n2, pErrorCode); 276 } 277 278 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 279