1 /* 2 ************************************************************************* 3 * COPYRIGHT: 4 * Copyright (c) 1996-2012, International Business Machines Corporation and 5 * others. All Rights Reserved. 6 ************************************************************************* 7 */ 8 9 #include "unicode/utypes.h" 10 11 #if !UCONFIG_NO_NORMALIZATION 12 13 #include "unicode/uniset.h" 14 #include "unicode/unistr.h" 15 #include "unicode/chariter.h" 16 #include "unicode/schriter.h" 17 #include "unicode/uchriter.h" 18 #include "unicode/normlzr.h" 19 #include "unicode/utf16.h" 20 #include "cmemory.h" 21 #include "normalizer2impl.h" 22 #include "uprops.h" // for uniset_getUnicode32Instance() 23 24 U_NAMESPACE_BEGIN 25 26 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer) 27 28 //------------------------------------------------------------------------- 29 // Constructors and other boilerplate 30 //------------------------------------------------------------------------- 31 32 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) : 33 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), 34 text(new StringCharacterIterator(str)), 35 currentIndex(0), nextIndex(0), 36 buffer(), bufferPos(0) 37 { 38 init(); 39 } 40 41 Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) : 42 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), 43 text(new UCharCharacterIterator(str, length)), 44 currentIndex(0), nextIndex(0), 45 buffer(), bufferPos(0) 46 { 47 init(); 48 } 49 50 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) : 51 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), 52 text(iter.clone()), 53 currentIndex(0), nextIndex(0), 54 buffer(), bufferPos(0) 55 { 56 init(); 57 } 58 59 Normalizer::Normalizer(const Normalizer ©) : 60 UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions), 61 text(copy.text->clone()), 62 currentIndex(copy.currentIndex), nextIndex(copy.nextIndex), 63 buffer(copy.buffer), bufferPos(copy.bufferPos) 64 { 65 init(); 66 } 67 68 void 69 Normalizer::init() { 70 UErrorCode errorCode=U_ZERO_ERROR; 71 fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode); 72 if(fOptions&UNORM_UNICODE_3_2) { 73 delete fFilteredNorm2; 74 fNorm2=fFilteredNorm2= 75 new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode)); 76 } 77 if(U_FAILURE(errorCode)) { 78 errorCode=U_ZERO_ERROR; 79 fNorm2=Normalizer2Factory::getNoopInstance(errorCode); 80 } 81 } 82 83 Normalizer::~Normalizer() 84 { 85 delete fFilteredNorm2; 86 delete text; 87 } 88 89 Normalizer* 90 Normalizer::clone() const 91 { 92 return new Normalizer(*this); 93 } 94 95 /** 96 * Generates a hash code for this iterator. 97 */ 98 int32_t Normalizer::hashCode() const 99 { 100 return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex; 101 } 102 103 UBool Normalizer::operator==(const Normalizer& that) const 104 { 105 return 106 this==&that || 107 (fUMode==that.fUMode && 108 fOptions==that.fOptions && 109 *text==*that.text && 110 buffer==that.buffer && 111 bufferPos==that.bufferPos && 112 nextIndex==that.nextIndex); 113 } 114 115 //------------------------------------------------------------------------- 116 // Static utility methods 117 //------------------------------------------------------------------------- 118 119 void U_EXPORT2 120 Normalizer::normalize(const UnicodeString& source, 121 UNormalizationMode mode, int32_t options, 122 UnicodeString& result, 123 UErrorCode &status) { 124 if(source.isBogus() || U_FAILURE(status)) { 125 result.setToBogus(); 126 if(U_SUCCESS(status)) { 127 status=U_ILLEGAL_ARGUMENT_ERROR; 128 } 129 } else { 130 UnicodeString localDest; 131 UnicodeString *dest; 132 133 if(&source!=&result) { 134 dest=&result; 135 } else { 136 // the source and result strings are the same object, use a temporary one 137 dest=&localDest; 138 } 139 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); 140 if(U_SUCCESS(status)) { 141 if(options&UNORM_UNICODE_3_2) { 142 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). 143 normalize(source, *dest, status); 144 } else { 145 n2->normalize(source, *dest, status); 146 } 147 } 148 if(dest==&localDest && U_SUCCESS(status)) { 149 result=*dest; 150 } 151 } 152 } 153 154 void U_EXPORT2 155 Normalizer::compose(const UnicodeString& source, 156 UBool compat, int32_t options, 157 UnicodeString& result, 158 UErrorCode &status) { 159 normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status); 160 } 161 162 void U_EXPORT2 163 Normalizer::decompose(const UnicodeString& source, 164 UBool compat, int32_t options, 165 UnicodeString& result, 166 UErrorCode &status) { 167 normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status); 168 } 169 170 UNormalizationCheckResult 171 Normalizer::quickCheck(const UnicodeString& source, 172 UNormalizationMode mode, int32_t options, 173 UErrorCode &status) { 174 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); 175 if(U_SUCCESS(status)) { 176 if(options&UNORM_UNICODE_3_2) { 177 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). 178 quickCheck(source, status); 179 } else { 180 return n2->quickCheck(source, status); 181 } 182 } else { 183 return UNORM_MAYBE; 184 } 185 } 186 187 UBool 188 Normalizer::isNormalized(const UnicodeString& source, 189 UNormalizationMode mode, int32_t options, 190 UErrorCode &status) { 191 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); 192 if(U_SUCCESS(status)) { 193 if(options&UNORM_UNICODE_3_2) { 194 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). 195 isNormalized(source, status); 196 } else { 197 return n2->isNormalized(source, status); 198 } 199 } else { 200 return FALSE; 201 } 202 } 203 204 UnicodeString & U_EXPORT2 205 Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right, 206 UnicodeString &result, 207 UNormalizationMode mode, int32_t options, 208 UErrorCode &errorCode) { 209 if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) { 210 result.setToBogus(); 211 if(U_SUCCESS(errorCode)) { 212 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 213 } 214 } else { 215 UnicodeString localDest; 216 UnicodeString *dest; 217 218 if(&right!=&result) { 219 dest=&result; 220 } else { 221 // the right and result strings are the same object, use a temporary one 222 dest=&localDest; 223 } 224 *dest=left; 225 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode); 226 if(U_SUCCESS(errorCode)) { 227 if(options&UNORM_UNICODE_3_2) { 228 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)). 229 append(*dest, right, errorCode); 230 } else { 231 n2->append(*dest, right, errorCode); 232 } 233 } 234 if(dest==&localDest && U_SUCCESS(errorCode)) { 235 result=*dest; 236 } 237 } 238 return result; 239 } 240 241 //------------------------------------------------------------------------- 242 // Iteration API 243 //------------------------------------------------------------------------- 244 245 /** 246 * Return the current character in the normalized text. 247 */ 248 UChar32 Normalizer::current() { 249 if(bufferPos<buffer.length() || nextNormalize()) { 250 return buffer.char32At(bufferPos); 251 } else { 252 return DONE; 253 } 254 } 255 256 /** 257 * Return the next character in the normalized text and advance 258 * the iteration position by one. If the end 259 * of the text has already been reached, {@link #DONE} is returned. 260 */ 261 UChar32 Normalizer::next() { 262 if(bufferPos<buffer.length() || nextNormalize()) { 263 UChar32 c=buffer.char32At(bufferPos); 264 bufferPos+=U16_LENGTH(c); 265 return c; 266 } else { 267 return DONE; 268 } 269 } 270 271 /** 272 * Return the previous character in the normalized text and decrement 273 * the iteration position by one. If the beginning 274 * of the text has already been reached, {@link #DONE} is returned. 275 */ 276 UChar32 Normalizer::previous() { 277 if(bufferPos>0 || previousNormalize()) { 278 UChar32 c=buffer.char32At(bufferPos-1); 279 bufferPos-=U16_LENGTH(c); 280 return c; 281 } else { 282 return DONE; 283 } 284 } 285 286 void Normalizer::reset() { 287 currentIndex=nextIndex=text->setToStart(); 288 clearBuffer(); 289 } 290 291 void 292 Normalizer::setIndexOnly(int32_t index) { 293 text->setIndex(index); // pins index 294 currentIndex=nextIndex=text->getIndex(); 295 clearBuffer(); 296 } 297 298 /** 299 * Return the first character in the normalized text. This resets 300 * the <tt>Normalizer's</tt> position to the beginning of the text. 301 */ 302 UChar32 Normalizer::first() { 303 reset(); 304 return next(); 305 } 306 307 /** 308 * Return the last character in the normalized text. This resets 309 * the <tt>Normalizer's</tt> position to be just before the 310 * the input text corresponding to that normalized character. 311 */ 312 UChar32 Normalizer::last() { 313 currentIndex=nextIndex=text->setToEnd(); 314 clearBuffer(); 315 return previous(); 316 } 317 318 /** 319 * Retrieve the current iteration position in the input text that is 320 * being normalized. This method is useful in applications such as 321 * searching, where you need to be able to determine the position in 322 * the input text that corresponds to a given normalized output character. 323 * <p> 324 * <b>Note:</b> This method sets the position in the <em>input</em>, while 325 * {@link #next} and {@link #previous} iterate through characters in the 326 * <em>output</em>. This means that there is not necessarily a one-to-one 327 * correspondence between characters returned by <tt>next</tt> and 328 * <tt>previous</tt> and the indices passed to and returned from 329 * <tt>setIndex</tt> and {@link #getIndex}. 330 * 331 */ 332 int32_t Normalizer::getIndex() const { 333 if(bufferPos<buffer.length()) { 334 return currentIndex; 335 } else { 336 return nextIndex; 337 } 338 } 339 340 /** 341 * Retrieve the index of the start of the input text. This is the begin index 342 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt> 343 * over which this <tt>Normalizer</tt> is iterating 344 */ 345 int32_t Normalizer::startIndex() const { 346 return text->startIndex(); 347 } 348 349 /** 350 * Retrieve the index of the end of the input text. This is the end index 351 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> 352 * over which this <tt>Normalizer</tt> is iterating 353 */ 354 int32_t Normalizer::endIndex() const { 355 return text->endIndex(); 356 } 357 358 //------------------------------------------------------------------------- 359 // Property access methods 360 //------------------------------------------------------------------------- 361 362 void 363 Normalizer::setMode(UNormalizationMode newMode) 364 { 365 fUMode = newMode; 366 init(); 367 } 368 369 UNormalizationMode 370 Normalizer::getUMode() const 371 { 372 return fUMode; 373 } 374 375 void 376 Normalizer::setOption(int32_t option, 377 UBool value) 378 { 379 if (value) { 380 fOptions |= option; 381 } else { 382 fOptions &= (~option); 383 } 384 init(); 385 } 386 387 UBool 388 Normalizer::getOption(int32_t option) const 389 { 390 return (fOptions & option) != 0; 391 } 392 393 /** 394 * Set the input text over which this <tt>Normalizer</tt> will iterate. 395 * The iteration position is set to the beginning of the input text. 396 */ 397 void 398 Normalizer::setText(const UnicodeString& newText, 399 UErrorCode &status) 400 { 401 if (U_FAILURE(status)) { 402 return; 403 } 404 CharacterIterator *newIter = new StringCharacterIterator(newText); 405 if (newIter == NULL) { 406 status = U_MEMORY_ALLOCATION_ERROR; 407 return; 408 } 409 delete text; 410 text = newIter; 411 reset(); 412 } 413 414 /** 415 * Set the input text over which this <tt>Normalizer</tt> will iterate. 416 * The iteration position is set to the beginning of the string. 417 */ 418 void 419 Normalizer::setText(const CharacterIterator& newText, 420 UErrorCode &status) 421 { 422 if (U_FAILURE(status)) { 423 return; 424 } 425 CharacterIterator *newIter = newText.clone(); 426 if (newIter == NULL) { 427 status = U_MEMORY_ALLOCATION_ERROR; 428 return; 429 } 430 delete text; 431 text = newIter; 432 reset(); 433 } 434 435 void 436 Normalizer::setText(const UChar* newText, 437 int32_t length, 438 UErrorCode &status) 439 { 440 if (U_FAILURE(status)) { 441 return; 442 } 443 CharacterIterator *newIter = new UCharCharacterIterator(newText, length); 444 if (newIter == NULL) { 445 status = U_MEMORY_ALLOCATION_ERROR; 446 return; 447 } 448 delete text; 449 text = newIter; 450 reset(); 451 } 452 453 /** 454 * Copies the text under iteration into the UnicodeString referred to by "result". 455 * @param result Receives a copy of the text under iteration. 456 */ 457 void 458 Normalizer::getText(UnicodeString& result) 459 { 460 text->getText(result); 461 } 462 463 //------------------------------------------------------------------------- 464 // Private utility methods 465 //------------------------------------------------------------------------- 466 467 void Normalizer::clearBuffer() { 468 buffer.remove(); 469 bufferPos=0; 470 } 471 472 UBool 473 Normalizer::nextNormalize() { 474 clearBuffer(); 475 currentIndex=nextIndex; 476 text->setIndex(nextIndex); 477 if(!text->hasNext()) { 478 return FALSE; 479 } 480 // Skip at least one character so we make progress. 481 UnicodeString segment(text->next32PostInc()); 482 while(text->hasNext()) { 483 UChar32 c; 484 if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) { 485 text->move32(-1, CharacterIterator::kCurrent); 486 break; 487 } 488 segment.append(c); 489 } 490 nextIndex=text->getIndex(); 491 UErrorCode errorCode=U_ZERO_ERROR; 492 fNorm2->normalize(segment, buffer, errorCode); 493 return U_SUCCESS(errorCode) && !buffer.isEmpty(); 494 } 495 496 UBool 497 Normalizer::previousNormalize() { 498 clearBuffer(); 499 nextIndex=currentIndex; 500 text->setIndex(currentIndex); 501 if(!text->hasPrevious()) { 502 return FALSE; 503 } 504 UnicodeString segment; 505 while(text->hasPrevious()) { 506 UChar32 c=text->previous32(); 507 segment.insert(0, c); 508 if(fNorm2->hasBoundaryBefore(c)) { 509 break; 510 } 511 } 512 currentIndex=text->getIndex(); 513 UErrorCode errorCode=U_ZERO_ERROR; 514 fNorm2->normalize(segment, buffer, errorCode); 515 bufferPos=buffer.length(); 516 return U_SUCCESS(errorCode) && !buffer.isEmpty(); 517 } 518 519 U_NAMESPACE_END 520 521 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 522