1 /* 2 ************************************************************************* 3 * COPYRIGHT: 4 * Copyright (c) 1996-2010, International Business Machines Corporation and 5 * others. All Rights Reserved. 6 ************************************************************************* 7 */ 8 9 #include "unicode/utypes.h" 10 11 #if !UCONFIG_NO_NORMALIZATION 12 13 #include "unicode/uniset.h" 14 #include "unicode/unistr.h" 15 #include "unicode/chariter.h" 16 #include "unicode/schriter.h" 17 #include "unicode/uchriter.h" 18 #include "unicode/normlzr.h" 19 #include "cmemory.h" 20 #include "normalizer2impl.h" 21 #include "uprops.h" // for uniset_getUnicode32Instance() 22 23 U_NAMESPACE_BEGIN 24 25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer) 26 27 //------------------------------------------------------------------------- 28 // Constructors and other boilerplate 29 //------------------------------------------------------------------------- 30 31 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) : 32 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), 33 text(new StringCharacterIterator(str)), 34 currentIndex(0), nextIndex(0), 35 buffer(), bufferPos(0) 36 { 37 init(); 38 } 39 40 Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) : 41 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), 42 text(new UCharCharacterIterator(str, length)), 43 currentIndex(0), nextIndex(0), 44 buffer(), bufferPos(0) 45 { 46 init(); 47 } 48 49 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) : 50 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), 51 text(iter.clone()), 52 currentIndex(0), nextIndex(0), 53 buffer(), bufferPos(0) 54 { 55 init(); 56 } 57 58 Normalizer::Normalizer(const Normalizer ©) : 59 UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions), 60 text(copy.text->clone()), 61 currentIndex(copy.currentIndex), nextIndex(copy.nextIndex), 62 buffer(copy.buffer), bufferPos(copy.bufferPos) 63 { 64 init(); 65 } 66 67 static const UChar _NUL=0; 68 69 void 70 Normalizer::init() { 71 UErrorCode errorCode=U_ZERO_ERROR; 72 fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode); 73 if(fOptions&UNORM_UNICODE_3_2) { 74 delete fFilteredNorm2; 75 fNorm2=fFilteredNorm2= 76 new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode)); 77 } 78 if(U_FAILURE(errorCode)) { 79 errorCode=U_ZERO_ERROR; 80 fNorm2=Normalizer2Factory::getNoopInstance(errorCode); 81 } 82 } 83 84 Normalizer::~Normalizer() 85 { 86 delete fFilteredNorm2; 87 delete text; 88 } 89 90 Normalizer* 91 Normalizer::clone() const 92 { 93 return new Normalizer(*this); 94 } 95 96 /** 97 * Generates a hash code for this iterator. 98 */ 99 int32_t Normalizer::hashCode() const 100 { 101 return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex; 102 } 103 104 UBool Normalizer::operator==(const Normalizer& that) const 105 { 106 return 107 this==&that || 108 (fUMode==that.fUMode && 109 fOptions==that.fOptions && 110 *text==*that.text && 111 buffer==that.buffer && 112 bufferPos==that.bufferPos && 113 nextIndex==that.nextIndex); 114 } 115 116 //------------------------------------------------------------------------- 117 // Static utility methods 118 //------------------------------------------------------------------------- 119 120 void U_EXPORT2 121 Normalizer::normalize(const UnicodeString& source, 122 UNormalizationMode mode, int32_t options, 123 UnicodeString& result, 124 UErrorCode &status) { 125 if(source.isBogus() || U_FAILURE(status)) { 126 result.setToBogus(); 127 if(U_SUCCESS(status)) { 128 status=U_ILLEGAL_ARGUMENT_ERROR; 129 } 130 } else { 131 UnicodeString localDest; 132 UnicodeString *dest; 133 134 if(&source!=&result) { 135 dest=&result; 136 } else { 137 // the source and result strings are the same object, use a temporary one 138 dest=&localDest; 139 } 140 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); 141 if(U_SUCCESS(status)) { 142 if(options&UNORM_UNICODE_3_2) { 143 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). 144 normalize(source, *dest, status); 145 } else { 146 n2->normalize(source, *dest, status); 147 } 148 } 149 if(dest==&localDest && U_SUCCESS(status)) { 150 result=*dest; 151 } 152 } 153 } 154 155 void U_EXPORT2 156 Normalizer::compose(const UnicodeString& source, 157 UBool compat, int32_t options, 158 UnicodeString& result, 159 UErrorCode &status) { 160 normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status); 161 } 162 163 void U_EXPORT2 164 Normalizer::decompose(const UnicodeString& source, 165 UBool compat, int32_t options, 166 UnicodeString& result, 167 UErrorCode &status) { 168 normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status); 169 } 170 171 UNormalizationCheckResult 172 Normalizer::quickCheck(const UnicodeString& source, 173 UNormalizationMode mode, int32_t options, 174 UErrorCode &status) { 175 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); 176 if(U_SUCCESS(status)) { 177 if(options&UNORM_UNICODE_3_2) { 178 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). 179 quickCheck(source, status); 180 } else { 181 return n2->quickCheck(source, status); 182 } 183 } else { 184 return UNORM_MAYBE; 185 } 186 } 187 188 UBool 189 Normalizer::isNormalized(const UnicodeString& source, 190 UNormalizationMode mode, int32_t options, 191 UErrorCode &status) { 192 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); 193 if(U_SUCCESS(status)) { 194 if(options&UNORM_UNICODE_3_2) { 195 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). 196 isNormalized(source, status); 197 } else { 198 return n2->isNormalized(source, status); 199 } 200 } else { 201 return FALSE; 202 } 203 } 204 205 UnicodeString & U_EXPORT2 206 Normalizer::concatenate(UnicodeString &left, UnicodeString &right, 207 UnicodeString &result, 208 UNormalizationMode mode, int32_t options, 209 UErrorCode &errorCode) { 210 if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) { 211 result.setToBogus(); 212 if(U_SUCCESS(errorCode)) { 213 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 214 } 215 } else { 216 UnicodeString localDest; 217 UnicodeString *dest; 218 219 if(&right!=&result) { 220 dest=&result; 221 } else { 222 // the right and result strings are the same object, use a temporary one 223 dest=&localDest; 224 } 225 *dest=left; 226 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode); 227 if(U_SUCCESS(errorCode)) { 228 if(options&UNORM_UNICODE_3_2) { 229 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)). 230 append(*dest, right, errorCode); 231 } else { 232 n2->append(*dest, right, errorCode); 233 } 234 } 235 if(dest==&localDest && U_SUCCESS(errorCode)) { 236 result=*dest; 237 } 238 } 239 return result; 240 } 241 242 //------------------------------------------------------------------------- 243 // Iteration API 244 //------------------------------------------------------------------------- 245 246 /** 247 * Return the current character in the normalized text. 248 */ 249 UChar32 Normalizer::current() { 250 if(bufferPos<buffer.length() || nextNormalize()) { 251 return buffer.char32At(bufferPos); 252 } else { 253 return DONE; 254 } 255 } 256 257 /** 258 * Return the next character in the normalized text and advance 259 * the iteration position by one. If the end 260 * of the text has already been reached, {@link #DONE} is returned. 261 */ 262 UChar32 Normalizer::next() { 263 if(bufferPos<buffer.length() || nextNormalize()) { 264 UChar32 c=buffer.char32At(bufferPos); 265 bufferPos+=UTF_CHAR_LENGTH(c); 266 return c; 267 } else { 268 return DONE; 269 } 270 } 271 272 /** 273 * Return the previous character in the normalized text and decrement 274 * the iteration position by one. If the beginning 275 * of the text has already been reached, {@link #DONE} is returned. 276 */ 277 UChar32 Normalizer::previous() { 278 if(bufferPos>0 || previousNormalize()) { 279 UChar32 c=buffer.char32At(bufferPos-1); 280 bufferPos-=UTF_CHAR_LENGTH(c); 281 return c; 282 } else { 283 return DONE; 284 } 285 } 286 287 void Normalizer::reset() { 288 currentIndex=nextIndex=text->setToStart(); 289 clearBuffer(); 290 } 291 292 void 293 Normalizer::setIndexOnly(int32_t index) { 294 text->setIndex(index); // pins index 295 currentIndex=nextIndex=text->getIndex(); 296 clearBuffer(); 297 } 298 299 /** 300 * Return the first character in the normalized text. This resets 301 * the <tt>Normalizer's</tt> position to the beginning of the text. 302 */ 303 UChar32 Normalizer::first() { 304 reset(); 305 return next(); 306 } 307 308 /** 309 * Return the last character in the normalized text. This resets 310 * the <tt>Normalizer's</tt> position to be just before the 311 * the input text corresponding to that normalized character. 312 */ 313 UChar32 Normalizer::last() { 314 currentIndex=nextIndex=text->setToEnd(); 315 clearBuffer(); 316 return previous(); 317 } 318 319 /** 320 * Retrieve the current iteration position in the input text that is 321 * being normalized. This method is useful in applications such as 322 * searching, where you need to be able to determine the position in 323 * the input text that corresponds to a given normalized output character. 324 * <p> 325 * <b>Note:</b> This method sets the position in the <em>input</em>, while 326 * {@link #next} and {@link #previous} iterate through characters in the 327 * <em>output</em>. This means that there is not necessarily a one-to-one 328 * correspondence between characters returned by <tt>next</tt> and 329 * <tt>previous</tt> and the indices passed to and returned from 330 * <tt>setIndex</tt> and {@link #getIndex}. 331 * 332 */ 333 int32_t Normalizer::getIndex() const { 334 if(bufferPos<buffer.length()) { 335 return currentIndex; 336 } else { 337 return nextIndex; 338 } 339 } 340 341 /** 342 * Retrieve the index of the start of the input text. This is the begin index 343 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt> 344 * over which this <tt>Normalizer</tt> is iterating 345 */ 346 int32_t Normalizer::startIndex() const { 347 return text->startIndex(); 348 } 349 350 /** 351 * Retrieve the index of the end of the input text. This is the end index 352 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> 353 * over which this <tt>Normalizer</tt> is iterating 354 */ 355 int32_t Normalizer::endIndex() const { 356 return text->endIndex(); 357 } 358 359 //------------------------------------------------------------------------- 360 // Property access methods 361 //------------------------------------------------------------------------- 362 363 void 364 Normalizer::setMode(UNormalizationMode newMode) 365 { 366 fUMode = newMode; 367 init(); 368 } 369 370 UNormalizationMode 371 Normalizer::getUMode() const 372 { 373 return fUMode; 374 } 375 376 void 377 Normalizer::setOption(int32_t option, 378 UBool value) 379 { 380 if (value) { 381 fOptions |= option; 382 } else { 383 fOptions &= (~option); 384 } 385 init(); 386 } 387 388 UBool 389 Normalizer::getOption(int32_t option) const 390 { 391 return (fOptions & option) != 0; 392 } 393 394 /** 395 * Set the input text over which this <tt>Normalizer</tt> will iterate. 396 * The iteration position is set to the beginning of the input text. 397 */ 398 void 399 Normalizer::setText(const UnicodeString& newText, 400 UErrorCode &status) 401 { 402 if (U_FAILURE(status)) { 403 return; 404 } 405 CharacterIterator *newIter = new StringCharacterIterator(newText); 406 if (newIter == NULL) { 407 status = U_MEMORY_ALLOCATION_ERROR; 408 return; 409 } 410 delete text; 411 text = newIter; 412 reset(); 413 } 414 415 /** 416 * Set the input text over which this <tt>Normalizer</tt> will iterate. 417 * The iteration position is set to the beginning of the string. 418 */ 419 void 420 Normalizer::setText(const CharacterIterator& newText, 421 UErrorCode &status) 422 { 423 if (U_FAILURE(status)) { 424 return; 425 } 426 CharacterIterator *newIter = newText.clone(); 427 if (newIter == NULL) { 428 status = U_MEMORY_ALLOCATION_ERROR; 429 return; 430 } 431 delete text; 432 text = newIter; 433 reset(); 434 } 435 436 void 437 Normalizer::setText(const UChar* newText, 438 int32_t length, 439 UErrorCode &status) 440 { 441 if (U_FAILURE(status)) { 442 return; 443 } 444 CharacterIterator *newIter = new UCharCharacterIterator(newText, length); 445 if (newIter == NULL) { 446 status = U_MEMORY_ALLOCATION_ERROR; 447 return; 448 } 449 delete text; 450 text = newIter; 451 reset(); 452 } 453 454 /** 455 * Copies the text under iteration into the UnicodeString referred to by "result". 456 * @param result Receives a copy of the text under iteration. 457 */ 458 void 459 Normalizer::getText(UnicodeString& result) 460 { 461 text->getText(result); 462 } 463 464 //------------------------------------------------------------------------- 465 // Private utility methods 466 //------------------------------------------------------------------------- 467 468 void Normalizer::clearBuffer() { 469 buffer.remove(); 470 bufferPos=0; 471 } 472 473 UBool 474 Normalizer::nextNormalize() { 475 clearBuffer(); 476 currentIndex=nextIndex; 477 text->setIndex(nextIndex); 478 if(!text->hasNext()) { 479 return FALSE; 480 } 481 // Skip at least one character so we make progress. 482 UnicodeString segment(text->next32PostInc()); 483 while(text->hasNext()) { 484 UChar32 c; 485 if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) { 486 text->move32(-1, CharacterIterator::kCurrent); 487 break; 488 } 489 segment.append(c); 490 } 491 nextIndex=text->getIndex(); 492 UErrorCode errorCode=U_ZERO_ERROR; 493 fNorm2->normalize(segment, buffer, errorCode); 494 return U_SUCCESS(errorCode) && !buffer.isEmpty(); 495 } 496 497 UBool 498 Normalizer::previousNormalize() { 499 clearBuffer(); 500 nextIndex=currentIndex; 501 text->setIndex(currentIndex); 502 if(!text->hasPrevious()) { 503 return FALSE; 504 } 505 UnicodeString segment; 506 while(text->hasPrevious()) { 507 UChar32 c=text->previous32(); 508 segment.insert(0, c); 509 if(fNorm2->hasBoundaryBefore(c)) { 510 break; 511 } 512 } 513 currentIndex=text->getIndex(); 514 UErrorCode errorCode=U_ZERO_ERROR; 515 fNorm2->normalize(segment, buffer, errorCode); 516 bufferPos=buffer.length(); 517 return U_SUCCESS(errorCode) && !buffer.isEmpty(); 518 } 519 520 U_NAMESPACE_END 521 522 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 523