1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2001-2015, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: ustrcase.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2002feb20 16 * created by: Markus W. Scherer 17 * 18 * Implementation file for string casing C API functions. 19 * Uses functions from uchar.c for basic functionality that requires access 20 * to the Unicode Character Database (uprops.dat). 21 */ 22 23 #include "unicode/utypes.h" 24 #include "unicode/brkiter.h" 25 #include "unicode/casemap.h" 26 #include "unicode/edits.h" 27 #include "unicode/stringoptions.h" 28 #include "unicode/ustring.h" 29 #include "unicode/ucasemap.h" 30 #include "unicode/ubrk.h" 31 #include "unicode/utf.h" 32 #include "unicode/utf16.h" 33 #include "cmemory.h" 34 #include "ucase.h" 35 #include "ucasemap_imp.h" 36 #include "ustr_imp.h" 37 #include "uassert.h" 38 39 U_NAMESPACE_BEGIN 40 41 namespace { 42 43 int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity, 44 Edits *edits, UErrorCode &errorCode) { 45 if (U_SUCCESS(errorCode)) { 46 if (destIndex > destCapacity) { 47 errorCode = U_BUFFER_OVERFLOW_ERROR; 48 } else if (edits != NULL) { 49 edits->copyErrorTo(errorCode); 50 } 51 } 52 return destIndex; 53 } 54 55 } // namespace 56 57 U_NAMESPACE_END 58 59 U_NAMESPACE_USE 60 61 /* string casing ------------------------------------------------------------ */ 62 63 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. 
*/ 64 static inline int32_t 65 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity, 66 int32_t result, const UChar *s, 67 int32_t cpLength, uint32_t options, icu::Edits *edits) { 68 UChar32 c; 69 int32_t length; 70 71 /* decode the result */ 72 if(result<0) { 73 /* (not) original code point */ 74 if(edits!=NULL) { 75 edits->addUnchanged(cpLength); 76 } 77 if(options & U_OMIT_UNCHANGED_TEXT) { 78 return destIndex; 79 } 80 c=~result; 81 if(destIndex<destCapacity && c<=0xffff) { // BMP slightly-fastpath 82 dest[destIndex++]=(UChar)c; 83 return destIndex; 84 } 85 length=cpLength; 86 } else { 87 if(result<=UCASE_MAX_STRING_LENGTH) { 88 c=U_SENTINEL; 89 length=result; 90 } else if(destIndex<destCapacity && result<=0xffff) { // BMP slightly-fastpath 91 dest[destIndex++]=(UChar)result; 92 if(edits!=NULL) { 93 edits->addReplace(cpLength, 1); 94 } 95 return destIndex; 96 } else { 97 c=result; 98 length=U16_LENGTH(c); 99 } 100 if(edits!=NULL) { 101 edits->addReplace(cpLength, length); 102 } 103 } 104 if(length>(INT32_MAX-destIndex)) { 105 return -1; // integer overflow 106 } 107 108 if(destIndex<destCapacity) { 109 /* append the result */ 110 if(c>=0) { 111 /* code point */ 112 UBool isError=FALSE; 113 U16_APPEND(dest, destIndex, destCapacity, c, isError); 114 if(isError) { 115 /* overflow, nothing written */ 116 destIndex+=length; 117 } 118 } else { 119 /* string */ 120 if((destIndex+length)<=destCapacity) { 121 while(length>0) { 122 dest[destIndex++]=*s++; 123 --length; 124 } 125 } else { 126 /* overflow */ 127 destIndex+=length; 128 } 129 } 130 } else { 131 /* preflight */ 132 destIndex+=length; 133 } 134 return destIndex; 135 } 136 137 static inline int32_t 138 appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) { 139 if(destIndex<destCapacity) { 140 dest[destIndex]=c; 141 } else if(destIndex==INT32_MAX) { 142 return -1; // integer overflow 143 } 144 return destIndex+1; 145 } 146 147 static inline int32_t 148 appendUnchanged(UChar 
*dest, int32_t destIndex, int32_t destCapacity, 149 const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) { 150 if(length>0) { 151 if(edits!=NULL) { 152 edits->addUnchanged(length); 153 } 154 if(options & U_OMIT_UNCHANGED_TEXT) { 155 return destIndex; 156 } 157 if(length>(INT32_MAX-destIndex)) { 158 return -1; // integer overflow 159 } 160 if((destIndex+length)<=destCapacity) { 161 u_memcpy(dest+destIndex, s, length); 162 } 163 destIndex+=length; 164 } 165 return destIndex; 166 } 167 168 static UChar32 U_CALLCONV 169 utf16_caseContextIterator(void *context, int8_t dir) { 170 UCaseContext *csc=(UCaseContext *)context; 171 UChar32 c; 172 173 if(dir<0) { 174 /* reset for backward iteration */ 175 csc->index=csc->cpStart; 176 csc->dir=dir; 177 } else if(dir>0) { 178 /* reset for forward iteration */ 179 csc->index=csc->cpLimit; 180 csc->dir=dir; 181 } else { 182 /* continue current iteration direction */ 183 dir=csc->dir; 184 } 185 186 if(dir<0) { 187 if(csc->start<csc->index) { 188 U16_PREV((const UChar *)csc->p, csc->start, csc->index, c); 189 return c; 190 } 191 } else { 192 if(csc->index<csc->limit) { 193 U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c); 194 return c; 195 } 196 } 197 return U_SENTINEL; 198 } 199 200 /* 201 * Case-maps [srcStart..srcLimit[ but takes 202 * context [0..srcLength[ into account. 
203 */ 204 static int32_t 205 _caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map, 206 UChar *dest, int32_t destCapacity, 207 const UChar *src, UCaseContext *csc, 208 int32_t srcStart, int32_t srcLimit, 209 icu::Edits *edits, 210 UErrorCode &errorCode) { 211 /* case mapping loop */ 212 int32_t srcIndex=srcStart; 213 int32_t destIndex=0; 214 while(srcIndex<srcLimit) { 215 int32_t cpStart; 216 csc->cpStart=cpStart=srcIndex; 217 UChar32 c; 218 U16_NEXT(src, srcIndex, srcLimit, c); 219 csc->cpLimit=srcIndex; 220 const UChar *s; 221 c=map(c, utf16_caseContextIterator, csc, &s, caseLocale); 222 destIndex = appendResult(dest, destIndex, destCapacity, c, s, 223 srcIndex - cpStart, options, edits); 224 if (destIndex < 0) { 225 errorCode = U_INDEX_OUTOFBOUNDS_ERROR; 226 return 0; 227 } 228 } 229 230 return destIndex; 231 } 232 233 #if !UCONFIG_NO_BREAK_ITERATION 234 235 U_CFUNC int32_t U_CALLCONV 236 ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter, 237 UChar *dest, int32_t destCapacity, 238 const UChar *src, int32_t srcLength, 239 icu::Edits *edits, 240 UErrorCode &errorCode) { 241 if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) { 242 return 0; 243 } 244 245 /* set up local variables */ 246 UCaseContext csc=UCASECONTEXT_INITIALIZER; 247 csc.p=(void *)src; 248 csc.limit=srcLength; 249 int32_t destIndex=0; 250 int32_t prev=0; 251 UBool isFirstIndex=TRUE; 252 253 /* titlecasing loop */ 254 while(prev<srcLength) { 255 /* find next index where to titlecase */ 256 int32_t index; 257 if(isFirstIndex) { 258 isFirstIndex=FALSE; 259 index=iter->first(); 260 } else { 261 index=iter->next(); 262 } 263 if(index==UBRK_DONE || index>srcLength) { 264 index=srcLength; 265 } 266 267 /* 268 * Segment [prev..index[ into 3 parts: 269 * a) skipped characters (copy as-is) [prev..titleStart[ 270 * b) first letter (titlecase) [titleStart..titleLimit[ 271 * c) subsequent characters (lowercase) [titleLimit..index[ 272 */ 273 if(prev<index) 
{ 274 // Find and copy skipped characters [prev..titleStart[ 275 int32_t titleStart=prev; 276 int32_t titleLimit=prev; 277 UChar32 c; 278 U16_NEXT(src, titleLimit, index, c); 279 if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) { 280 // Adjust the titlecasing index to the next cased character, 281 // or to the next letter/number/symbol/private use. 282 // Stop with titleStart<titleLimit<=index 283 // if there is a character to be titlecased, 284 // or else stop with titleStart==titleLimit==index. 285 UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0; 286 while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) { 287 titleStart=titleLimit; 288 if(titleLimit==index) { 289 break; 290 } 291 U16_NEXT(src, titleLimit, index, c); 292 } 293 if (prev < titleStart) { 294 destIndex=appendUnchanged(dest, destIndex, destCapacity, 295 src+prev, titleStart-prev, options, edits); 296 if(destIndex<0) { 297 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 298 return 0; 299 } 300 } 301 } 302 303 if(titleStart<titleLimit) { 304 /* titlecase c which is from [titleStart..titleLimit[ */ 305 csc.cpStart=titleStart; 306 csc.cpLimit=titleLimit; 307 const UChar *s; 308 c=ucase_toFullTitle(c, utf16_caseContextIterator, &csc, &s, caseLocale); 309 destIndex=appendResult(dest, destIndex, destCapacity, c, s, 310 titleLimit-titleStart, options, edits); 311 if(destIndex<0) { 312 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 313 return 0; 314 } 315 316 /* Special case Dutch IJ titlecasing */ 317 if (titleStart+1 < index && 318 caseLocale == UCASE_LOC_DUTCH && 319 (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) { 320 if (src[titleStart+1] == 0x006A) { 321 destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A); 322 if(destIndex<0) { 323 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 324 return 0; 325 } 326 if(edits!=NULL) { 327 edits->addReplace(1, 1); 328 } 329 titleLimit++; 330 } else if (src[titleStart+1] == 0x004A) { 331 // Keep the capital J from getting lowercased. 
332 destIndex=appendUnchanged(dest, destIndex, destCapacity, 333 src+titleStart+1, 1, options, edits); 334 if(destIndex<0) { 335 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 336 return 0; 337 } 338 titleLimit++; 339 } 340 } 341 342 /* lowercase [titleLimit..index[ */ 343 if(titleLimit<index) { 344 if((options&U_TITLECASE_NO_LOWERCASE)==0) { 345 /* Normal operation: Lowercase the rest of the word. */ 346 destIndex+= 347 _caseMap( 348 caseLocale, options, ucase_toFullLower, 349 dest+destIndex, destCapacity-destIndex, 350 src, &csc, 351 titleLimit, index, 352 edits, errorCode); 353 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 354 errorCode=U_ZERO_ERROR; 355 } 356 if(U_FAILURE(errorCode)) { 357 return destIndex; 358 } 359 } else { 360 /* Optionally just copy the rest of the word unchanged. */ 361 destIndex=appendUnchanged(dest, destIndex, destCapacity, 362 src+titleLimit, index-titleLimit, options, edits); 363 if(destIndex<0) { 364 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 365 return 0; 366 } 367 } 368 } 369 } 370 } 371 372 prev=index; 373 } 374 375 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); 376 } 377 378 #endif // !UCONFIG_NO_BREAK_ITERATION 379 380 U_NAMESPACE_BEGIN 381 namespace GreekUpper { 382 383 // Data generated by prototype code, see 384 // http://site.icu-project.org/design/case/greek-upper 385 // TODO: Move this data into ucase.icu. 
// Layout of both tables, as used by getLetterData() and toUpper():
// entry [c - first code point of the block] describes code point c.
// 0 means "no Greek-specific data" (the caller falls back to the generic
// uppercase mapping). Otherwise the bits selected by UPPER_MASK hold the
// uppercase base letter and the HAS_* flags describe what the character
// carries in addition to that base letter.
// The "// U+xxxx" markers below give the code point of the entry that follows.
static const uint16_t data0370[] = {
    // U+0370..03FF
    // U+0370
    0x0370,
    0x0370,
    0x0372,
    0x0372,
    0,
    0,
    0x0376,
    0x0376,
    0,
    0,
    0x037A,
    0x03FD,
    0x03FE,
    0x03FF,
    0,
    0x037F,
    // U+0380
    0,
    0,
    0,
    0,
    0,
    0,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    // U+0390
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0391 | HAS_VOWEL,
    0x0392,
    0x0393,
    0x0394,
    0x0395 | HAS_VOWEL,
    0x0396,
    0x0397 | HAS_VOWEL,
    0x0398,
    0x0399 | HAS_VOWEL,
    0x039A,
    0x039B,
    0x039C,
    0x039D,
    0x039E,
    0x039F | HAS_VOWEL,
    // U+03A0
    0x03A0,
    0x03A1,
    0,
    0x03A3,
    0x03A4,
    0x03A5 | HAS_VOWEL,
    0x03A6,
    0x03A7,
    0x03A8,
    0x03A9 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    // U+03B0
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0391 | HAS_VOWEL,
    0x0392,
    0x0393,
    0x0394,
    0x0395 | HAS_VOWEL,
    0x0396,
    0x0397 | HAS_VOWEL,
    0x0398,
    0x0399 | HAS_VOWEL,
    0x039A,
    0x039B,
    0x039C,
    0x039D,
    0x039E,
    0x039F | HAS_VOWEL,
    // U+03C0
    0x03A0,
    0x03A1,
    0x03A3,
    0x03A3,
    0x03A4,
    0x03A5 | HAS_VOWEL,
    0x03A6,
    0x03A7,
    0x03A8,
    0x03A9 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03CF,
    // U+03D0
    0x0392,
    0x0398,
    0x03D2,
    0x03D2 | HAS_ACCENT,
    0x03D2 | HAS_DIALYTIKA,
    0x03A6,
    0x03A0,
    0x03CF,
    0x03D8,
    0x03D8,
    0x03DA,
    0x03DA,
    0x03DC,
    0x03DC,
    0x03DE,
    0x03DE,
    // U+03E0
    0x03E0,
    0x03E0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    // U+03F0
    0x039A,
    0x03A1,
    0x03F9,
    0x037F,
    0x03F4,
    0x0395 | HAS_VOWEL,
    0,
    0x03F7,
    0x03F7,
    0x03F9,
    0x03FA,
    0x03FA,
    0x03FC,
    0x03FD,
    0x03FE,
    0x03FF,
};

static const uint16_t data1F00[] = {
    // U+1F00..1FFF
    // U+1F00
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    // U+1F10
    0x0395 | HAS_VOWEL,
    0x0395 | HAS_VOWEL,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    0x0395 | HAS_VOWEL,
    0x0395 | HAS_VOWEL,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    // U+1F20
    0x0397 | HAS_VOWEL,
    0x0397 | HAS_VOWEL,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL,
    0x0397 | HAS_VOWEL,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    // U+1F30
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    // U+1F40
    0x039F | HAS_VOWEL,
    0x039F | HAS_VOWEL,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    0x039F | HAS_VOWEL,
    0x039F | HAS_VOWEL,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    // U+1F50
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    // U+1F60
    0x03A9 | HAS_VOWEL,
    0x03A9 | HAS_VOWEL,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL,
    0x03A9 | HAS_VOWEL,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    // U+1F70
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    // U+1F80
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    // U+1F90
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    // U+1FA0
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    // U+1FB0
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0,
    0x0399 | HAS_VOWEL,
    0,
    // U+1FC0
    0,
    0,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0,
    0,
    0,
    // U+1FD0
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0,
    0,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    0,
    0,
    // U+1FE0
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x03A1,
    0x03A1,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A1,
    0,
    0,
    0,
    // U+1FF0
    0,
    0,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0,
    0,
    0,
};

// U+2126 Ohm sign
static const uint16_t data2126 = 0x03A9 | HAS_VOWEL;

/**
 * Looks up the Greek letter data for c.
 * Covers U+0370..03FF (data0370), U+1F00..1FFF (data1F00) and U+2126.
 * @return the table entry for c, or 0 for every other code point
 *         (and for table entries that are themselves 0)
 */
uint32_t getLetterData(UChar32 c) {
    // Quick reject of everything outside the three covered ranges.
    if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) {
        return 0;
    } else if (c <= 0x3ff) {
        return data0370[c - 0x370];
    } else if (c <= 0x1fff) {
        return data1F00[c - 0x1f00];
    } else if (c == 0x2126) {
        return data2126;
    } else {
        // U+2000..2125
        return 0;
    }
}

/**
 * Classifies a combining mark that may follow a Greek letter.
 * @return the HAS_* flag(s) contributed by c, or 0 if c is not one of the
 *         diacritics listed here
 */
uint32_t getDiacriticData(UChar32 c) {
    switch (c) {
    case 0x0300:  // varia
    case 0x0301:  // tonos = oxia
    case 0x0342:  // perispomeni
    case 0x0302:  // circumflex can look like perispomeni
    case 0x0303:  // tilde can look like perispomeni
    case 0x0311:  // inverted breve can look like perispomeni
        return HAS_ACCENT;
    case 0x0308:  // dialytika = diaeresis
        return HAS_COMBINING_DIALYTIKA;
    case 0x0344:  // dialytika tonos
        return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
    case 0x0345:  // ypogegrammeni = iota subscript
        return HAS_YPOGEGRAMMENI;
    case 0x0304:  // macron
    case 0x0306:  // breve
    case 0x0313:  // comma above
    case 0x0314:  // reversed comma above
    case 0x0343:  // koronis
        return HAS_OTHER_GREEK_DIACRITIC;
    default:
        return 0;
    }
}

/**
 * Scans s[i..length[ and reports whether the first character that is not
 * case-ignorable is a cased letter.
 * @return TRUE if a cased letter follows (possibly after case-ignorable
 *         characters); FALSE if an uncased, not case-ignorable character
 *         comes first or the text ends
 */
UBool isFollowedByCasedLetter(const UChar *s, int32_t i, int32_t length) {
    while (i < length) {
        UChar32 c;
        U16_NEXT(s, i, length, c);
        int32_t type = ucase_getTypeOrIgnorable(c);
        if ((type & UCASE_IGNORABLE) != 0) {
            // Case-ignorable, continue with the loop.
        } else if (type != UCASE_NONE) {
            return TRUE;  // Followed by cased letter.
        } else {
            return FALSE;  // Uncased and not case-ignorable.
        }
    }
    return FALSE;  // Not followed by cased letter.
}

/**
 * Greek string uppercasing with a state machine.
 * Probably simpler than a stateless function that has to figure out complex context-before
 * for each character.
 * TODO: Try to re-consolidate one way or another with the non-Greek function.
858 */ 859 int32_t toUpper(uint32_t options, 860 UChar *dest, int32_t destCapacity, 861 const UChar *src, int32_t srcLength, 862 Edits *edits, 863 UErrorCode &errorCode) { 864 int32_t destIndex=0; 865 uint32_t state = 0; 866 for (int32_t i = 0; i < srcLength;) { 867 int32_t nextIndex = i; 868 UChar32 c; 869 U16_NEXT(src, nextIndex, srcLength, c); 870 uint32_t nextState = 0; 871 int32_t type = ucase_getTypeOrIgnorable(c); 872 if ((type & UCASE_IGNORABLE) != 0) { 873 // c is case-ignorable 874 nextState |= (state & AFTER_CASED); 875 } else if (type != UCASE_NONE) { 876 // c is cased 877 nextState |= AFTER_CASED; 878 } 879 uint32_t data = getLetterData(c); 880 if (data > 0) { 881 uint32_t upper = data & UPPER_MASK; 882 // Add a dialytika to this iota or ypsilon vowel 883 // if we removed a tonos from the previous vowel, 884 // and that previous vowel did not also have (or gain) a dialytika. 885 // Adding one only to the final vowel in a longer sequence 886 // (which does not occur in normal writing) would require lookahead. 887 // Set the same flag as for preserving an existing dialytika. 888 if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 && 889 (upper == 0x399 || upper == 0x3A5)) { 890 data |= HAS_DIALYTIKA; 891 } 892 int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota. 893 if ((data & HAS_YPOGEGRAMMENI) != 0) { 894 numYpogegrammeni = 1; 895 } 896 // Skip combining diacritics after this Greek letter. 897 while (nextIndex < srcLength) { 898 uint32_t diacriticData = getDiacriticData(src[nextIndex]); 899 if (diacriticData != 0) { 900 data |= diacriticData; 901 if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) { 902 ++numYpogegrammeni; 903 } 904 ++nextIndex; 905 } else { 906 break; // not a Greek diacritic 907 } 908 } 909 if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) { 910 nextState |= AFTER_VOWEL_WITH_ACCENT; 911 } 912 // Map according to Greek rules. 
913 UBool addTonos = FALSE; 914 if (upper == 0x397 && 915 (data & HAS_ACCENT) != 0 && 916 numYpogegrammeni == 0 && 917 (state & AFTER_CASED) == 0 && 918 !isFollowedByCasedLetter(src, nextIndex, srcLength)) { 919 // Keep disjunctive "or" with (only) a tonos. 920 // We use the same "word boundary" conditions as for the Final_Sigma test. 921 if (i == nextIndex) { 922 upper = 0x389; // Preserve the precomposed form. 923 } else { 924 addTonos = TRUE; 925 } 926 } else if ((data & HAS_DIALYTIKA) != 0) { 927 // Preserve a vowel with dialytika in precomposed form if it exists. 928 if (upper == 0x399) { 929 upper = 0x3AA; 930 data &= ~HAS_EITHER_DIALYTIKA; 931 } else if (upper == 0x3A5) { 932 upper = 0x3AB; 933 data &= ~HAS_EITHER_DIALYTIKA; 934 } 935 } 936 937 UBool change; 938 if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) { 939 change = TRUE; // common, simple usage 940 } else { 941 // Find out first whether we are changing the text. 942 change = src[i] != upper || numYpogegrammeni > 0; 943 int32_t i2 = i + 1; 944 if ((data & HAS_EITHER_DIALYTIKA) != 0) { 945 change |= i2 >= nextIndex || src[i2] != 0x308; 946 ++i2; 947 } 948 if (addTonos) { 949 change |= i2 >= nextIndex || src[i2] != 0x301; 950 ++i2; 951 } 952 int32_t oldLength = nextIndex - i; 953 int32_t newLength = (i2 - i) + numYpogegrammeni; 954 change |= oldLength != newLength; 955 if (change) { 956 if (edits != NULL) { 957 edits->addReplace(oldLength, newLength); 958 } 959 } else { 960 if (edits != NULL) { 961 edits->addUnchanged(oldLength); 962 } 963 // Write unchanged text? 
964 change = (options & U_OMIT_UNCHANGED_TEXT) == 0; 965 } 966 } 967 968 if (change) { 969 destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper); 970 if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) { 971 destIndex=appendUChar(dest, destIndex, destCapacity, 0x308); // restore or add a dialytika 972 } 973 if (destIndex >= 0 && addTonos) { 974 destIndex=appendUChar(dest, destIndex, destCapacity, 0x301); 975 } 976 while (destIndex >= 0 && numYpogegrammeni > 0) { 977 destIndex=appendUChar(dest, destIndex, destCapacity, 0x399); 978 --numYpogegrammeni; 979 } 980 if(destIndex<0) { 981 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 982 return 0; 983 } 984 } 985 } else { 986 const UChar *s; 987 c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK); 988 destIndex = appendResult(dest, destIndex, destCapacity, c, s, 989 nextIndex - i, options, edits); 990 if (destIndex < 0) { 991 errorCode = U_INDEX_OUTOFBOUNDS_ERROR; 992 return 0; 993 } 994 } 995 i = nextIndex; 996 state = nextState; 997 } 998 999 return destIndex; 1000 } 1001 1002 } // namespace GreekUpper 1003 U_NAMESPACE_END 1004 1005 /* functions available in the common library (for unistr_case.cpp) */ 1006 1007 U_CFUNC int32_t U_CALLCONV 1008 ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED 1009 UChar *dest, int32_t destCapacity, 1010 const UChar *src, int32_t srcLength, 1011 icu::Edits *edits, 1012 UErrorCode &errorCode) { 1013 UCaseContext csc=UCASECONTEXT_INITIALIZER; 1014 csc.p=(void *)src; 1015 csc.limit=srcLength; 1016 int32_t destIndex = _caseMap( 1017 caseLocale, options, ucase_toFullLower, 1018 dest, destCapacity, 1019 src, &csc, 0, srcLength, 1020 edits, errorCode); 1021 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); 1022 } 1023 1024 U_CFUNC int32_t U_CALLCONV 1025 ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED 1026 UChar *dest, int32_t destCapacity, 1027 const UChar *src, 
int32_t srcLength, 1028 icu::Edits *edits, 1029 UErrorCode &errorCode) { 1030 int32_t destIndex; 1031 if (caseLocale == UCASE_LOC_GREEK) { 1032 destIndex = GreekUpper::toUpper(options, dest, destCapacity, 1033 src, srcLength, edits, errorCode); 1034 } else { 1035 UCaseContext csc=UCASECONTEXT_INITIALIZER; 1036 csc.p=(void *)src; 1037 csc.limit=srcLength; 1038 destIndex = _caseMap( 1039 caseLocale, options, ucase_toFullUpper, 1040 dest, destCapacity, 1041 src, &csc, 0, srcLength, 1042 edits, errorCode); 1043 } 1044 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); 1045 } 1046 1047 U_CFUNC int32_t U_CALLCONV 1048 ustrcase_internalFold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED 1049 UChar *dest, int32_t destCapacity, 1050 const UChar *src, int32_t srcLength, 1051 icu::Edits *edits, 1052 UErrorCode &errorCode) { 1053 /* case mapping loop */ 1054 int32_t srcIndex = 0; 1055 int32_t destIndex = 0; 1056 while (srcIndex < srcLength) { 1057 int32_t cpStart = srcIndex; 1058 UChar32 c; 1059 U16_NEXT(src, srcIndex, srcLength, c); 1060 const UChar *s; 1061 c = ucase_toFullFolding(c, &s, options); 1062 destIndex = appendResult(dest, destIndex, destCapacity, c, s, 1063 srcIndex - cpStart, options, edits); 1064 if (destIndex < 0) { 1065 errorCode = U_INDEX_OUTOFBOUNDS_ERROR; 1066 return 0; 1067 } 1068 } 1069 1070 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); 1071 } 1072 1073 U_CFUNC int32_t 1074 ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM 1075 UChar *dest, int32_t destCapacity, 1076 const UChar *src, int32_t srcLength, 1077 UStringCaseMapper *stringCaseMapper, 1078 icu::Edits *edits, 1079 UErrorCode &errorCode) { 1080 int32_t destLength; 1081 1082 /* check argument values */ 1083 if(U_FAILURE(errorCode)) { 1084 return 0; 1085 } 1086 if( destCapacity<0 || 1087 (dest==NULL && destCapacity>0) || 1088 src==NULL || 1089 srcLength<-1 1090 ) { 1091 
errorCode=U_ILLEGAL_ARGUMENT_ERROR; 1092 return 0; 1093 } 1094 1095 /* get the string length */ 1096 if(srcLength==-1) { 1097 srcLength=u_strlen(src); 1098 } 1099 1100 /* check for overlapping source and destination */ 1101 if( dest!=NULL && 1102 ((src>=dest && src<(dest+destCapacity)) || 1103 (dest>=src && dest<(src+srcLength))) 1104 ) { 1105 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 1106 return 0; 1107 } 1108 1109 if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) { 1110 edits->reset(); 1111 } 1112 destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR 1113 dest, destCapacity, src, srcLength, edits, errorCode); 1114 return u_terminateUChars(dest, destCapacity, destLength, &errorCode); 1115 } 1116 1117 U_CFUNC int32_t 1118 ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM 1119 UChar *dest, int32_t destCapacity, 1120 const UChar *src, int32_t srcLength, 1121 UStringCaseMapper *stringCaseMapper, 1122 UErrorCode &errorCode) { 1123 UChar buffer[300]; 1124 UChar *temp; 1125 1126 int32_t destLength; 1127 1128 /* check argument values */ 1129 if(U_FAILURE(errorCode)) { 1130 return 0; 1131 } 1132 if( destCapacity<0 || 1133 (dest==NULL && destCapacity>0) || 1134 src==NULL || 1135 srcLength<-1 1136 ) { 1137 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 1138 return 0; 1139 } 1140 1141 /* get the string length */ 1142 if(srcLength==-1) { 1143 srcLength=u_strlen(src); 1144 } 1145 1146 /* check for overlapping source and destination */ 1147 if( dest!=NULL && 1148 ((src>=dest && src<(dest+destCapacity)) || 1149 (dest>=src && dest<(src+srcLength))) 1150 ) { 1151 /* overlap: provide a temporary destination buffer and later copy the result */ 1152 if(destCapacity<=UPRV_LENGTHOF(buffer)) { 1153 /* the stack buffer is large enough */ 1154 temp=buffer; 1155 } else { 1156 /* allocate a buffer */ 1157 temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR); 1158 if(temp==NULL) { 1159 errorCode=U_MEMORY_ALLOCATION_ERROR; 1160 return 0; 
1161 } 1162 } 1163 } else { 1164 temp=dest; 1165 } 1166 1167 destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR 1168 temp, destCapacity, src, srcLength, NULL, errorCode); 1169 if(temp!=dest) { 1170 /* copy the result string to the destination buffer */ 1171 if (U_SUCCESS(errorCode) && 0 < destLength && destLength <= destCapacity) { 1172 u_memmove(dest, temp, destLength); 1173 } 1174 if(temp!=buffer) { 1175 uprv_free(temp); 1176 } 1177 } 1178 1179 return u_terminateUChars(dest, destCapacity, destLength, &errorCode); 1180 } 1181 1182 /* public API functions */ 1183 1184 U_CAPI int32_t U_EXPORT2 1185 u_strFoldCase(UChar *dest, int32_t destCapacity, 1186 const UChar *src, int32_t srcLength, 1187 uint32_t options, 1188 UErrorCode *pErrorCode) { 1189 return ustrcase_mapWithOverlap( 1190 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL 1191 dest, destCapacity, 1192 src, srcLength, 1193 ustrcase_internalFold, *pErrorCode); 1194 } 1195 1196 U_NAMESPACE_BEGIN 1197 1198 int32_t CaseMap::fold( 1199 uint32_t options, 1200 const UChar *src, int32_t srcLength, 1201 UChar *dest, int32_t destCapacity, Edits *edits, 1202 UErrorCode &errorCode) { 1203 return ustrcase_map( 1204 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL 1205 dest, destCapacity, 1206 src, srcLength, 1207 ustrcase_internalFold, edits, errorCode); 1208 } 1209 1210 U_NAMESPACE_END 1211 1212 /* case-insensitive string comparisons -------------------------------------- */ 1213 1214 /* 1215 * This function is a copy of unorm_cmpEquivFold() minus the parts for 1216 * canonical equivalence. 1217 * Keep the functions in sync, and see there for how this works. 1218 * The duplication is for modularization: 1219 * It makes caseless (but not canonical caseless) matches independent of 1220 * the normalization code. 
1221 */ 1222 1223 /* stack element for previous-level source/decomposition pointers */ 1224 struct CmpEquivLevel { 1225 const UChar *start, *s, *limit; 1226 }; 1227 typedef struct CmpEquivLevel CmpEquivLevel; 1228 1229 /** 1230 * Internal implementation code comparing string with case fold. 1231 * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch(). 1232 * 1233 * @param s1 input string 1 1234 * @param length1 length of string 1, or -1 (NULL terminated) 1235 * @param s2 input string 2 1236 * @param length2 length of string 2, or -1 (NULL terminated) 1237 * @param options compare options 1238 * @param matchLen1 (output) length of partial prefix match in s1 1239 * @param matchLen2 (output) length of partial prefix match in s2 1240 * @param pErrorCode receives error status 1241 * @return The result of comparison 1242 */ 1243 static int32_t _cmpFold( 1244 const UChar *s1, int32_t length1, 1245 const UChar *s2, int32_t length2, 1246 uint32_t options, 1247 int32_t *matchLen1, int32_t *matchLen2, 1248 UErrorCode *pErrorCode) { 1249 int32_t cmpRes = 0; 1250 1251 /* current-level start/limit - s1/s2 as current */ 1252 const UChar *start1, *start2, *limit1, *limit2; 1253 1254 /* points to the original start address */ 1255 const UChar *org1, *org2; 1256 1257 /* points to the end of match + 1 */ 1258 const UChar *m1, *m2; 1259 1260 /* case folding variables */ 1261 const UChar *p; 1262 int32_t length; 1263 1264 /* stacks of previous-level start/current/limit */ 1265 CmpEquivLevel stack1[2], stack2[2]; 1266 1267 /* case folding buffers, only use current-level start/limit */ 1268 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1]; 1269 1270 /* track which is the current level per string */ 1271 int32_t level1, level2; 1272 1273 /* current code units, and code points for lookups */ 1274 UChar32 c1, c2, cp1, cp2; 1275 1276 /* no argument error checking because this itself is not an API */ 1277 1278 /* 1279 * assume that at least the 
option U_COMPARE_IGNORE_CASE is set 1280 * otherwise this function would have to behave exactly as uprv_strCompare() 1281 */ 1282 if(U_FAILURE(*pErrorCode)) { 1283 return 0; 1284 } 1285 1286 /* initialize */ 1287 if(matchLen1) { 1288 U_ASSERT(matchLen2 !=NULL); 1289 *matchLen1=0; 1290 *matchLen2=0; 1291 } 1292 1293 start1=m1=org1=s1; 1294 if(length1==-1) { 1295 limit1=NULL; 1296 } else { 1297 limit1=s1+length1; 1298 } 1299 1300 start2=m2=org2=s2; 1301 if(length2==-1) { 1302 limit2=NULL; 1303 } else { 1304 limit2=s2+length2; 1305 } 1306 1307 level1=level2=0; 1308 c1=c2=-1; 1309 1310 /* comparison loop */ 1311 for(;;) { 1312 /* 1313 * here a code unit value of -1 means "get another code unit" 1314 * below it will mean "this source is finished" 1315 */ 1316 1317 if(c1<0) { 1318 /* get next code unit from string 1, post-increment */ 1319 for(;;) { 1320 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) { 1321 if(level1==0) { 1322 c1=-1; 1323 break; 1324 } 1325 } else { 1326 ++s1; 1327 break; 1328 } 1329 1330 /* reached end of level buffer, pop one level */ 1331 do { 1332 --level1; 1333 start1=stack1[level1].start; /*Not uninitialized*/ 1334 } while(start1==NULL); 1335 s1=stack1[level1].s; /*Not uninitialized*/ 1336 limit1=stack1[level1].limit; /*Not uninitialized*/ 1337 } 1338 } 1339 1340 if(c2<0) { 1341 /* get next code unit from string 2, post-increment */ 1342 for(;;) { 1343 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) { 1344 if(level2==0) { 1345 c2=-1; 1346 break; 1347 } 1348 } else { 1349 ++s2; 1350 break; 1351 } 1352 1353 /* reached end of level buffer, pop one level */ 1354 do { 1355 --level2; 1356 start2=stack2[level2].start; /*Not uninitialized*/ 1357 } while(start2==NULL); 1358 s2=stack2[level2].s; /*Not uninitialized*/ 1359 limit2=stack2[level2].limit; /*Not uninitialized*/ 1360 } 1361 } 1362 1363 /* 1364 * compare c1 and c2 1365 * either variable c1, c2 is -1 only if the corresponding string is 
finished 1366 */ 1367 if(c1==c2) { 1368 const UChar *next1, *next2; 1369 1370 if(c1<0) { 1371 cmpRes=0; /* c1==c2==-1 indicating end of strings */ 1372 break; 1373 } 1374 1375 /* 1376 * Note: Move the match positions in both strings at the same time 1377 * only when corresponding code point(s) in the original strings 1378 * are fully consumed. For example, when comparing s1="Fust" and 1379 * s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches 1380 * the first code point in the case-folded data. But the second "s" 1381 * has no matching code point in s1, so this implementation returns 1382 * 2 as the prefix match length ("Fu"). 1383 */ 1384 next1=next2=NULL; 1385 if(level1==0) { 1386 next1=s1; 1387 } else if(s1==limit1) { 1388 /* Note: This implementation only use a single level of stack. 1389 * If this code needs to be changed to use multiple levels 1390 * of stacks, the code above should check if the current 1391 * code is at the end of all stacks. 1392 */ 1393 U_ASSERT(level1==1); 1394 1395 /* is s1 at the end of the current stack? */ 1396 next1=stack1[0].s; 1397 } 1398 1399 if (next1!=NULL) { 1400 if(level2==0) { 1401 next2=s2; 1402 } else if(s2==limit2) { 1403 U_ASSERT(level2==1); 1404 1405 /* is s2 at the end of the current stack? 
*/ 1406 next2=stack2[0].s; 1407 } 1408 if(next2!=NULL) { 1409 m1=next1; 1410 m2=next2; 1411 } 1412 } 1413 c1=c2=-1; /* make us fetch new code units */ 1414 continue; 1415 } else if(c1<0) { 1416 cmpRes=-1; /* string 1 ends before string 2 */ 1417 break; 1418 } else if(c2<0) { 1419 cmpRes=1; /* string 2 ends before string 1 */ 1420 break; 1421 } 1422 /* c1!=c2 && c1>=0 && c2>=0 */ 1423 1424 /* get complete code points for c1, c2 for lookups if either is a surrogate */ 1425 cp1=c1; 1426 if(U_IS_SURROGATE(c1)) { 1427 UChar c; 1428 1429 if(U_IS_SURROGATE_LEAD(c1)) { 1430 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) { 1431 /* advance ++s1; only below if cp1 decomposes/case-folds */ 1432 cp1=U16_GET_SUPPLEMENTARY(c1, c); 1433 } 1434 } else /* isTrail(c1) */ { 1435 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) { 1436 cp1=U16_GET_SUPPLEMENTARY(c, c1); 1437 } 1438 } 1439 } 1440 1441 cp2=c2; 1442 if(U_IS_SURROGATE(c2)) { 1443 UChar c; 1444 1445 if(U_IS_SURROGATE_LEAD(c2)) { 1446 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) { 1447 /* advance ++s2; only below if cp2 decomposes/case-folds */ 1448 cp2=U16_GET_SUPPLEMENTARY(c2, c); 1449 } 1450 } else /* isTrail(c2) */ { 1451 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) { 1452 cp2=U16_GET_SUPPLEMENTARY(c, c2); 1453 } 1454 } 1455 } 1456 1457 /* 1458 * go down one level for each string 1459 * continue with the main loop as soon as there is a real change 1460 */ 1461 1462 if( level1==0 && 1463 (length=ucase_toFullFolding((UChar32)cp1, &p, options))>=0 1464 ) { 1465 /* cp1 case-folds to the code point "length" or to p[length] */ 1466 if(U_IS_SURROGATE(c1)) { 1467 if(U_IS_SURROGATE_LEAD(c1)) { 1468 /* advance beyond source surrogate pair if it case-folds */ 1469 ++s1; 1470 } else /* isTrail(c1) */ { 1471 /* 1472 * we got a supplementary code point when hitting its trail surrogate, 1473 * therefore the lead surrogate must have been the same as in the other string; 1474 * compare this decomposition with the lead surrogate in the other string 1475 * 
remember that this simulates bulk text replacement: 1476 * the decomposition would replace the entire code point 1477 */ 1478 --s2; 1479 --m2; 1480 c2=*(s2-1); 1481 } 1482 } 1483 1484 /* push current level pointers */ 1485 stack1[0].start=start1; 1486 stack1[0].s=s1; 1487 stack1[0].limit=limit1; 1488 ++level1; 1489 1490 /* copy the folding result to fold1[] */ 1491 if(length<=UCASE_MAX_STRING_LENGTH) { 1492 u_memcpy(fold1, p, length); 1493 } else { 1494 int32_t i=0; 1495 U16_APPEND_UNSAFE(fold1, i, length); 1496 length=i; 1497 } 1498 1499 /* set next level pointers to case folding */ 1500 start1=s1=fold1; 1501 limit1=fold1+length; 1502 1503 /* get ready to read from decomposition, continue with loop */ 1504 c1=-1; 1505 continue; 1506 } 1507 1508 if( level2==0 && 1509 (length=ucase_toFullFolding((UChar32)cp2, &p, options))>=0 1510 ) { 1511 /* cp2 case-folds to the code point "length" or to p[length] */ 1512 if(U_IS_SURROGATE(c2)) { 1513 if(U_IS_SURROGATE_LEAD(c2)) { 1514 /* advance beyond source surrogate pair if it case-folds */ 1515 ++s2; 1516 } else /* isTrail(c2) */ { 1517 /* 1518 * we got a supplementary code point when hitting its trail surrogate, 1519 * therefore the lead surrogate must have been the same as in the other string; 1520 * compare this decomposition with the lead surrogate in the other string 1521 * remember that this simulates bulk text replacement: 1522 * the decomposition would replace the entire code point 1523 */ 1524 --s1; 1525 --m2; 1526 c1=*(s1-1); 1527 } 1528 } 1529 1530 /* push current level pointers */ 1531 stack2[0].start=start2; 1532 stack2[0].s=s2; 1533 stack2[0].limit=limit2; 1534 ++level2; 1535 1536 /* copy the folding result to fold2[] */ 1537 if(length<=UCASE_MAX_STRING_LENGTH) { 1538 u_memcpy(fold2, p, length); 1539 } else { 1540 int32_t i=0; 1541 U16_APPEND_UNSAFE(fold2, i, length); 1542 length=i; 1543 } 1544 1545 /* set next level pointers to case folding */ 1546 start2=s2=fold2; 1547 limit2=fold2+length; 1548 1549 /* get 
ready to read from decomposition, continue with loop */ 1550 c2=-1; 1551 continue; 1552 } 1553 1554 /* 1555 * no decomposition/case folding, max level for both sides: 1556 * return difference result 1557 * 1558 * code point order comparison must not just return cp1-cp2 1559 * because when single surrogates are present then the surrogate pairs 1560 * that formed cp1 and cp2 may be from different string indexes 1561 * 1562 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units 1563 * c1=d800 cp1=10001 c2=dc00 cp2=10000 1564 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 } 1565 * 1566 * therefore, use same fix-up as in ustring.c/uprv_strCompare() 1567 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++ 1568 * so we have slightly different pointer/start/limit comparisons here 1569 */ 1570 1571 if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) { 1572 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ 1573 if( 1574 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) || 1575 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2))) 1576 ) { 1577 /* part of a surrogate pair, leave >=d800 */ 1578 } else { 1579 /* BMP code point - may be surrogate code point - make <d800 */ 1580 c1-=0x2800; 1581 } 1582 1583 if( 1584 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) || 1585 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2))) 1586 ) { 1587 /* part of a surrogate pair, leave >=d800 */ 1588 } else { 1589 /* BMP code point - may be surrogate code point - make <d800 */ 1590 c2-=0x2800; 1591 } 1592 } 1593 1594 cmpRes=c1-c2; 1595 break; 1596 } 1597 1598 if(matchLen1) { 1599 *matchLen1=m1-org1; 1600 *matchLen2=m2-org2; 1601 } 1602 return cmpRes; 1603 } 1604 1605 /* internal function */ 1606 U_CFUNC int32_t 1607 u_strcmpFold(const UChar *s1, int32_t length1, 1608 const UChar *s2, int32_t length2, 1609 uint32_t options, 1610 UErrorCode *pErrorCode) 
{ 1611 return _cmpFold(s1, length1, s2, length2, options, NULL, NULL, pErrorCode); 1612 } 1613 1614 /* public API functions */ 1615 1616 U_CAPI int32_t U_EXPORT2 1617 u_strCaseCompare(const UChar *s1, int32_t length1, 1618 const UChar *s2, int32_t length2, 1619 uint32_t options, 1620 UErrorCode *pErrorCode) { 1621 /* argument checking */ 1622 if(pErrorCode==0 || U_FAILURE(*pErrorCode)) { 1623 return 0; 1624 } 1625 if(s1==NULL || length1<-1 || s2==NULL || length2<-1) { 1626 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1627 return 0; 1628 } 1629 return u_strcmpFold(s1, length1, s2, length2, 1630 options|U_COMPARE_IGNORE_CASE, 1631 pErrorCode); 1632 } 1633 1634 U_CAPI int32_t U_EXPORT2 1635 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) { 1636 UErrorCode errorCode=U_ZERO_ERROR; 1637 return u_strcmpFold(s1, -1, s2, -1, 1638 options|U_COMPARE_IGNORE_CASE, 1639 &errorCode); 1640 } 1641 1642 U_CAPI int32_t U_EXPORT2 1643 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) { 1644 UErrorCode errorCode=U_ZERO_ERROR; 1645 return u_strcmpFold(s1, length, s2, length, 1646 options|U_COMPARE_IGNORE_CASE, 1647 &errorCode); 1648 } 1649 1650 U_CAPI int32_t U_EXPORT2 1651 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) { 1652 UErrorCode errorCode=U_ZERO_ERROR; 1653 return u_strcmpFold(s1, n, s2, n, 1654 options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE), 1655 &errorCode); 1656 } 1657 1658 /* internal API - detect length of shared prefix */ 1659 U_CAPI void 1660 u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1, 1661 const UChar *s2, int32_t length2, 1662 uint32_t options, 1663 int32_t *matchLen1, int32_t *matchLen2, 1664 UErrorCode *pErrorCode) { 1665 _cmpFold(s1, length1, s2, length2, options, 1666 matchLen1, matchLen2, pErrorCode); 1667 } 1668