1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2005-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: ucasemap.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2005may06 16 * created by: Markus W. Scherer 17 * 18 * Case mapping service object and functions using it. 19 */ 20 21 #include "unicode/utypes.h" 22 #include "unicode/brkiter.h" 23 #include "unicode/bytestream.h" 24 #include "unicode/casemap.h" 25 #include "unicode/edits.h" 26 #include "unicode/stringoptions.h" 27 #include "unicode/stringpiece.h" 28 #include "unicode/ubrk.h" 29 #include "unicode/uloc.h" 30 #include "unicode/ustring.h" 31 #include "unicode/ucasemap.h" 32 #if !UCONFIG_NO_BREAK_ITERATION 33 #include "unicode/utext.h" 34 #endif 35 #include "unicode/utf.h" 36 #include "unicode/utf8.h" 37 #include "unicode/utf16.h" 38 #include "bytesinkutil.h" 39 #include "cmemory.h" 40 #include "cstring.h" 41 #include "uassert.h" 42 #include "ucase.h" 43 #include "ucasemap_imp.h" 44 #include "ustr_imp.h" 45 46 U_NAMESPACE_USE 47 48 /* UCaseMap service object -------------------------------------------------- */ 49 50 UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode) : 51 #if !UCONFIG_NO_BREAK_ITERATION 52 iter(NULL), 53 #endif 54 caseLocale(UCASE_LOC_UNKNOWN), options(opts) { 55 ucasemap_setLocale(this, localeID, pErrorCode); 56 } 57 58 UCaseMap::~UCaseMap() { 59 #if !UCONFIG_NO_BREAK_ITERATION 60 delete iter; 61 #endif 62 } 63 64 U_CAPI UCaseMap * U_EXPORT2 65 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) { 66 if(U_FAILURE(*pErrorCode)) { 67 return NULL; 68 } 69 UCaseMap *csm = new UCaseMap(locale, 
options, pErrorCode); 70 if(csm==NULL) { 71 *pErrorCode = U_MEMORY_ALLOCATION_ERROR; 72 return NULL; 73 } else if (U_FAILURE(*pErrorCode)) { 74 delete csm; 75 return NULL; 76 } 77 return csm; 78 } 79 80 U_CAPI void U_EXPORT2 81 ucasemap_close(UCaseMap *csm) { 82 delete csm; 83 } 84 85 U_CAPI const char * U_EXPORT2 86 ucasemap_getLocale(const UCaseMap *csm) { 87 return csm->locale; 88 } 89 90 U_CAPI uint32_t U_EXPORT2 91 ucasemap_getOptions(const UCaseMap *csm) { 92 return csm->options; 93 } 94 95 U_CAPI void U_EXPORT2 96 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) { 97 if(U_FAILURE(*pErrorCode)) { 98 return; 99 } 100 if (locale != NULL && *locale == 0) { 101 csm->locale[0] = 0; 102 csm->caseLocale = UCASE_LOC_ROOT; 103 return; 104 } 105 106 int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode); 107 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) { 108 *pErrorCode=U_ZERO_ERROR; 109 /* we only really need the language code for case mappings */ 110 length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode); 111 } 112 if(length==sizeof(csm->locale)) { 113 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 114 } 115 if(U_SUCCESS(*pErrorCode)) { 116 csm->caseLocale=UCASE_LOC_UNKNOWN; 117 csm->caseLocale = ucase_getCaseLocale(csm->locale); 118 } else { 119 csm->locale[0]=0; 120 csm->caseLocale = UCASE_LOC_ROOT; 121 } 122 } 123 124 U_CAPI void U_EXPORT2 125 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) { 126 if(U_FAILURE(*pErrorCode)) { 127 return; 128 } 129 csm->options=options; 130 } 131 132 /* UTF-8 string case mappings ----------------------------------------------- */ 133 134 /* TODO(markus): Move to a new, separate utf8case.cpp file. 
*/ 135 136 namespace { 137 138 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */ 139 inline UBool 140 appendResult(int32_t cpLength, int32_t result, const UChar *s, 141 ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) { 142 U_ASSERT(U_SUCCESS(errorCode)); 143 144 /* decode the result */ 145 if(result<0) { 146 /* (not) original code point */ 147 if(edits!=NULL) { 148 edits->addUnchanged(cpLength); 149 } 150 if((options & U_OMIT_UNCHANGED_TEXT) == 0) { 151 ByteSinkUtil::appendCodePoint(cpLength, ~result, sink); 152 } 153 } else { 154 if(result<=UCASE_MAX_STRING_LENGTH) { 155 // string: "result" is the UTF-16 length 156 return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode); 157 } else { 158 ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits); 159 } 160 } 161 return TRUE; 162 } 163 164 // See unicode/utf8.h U8_APPEND_UNSAFE(). 165 inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); } 166 inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); } 167 168 UChar32 U_CALLCONV 169 utf8_caseContextIterator(void *context, int8_t dir) { 170 UCaseContext *csc=(UCaseContext *)context; 171 UChar32 c; 172 173 if(dir<0) { 174 /* reset for backward iteration */ 175 csc->index=csc->cpStart; 176 csc->dir=dir; 177 } else if(dir>0) { 178 /* reset for forward iteration */ 179 csc->index=csc->cpLimit; 180 csc->dir=dir; 181 } else { 182 /* continue current iteration direction */ 183 dir=csc->dir; 184 } 185 186 if(dir<0) { 187 if(csc->start<csc->index) { 188 U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c); 189 return c; 190 } 191 } else { 192 if(csc->index<csc->limit) { 193 U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c); 194 return c; 195 } 196 } 197 return U_SENTINEL; 198 } 199 200 /** 201 * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account. 
* caseLocale < 0: Case-folds [srcStart..srcLimit[.
*
* Output goes to sink. Runs of bytes that need no change are tracked via "prev"
* and flushed with ByteSinkUtil::appendUnchanged(), which receives options and edits.
* csc is dereferenced only when caseLocale >= 0 (the folding path never touches it,
* so it may be null there). The loops stop early once errorCode is a failure.
*/
void toLower(int32_t caseLocale, uint32_t options,
             const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
             icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
    // Choose the delta table that is indexed below by code points up to U+017F.
    const int8_t *latinToLower;
    if (caseLocale == UCASE_LOC_ROOT ||
            (caseLocale >= 0 ?
                !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
                (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
        latinToLower = LatinCase::TO_LOWER_NORMAL;
    } else {
        latinToLower = LatinCase::TO_LOWER_TR_LT;
    }
    const UTrie2 *trie = ucase_getTrie();
    int32_t prev = srcStart;      // start of the pending run of unchanged bytes
    int32_t srcIndex = srcStart;  // read position
    for (;;) {
        // fast path for simple cases
        // The inner loop ends with c < 0 (input exhausted or error) or with
        // c/cpStart describing a code point that needs the slow path.
        int32_t cpStart;
        UChar32 c;
        for (;;) {
            if (U_FAILURE(errorCode) || srcIndex >= srcLimit) {
                c = U_SENTINEL;
                break;
            }
            uint8_t lead = src[srcIndex++];
            if (lead <= 0x7f) {
                // ASCII: table entry is EXC (slow path), 0 (unchanged) or a delta.
                int8_t d = latinToLower[lead];
                if (d == LatinCase::EXC) {
                    cpStart = srcIndex - 1;
                    c = lead;
                    break;
                }
                if (d == 0) { continue; }
                ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
                                              sink, options, edits, errorCode);
                char ascii = (char)(lead + d);
                sink.Append(&ascii, 1);
                if (edits != nullptr) {
                    edits->addReplace(1, 1);
                }
                prev = srcIndex;
                continue;
            } else if (lead < 0xe3) {
                uint8_t t;
                // Two-byte sequence with lead C2..C5 and a valid trail byte.
                if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit &&
                        (t = src[srcIndex] - 0x80) <= 0x3f) {
                    // U+0080..U+017F
                    ++srcIndex;
                    c = ((lead - 0xc0) << 6) | t;
                    int8_t d = latinToLower[c];
                    if (d == LatinCase::EXC) {
                        cpStart = srcIndex - 2;
                        break;
                    }
                    if (d == 0) { continue; }
                    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
                                                  sink, options, edits, errorCode);
                    ByteSinkUtil::appendTwoBytes(c + d, sink);
                    if (edits != nullptr) {
                        edits->addReplace(2, 2);
                    }
                    prev = srcIndex;
                    continue;
                }
                // Otherwise fall through to the generic decoding below.
            } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
                    (srcIndex + 2) <= srcLimit &&
                    U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
                // most of CJK: no case mappings
                srcIndex += 2;
                continue;
            }
            // Generic path: back up to the lead byte and decode the code point.
            cpStart = --srcIndex;
            U8_NEXT(src, srcIndex, srcLimit, c);
            if (c < 0) {
                // ill-formed UTF-8
                // (stays part of the pending unchanged run)
                continue;
            }
            uint16_t props = UTRIE2_GET16(trie, c);
            if (UCASE_HAS_EXCEPTION(props)) { break; }
            int32_t delta;
            if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
                continue;
            }
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
            prev = srcIndex;
        }
        if (c < 0) {
            break;
        }
        // slow path
        const UChar *s;
        if (caseLocale >= 0) {
            // Tell the context iterator where the current code point sits.
            csc->cpStart = cpStart;
            csc->cpLimit = srcIndex;
            c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale);
        } else {
            c = ucase_toFullFolding(c, &s, options);
        }
        if (c >= 0) {
            // Changed: flush the pending unchanged run, then the mapping result.
            // (A negative c leaves the code point in the pending run.)
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
            prev = srcIndex;
        }
    }
    // Flush whatever unchanged tail is left.
    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
                                  sink, options, edits, errorCode);
}

/**
 * Uppercases src[0..srcLength[ into sink, using the same fast-path/slow-path
 * structure as toLower() above. csc supplies the context for ucase_toFullUpper()
 * and is always dereferenced on the slow path.
 */
void toUpper(int32_t caseLocale, uint32_t options,
             const uint8_t *src, UCaseContext *csc, int32_t srcLength,
             icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
    // Choose the delta table that is indexed below by code points up to U+017F.
    const int8_t *latinToUpper;
    if (caseLocale == UCASE_LOC_TURKISH) {
        latinToUpper = LatinCase::TO_UPPER_TR;
    } else {
        latinToUpper = LatinCase::TO_UPPER_NORMAL;
    }
    const UTrie2 *trie = ucase_getTrie();
    int32_t prev = 0;      // start of the pending run of unchanged bytes
    int32_t srcIndex = 0;  // read position
    for (;;) {
        // fast path for simple cases
        // The inner loop ends with c < 0 (input exhausted or error) or with
        // c/cpStart describing a code point that needs the slow path.
        int32_t cpStart;
        UChar32 c;
        for (;;) {
            if (U_FAILURE(errorCode) || srcIndex >= srcLength) {
                c = U_SENTINEL;
                break;
            }
            uint8_t lead = src[srcIndex++];
            if (lead <= 0x7f) {
                // ASCII: table entry is EXC (slow path), 0 (unchanged) or a delta.
                int8_t d = latinToUpper[lead];
                if (d == LatinCase::EXC) {
                    cpStart = srcIndex - 1;
                    c = lead;
                    break;
                }
                if (d == 0) { continue; }
                ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
                                              sink, options, edits, errorCode);
                char ascii = (char)(lead + d);
                sink.Append(&ascii, 1);
                if (edits != nullptr) {
                    edits->addReplace(1, 1);
                }
                prev = srcIndex;
                continue;
            } else if (lead < 0xe3) {
                uint8_t t;
                // Two-byte sequence with lead C2..C5 and a valid trail byte.
                if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength &&
                        (t = src[srcIndex] - 0x80) <= 0x3f) {
                    // U+0080..U+017F
                    ++srcIndex;
                    c = ((lead - 0xc0) << 6) | t;
                    int8_t d = latinToUpper[c];
                    if (d == LatinCase::EXC) {
                        cpStart = srcIndex - 2;
                        break;
                    }
                    if (d == 0) { continue; }
                    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
                                                  sink, options, edits, errorCode);
                    ByteSinkUtil::appendTwoBytes(c + d, sink);
                    if (edits != nullptr) {
                        edits->addReplace(2, 2);
                    }
                    prev = srcIndex;
                    continue;
                }
                // Otherwise fall through to the generic decoding below.
            } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
                    (srcIndex + 2) <= srcLength &&
                    U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
                // most of CJK: no case mappings
                srcIndex += 2;
                continue;
            }
            // Generic path: back up to the lead byte and decode the code point.
            cpStart = --srcIndex;
            U8_NEXT(src, srcIndex, srcLength, c);
            if (c < 0) {
                // ill-formed UTF-8
                // (stays part of the pending unchanged run)
                continue;
            }
            uint16_t props = UTRIE2_GET16(trie, c);
            if (UCASE_HAS_EXCEPTION(props)) { break; }
            int32_t delta;
            if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
                continue;
            }
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
            prev = srcIndex;
        }
        if (c < 0) {
            break;
        }
        // slow path
        // Tell the context iterator where the current code point sits.
        csc->cpStart = cpStart;
        csc->cpLimit = srcIndex;
        const UChar *s;
        c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale);
        if (c >= 0) {
            // Changed: flush the pending unchanged run, then the mapping result.
            // (A negative c leaves the code point in the pending run.)
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
            prev = srcIndex;
        }
    }
    // Flush whatever unchanged tail is left.
    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
                                  sink, options, edits, errorCode);
}

}  // namespace

#if !UCONFIG_NO_BREAK_ITERATION

/**
 * Titlecases src[0..srcLength[ into sink.
 * Segment boundaries come from iter (first(), then next()); iter is dereferenced
 * unconditionally, so it must be non-null and is presumably already set to this
 * text by the caller -- TODO confirm.
 * Returns early as soon as an append helper reports failure or errorCode fails.
 */
U_CFUNC void U_CALLCONV
ucasemap_internalUTF8ToTitle(
        int32_t caseLocale, uint32_t options, BreakIterator *iter,
        const uint8_t *src, int32_t srcLength,
        ByteSink &sink, icu::Edits *edits,
        UErrorCode &errorCode) {
    if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
        return;
    }

    /* set up local variables */
    UCaseContext csc=UCASECONTEXT_INITIALIZER;
    csc.p=(void *)src;
    csc.limit=srcLength;
    int32_t prev=0;
    UBool isFirstIndex=TRUE;

    /* titlecasing loop */
    while(prev<srcLength) {
        /* find next index where to titlecase */
        int32_t index;
        if(isFirstIndex) {
            isFirstIndex=FALSE;
            index=iter->first();
        } else {
            index=iter->next();
        }
        // Clamp a missing or out-of-range boundary to the end of the text.
        if(index==UBRK_DONE || index>srcLength) {
            index=srcLength;
        }

        /*
         * Segment [prev..index[ into 3 parts:
         * a) skipped characters (copy as-is) [prev..titleStart[
         * b) first letter (titlecase) [titleStart..titleLimit[
         * c) subsequent characters (lowercase) [titleLimit..index[
         */
        if(prev<index) {
            /* find and copy skipped characters [prev..titleStart[ */
            int32_t titleStart=prev;
            int32_t titleLimit=prev;
            UChar32 c;
            U8_NEXT(src, titleLimit, index, c);
            if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
                // Adjust the titlecasing index to the next cased character,
                // or to the next letter/number/symbol/private use.
                // Stop with titleStart<titleLimit<=index
                // if there is a character to be titlecased,
                // or else stop with titleStart==titleLimit==index.
                UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
                while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
                    titleStart=titleLimit;
                    if(titleLimit==index) {
                        break;
                    }
                    U8_NEXT(src, titleLimit, index, c);
                }
                if (prev < titleStart) {
                    if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,
                                                       sink, options, edits, errorCode)) {
                        return;
                    }
                }
            }

            if(titleStart<titleLimit) {
                /* titlecase c which is from [titleStart..titleLimit[ */
                if(c>=0) {
                    csc.cpStart=titleStart;
                    csc.cpLimit=titleLimit;
                    const UChar *s;
                    c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
                    if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) {
                        return;
                    }
                } else {
                    // Malformed UTF-8.
                    if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,
                                                       sink, options, edits, errorCode)) {
                        return;
                    }
                }

                /* Special case Dutch IJ titlecasing */
                // After an initial I/i: a following 'j' is written as 'J',
                // a following 'J' is copied unchanged; either way it is
                // excluded from the lowercasing below.
                if (titleStart+1 < index &&
                        caseLocale == UCASE_LOC_DUTCH &&
                        (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
                    if (src[titleStart+1] == 0x006A) {
                        ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits);
                        titleLimit++;
                    } else if (src[titleStart+1] == 0x004A) {
                        // Keep the capital J from getting lowercased.
                        if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1,
                                                           sink, options, edits, errorCode)) {
                            return;
                        }
                        titleLimit++;
                    }
                }

                /* lowercase [titleLimit..index[ */
                if(titleLimit<index) {
                    if((options&U_TITLECASE_NO_LOWERCASE)==0) {
                        /* Normal operation: Lowercase the rest of the word. */
                        toLower(caseLocale, options,
                                src, &csc, titleLimit, index,
                                sink, edits, errorCode);
                        if(U_FAILURE(errorCode)) {
                            return;
                        }
                    } else {
                        /* Optionally just copy the rest of the word unchanged. */
                        if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,
                                                           sink, options, edits, errorCode)) {
                            return;
                        }
                    }
                }
            }
        }

        prev=index;
    }
}

#endif

U_NAMESPACE_BEGIN
namespace GreekUpper {

/**
 * Scans s[i..length[ and reports whether the first code point that is not
 * case-ignorable is a cased letter.
 * Returns FALSE if that code point is uncased or if the text ends first.
 */
UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
    while (i < length) {
        UChar32 c;
        U8_NEXT(s, i, length, c);
        int32_t type = ucase_getTypeOrIgnorable(c);
        if ((type & UCASE_IGNORABLE) != 0) {
            // Case-ignorable, continue with the loop.
        } else if (type != UCASE_NONE) {
            return TRUE;  // Followed by cased letter.
        } else {
            return FALSE;  // Uncased and not case-ignorable.
        }
    }
    return FALSE;  // Not followed by cased letter.
}

// Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
/**
 * Greek-specific uppercasing of src[0..srcLength[ into sink.
 * Code points with letter data (getLetterData(c) > 0) are handled here together
 * with the Greek combining diacritics that follow them; all other well-formed
 * code points go through ucase_toFullUpper() with UCASE_LOC_GREEK, and
 * malformed bytes are copied via ByteSinkUtil::appendUnchanged().
 * "state" carries the AFTER_CASED / AFTER_VOWEL_WITH_ACCENT flags from one
 * iteration to the next.
 */
void toUpper(uint32_t options,
             const uint8_t *src, int32_t srcLength,
             ByteSink &sink, Edits *edits,
             UErrorCode &errorCode) {
    uint32_t state = 0;
    for (int32_t i = 0; i < srcLength;) {
        // i is the start of the current code point; nextIndex ends up at the
        // limit of everything consumed in this iteration.
        int32_t nextIndex = i;
        UChar32 c;
        U8_NEXT(src, nextIndex, srcLength, c);
        uint32_t nextState = 0;
        int32_t type = ucase_getTypeOrIgnorable(c);
        if ((type & UCASE_IGNORABLE) != 0) {
            // c is case-ignorable
            nextState |= (state & AFTER_CASED);
        } else if (type != UCASE_NONE) {
            // c is cased
            nextState |= AFTER_CASED;
        }
        uint32_t data = getLetterData(c);
        if (data > 0) {
            uint32_t upper = data & UPPER_MASK;
            // Add a dialytika to this iota or ypsilon vowel
            // if we removed a tonos from the previous vowel,
            // and that previous vowel did not also have (or gain) a dialytika.
            // Adding one only to the final vowel in a longer sequence
            // (which does not occur in normal writing) would require lookahead.
            // Set the same flag as for preserving an existing dialytika.
            if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
                    (upper == 0x399 || upper == 0x3A5)) {
                data |= HAS_DIALYTIKA;
            }
            int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
            if ((data & HAS_YPOGEGRAMMENI) != 0) {
                numYpogegrammeni = 1;
            }
            // Skip combining diacritics after this Greek letter.
            // nextNextIndex looks one code point ahead; nextIndex is only
            // advanced when that code point turns out to be a Greek diacritic.
            int32_t nextNextIndex = nextIndex;
            while (nextIndex < srcLength) {
                UChar32 c2;
                U8_NEXT(src, nextNextIndex, srcLength, c2);
                uint32_t diacriticData = getDiacriticData(c2);
                if (diacriticData != 0) {
                    data |= diacriticData;
                    if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
                        ++numYpogegrammeni;
                    }
                    nextIndex = nextNextIndex;
                } else {
                    break;  // not a Greek diacritic
                }
            }
            if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
                nextState |= AFTER_VOWEL_WITH_ACCENT;
            }
            // Map according to Greek rules.
            UBool addTonos = FALSE;
            if (upper == 0x397 &&
                    (data & HAS_ACCENT) != 0 &&
                    numYpogegrammeni == 0 &&
                    (state & AFTER_CASED) == 0 &&
                    !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
                // Keep disjunctive "or" with (only) a tonos.
                // We use the same "word boundary" conditions as for the Final_Sigma test.
                // NOTE(review): nextIndex was advanced past c above and never moves
                // backwards, so i == nextIndex looks unreachable here -- confirm
                // against the UTF-16 and Java versions.
                if (i == nextIndex) {
                    upper = 0x389;  // Preserve the precomposed form.
                } else {
                    addTonos = TRUE;
                }
            } else if ((data & HAS_DIALYTIKA) != 0) {
                // Preserve a vowel with dialytika in precomposed form if it exists.
                if (upper == 0x399) {
                    upper = 0x3AA;
                    data &= ~HAS_EITHER_DIALYTIKA;
                } else if (upper == 0x3A5) {
                    upper = 0x3AB;
                    data &= ~HAS_EITHER_DIALYTIKA;
                }
            }

            UBool change;
            if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
                change = TRUE;  // common, simple usage
            } else {
                // Find out first whether we are changing the text.
                // Compare the source bytes [i..nextIndex[ with the bytes that
                // would be written: the letter, then optional U+0308, then
                // optional U+0301 (two bytes each).
                U_ASSERT(0x370 <= upper && upper <= 0x3ff);  // 2-byte UTF-8, main Greek block
                change = (i + 2) > nextIndex ||
                        src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) ||
                        numYpogegrammeni > 0;
                int32_t i2 = i + 2;
                if ((data & HAS_EITHER_DIALYTIKA) != 0) {
                    change |= (i2 + 2) > nextIndex ||
                            src[i2] != (uint8_t)u8"\u0308"[0] ||
                            src[i2 + 1] != (uint8_t)u8"\u0308"[1];
                    i2 += 2;
                }
                if (addTonos) {
                    change |= (i2 + 2) > nextIndex ||
                            src[i2] != (uint8_t)u8"\u0301"[0] ||
                            src[i2 + 1] != (uint8_t)u8"\u0301"[1];
                    i2 += 2;
                }
                int32_t oldLength = nextIndex - i;
                int32_t newLength = (i2 - i) + numYpogegrammeni * 2;  // 2 bytes per U+0399
                change |= oldLength != newLength;
                if (change) {
                    if (edits != NULL) {
                        edits->addReplace(oldLength, newLength);
                    }
                } else {
                    if (edits != NULL) {
                        edits->addUnchanged(oldLength);
                    }
                    // Write unchanged text?
                    change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
                }
            }

            // From here on "change" means "write the output bytes".
            if (change) {
                ByteSinkUtil::appendTwoBytes(upper, sink);
                if ((data & HAS_EITHER_DIALYTIKA) != 0) {
                    sink.Append(u8"\u0308", 2);  // restore or add a dialytika
                }
                if (addTonos) {
                    sink.Append(u8"\u0301", 2);
                }
                while (numYpogegrammeni > 0) {
                    sink.Append(u8"\u0399", 2);
                    --numYpogegrammeni;
                }
            }
        } else if(c>=0) {
            // No Greek letter data: ordinary full uppercase mapping, no context iterator.
            const UChar *s;
            c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
            if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {
                return;
            }
        } else {
            // Malformed UTF-8.
            if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,
                                               sink, options, edits, errorCode)) {
                return;
            }
        }
        i = nextIndex;
        state = nextState;
    }
}

}  // namespace GreekUpper
U_NAMESPACE_END

/**
 * UTF8CaseMapper-style entry point for lowercasing: sets up a UCaseContext over
 * the whole source and calls toLower() for [0..srcLength[.
 * UCASEMAP_BREAK_ITERATOR_UNUSED is defined elsewhere; presumably it supplies an
 * unused break iterator parameter when break iteration is compiled in -- confirm.
 */
static void U_CALLCONV
ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
                             const uint8_t *src, int32_t srcLength,
                             icu::ByteSink &sink, icu::Edits *edits,
                             UErrorCode &errorCode) {
    UCaseContext csc=UCASECONTEXT_INITIALIZER;
    csc.p=(void *)src;
    csc.limit=srcLength;
    toLower(
        caseLocale, options,
        src, &csc, 0, srcLength,
        sink, edits, errorCode);
}

/**
 * UTF8CaseMapper-style entry point for uppercasing: Greek goes to
 * GreekUpper::toUpper(), every other case locale to the generic toUpper()
 * with a UCaseContext over the whole source.
 */
static void U_CALLCONV
ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
                             const uint8_t *src, int32_t srcLength,
                             icu::ByteSink &sink, icu::Edits *edits,
                             UErrorCode &errorCode) {
    if (caseLocale == UCASE_LOC_GREEK) {
        GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);
    } else {
        UCaseContext csc=UCASECONTEXT_INITIALIZER;
        csc.p=(void *)src;
        csc.limit=srcLength;
        toUpper(
            caseLocale, options,
            src, &csc, srcLength,
            sink, edits, errorCode);
    }
}

/**
 * UTF8CaseMapper-style entry point for case folding: the case locale argument
 * is ignored; toLower() is called with caseLocale -1 (its folding mode) and no
 * case context.
 */
static void U_CALLCONV
ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
                          const uint8_t *src, int32_t srcLength,
                          icu::ByteSink &sink, icu::Edits *edits,
                          UErrorCode &errorCode) {
    toLower(
        -1, options,
        src, nullptr, 0, srcLength,
        sink, edits, errorCode);
}

/**
 * ByteSink flavor of the generic UTF-8 mapping driver.
 * Validates the arguments (U_ILLEGAL_ARGUMENT_ERROR for a null src with nonzero
 * length, or srcLength < -1), resolves srcLength == -1 with uprv_strlen(),
 * resets edits unless U_EDITS_NO_RESET is set, runs stringCaseMapper, flushes
 * the sink, and finally copies an Edits error into errorCode if the mapping
 * itself succeeded.
 */
void
ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
                 const char *src, int32_t srcLength,
                 UTF8CaseMapper *stringCaseMapper,
                 icu::ByteSink &sink, icu::Edits *edits,
                 UErrorCode &errorCode) {
    /* check argument values */
    if (U_FAILURE(errorCode)) {
        return;
    }
    if ((src == nullptr && srcLength != 0) || srcLength < -1) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // Get the string length.
    if (srcLength == -1) {
        srcLength = (int32_t)uprv_strlen((const char *)src);
    }

    if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
        edits->reset();
    }
    stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
                     (const uint8_t *)src, srcLength, sink, edits, errorCode);
    sink.Flush();
    if (U_SUCCESS(errorCode)) {
        if (edits != nullptr) {
            edits->copyErrorTo(errorCode);
        }
    }
}

/**
 * Buffer flavor of the generic UTF-8 mapping driver.
 * In addition to the checks of the ByteSink flavor it rejects a negative
 * destCapacity, a null dest with positive capacity, and overlapping source and
 * destination ranges (all U_ILLEGAL_ARGUMENT_ERROR). Output is collected in a
 * CheckedArrayByteSink over dest; an overflow of that sink is reported as
 * U_BUFFER_OVERFLOW_ERROR.
 * @return the value of u_terminateChars() for the number of bytes appended,
 *         or 0 if the arguments were rejected.
 */
int32_t
ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
                 char *dest, int32_t destCapacity,
                 const char *src, int32_t srcLength,
                 UTF8CaseMapper *stringCaseMapper,
                 icu::Edits *edits,
                 UErrorCode &errorCode) {
    /* check argument values */
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    if( destCapacity<0 ||
        (dest==NULL && destCapacity>0) ||
        (src==NULL && srcLength!=0) || srcLength<-1
    ) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* get the string length */
    if(srcLength==-1) {
        srcLength=(int32_t)uprv_strlen((const char *)src);
    }

    /* check for overlapping source and destination */
    if( dest!=NULL &&
        ((src>=dest && src<(dest+destCapacity)) ||
         (dest>=src && dest<(src+srcLength)))
    ) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    CheckedArrayByteSink sink(dest, destCapacity);
    if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
        edits->reset();
    }
    stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
                     (const uint8_t *)src, srcLength, sink, edits, errorCode);
    sink.Flush();
    if (U_SUCCESS(errorCode)) {
        if (sink.Overflowed()) {
            errorCode = U_BUFFER_OVERFLOW_ERROR;
        } else if (edits != nullptr) {
            edits->copyErrorTo(errorCode);
        }
    }
    return u_terminateChars(dest, destCapacity, sink.NumberOfBytesAppended(), &errorCode);
}

/* public API functions */

/**
 * C API: lowercases UTF-8 src into dest using the case locale and options
 * stored in csm. No Edits are recorded. csm and pErrorCode are dereferenced
 * without a null check.
 */
U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToLower(const UCaseMap *csm,
                     char *dest, int32_t destCapacity,
                     const char *src, int32_t srcLength,
                     UErrorCode *pErrorCode) {
    return ucasemap_mapUTF8(
        csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
        dest, destCapacity,
        src, srcLength,
        ucasemap_internalUTF8ToLower, NULL, *pErrorCode);
}

/**
 * C API: uppercases UTF-8 src into dest using the case locale and options
 * stored in csm. No Edits are recorded. csm and pErrorCode are dereferenced
 * without a null check.
 */
U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToUpper(const UCaseMap *csm,
                     char *dest, int32_t destCapacity,
                     const char *src, int32_t srcLength,
                     UErrorCode *pErrorCode) {
    return ucasemap_mapUTF8(
        csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
        dest, destCapacity,
        src, srcLength,
        ucasemap_internalUTF8ToUpper, NULL, *pErrorCode);
}

/**
 * C API: case-folds UTF-8 src into dest using the options stored in csm.
 * The case locale passed down is always UCASE_LOC_ROOT (the fold mapper
 * ignores it). No Edits are recorded.
 */
U_CAPI int32_t U_EXPORT2
ucasemap_utf8FoldCase(const UCaseMap *csm,
                      char *dest, int32_t destCapacity,
                      const char *src, int32_t srcLength,
                      UErrorCode *pErrorCode) {
    return ucasemap_mapUTF8(
        UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
        dest, destCapacity,
        src, srcLength,
        ucasemap_internalUTF8Fold, NULL, *pErrorCode);
}

U_NAMESPACE_BEGIN

/** Lowercases src into sink; the case locale is derived from the locale ID via ustrcase_getCaseLocale(). */
void CaseMap::utf8ToLower(
        const char *locale, uint32_t options,
        StringPiece src, ByteSink &sink, Edits *edits,
        UErrorCode &errorCode) {
    ucasemap_mapUTF8(
        ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
        src.data(), src.length(),
        ucasemap_internalUTF8ToLower, sink, edits, errorCode);
}

/** Uppercases src into sink; the case locale is derived from the locale ID via ustrcase_getCaseLocale(). */
void CaseMap::utf8ToUpper(
        const char *locale, uint32_t options,
        StringPiece src, ByteSink &sink, Edits *edits,
        UErrorCode &errorCode) {
    ucasemap_mapUTF8(
        ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
        src.data(), src.length(),
        ucasemap_internalUTF8ToUpper, sink, edits, errorCode);
}

/** Case-folds src into sink (passes UCASE_LOC_ROOT; folding takes no locale). */
void CaseMap::utf8Fold(
        uint32_t options,
        StringPiece src, ByteSink &sink, Edits *edits,
        UErrorCode &errorCode) {
    ucasemap_mapUTF8(
        UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
        src.data(), src.length(),
        ucasemap_internalUTF8Fold, sink, edits, errorCode);
}

/** Buffer overload: lowercases src into dest and returns the result of the buffer-based ucasemap_mapUTF8(). */
int32_t CaseMap::utf8ToLower(
        const char *locale, uint32_t options,
        const char *src, int32_t srcLength,
        char *dest, int32_t destCapacity, Edits *edits,
        UErrorCode &errorCode) {
    return ucasemap_mapUTF8(
        ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
        dest, destCapacity,
        src, srcLength,
        ucasemap_internalUTF8ToLower, edits, errorCode);
}

/** Buffer overload: uppercases src into dest and returns the result of the buffer-based ucasemap_mapUTF8(). */
int32_t CaseMap::utf8ToUpper(
        const char *locale, uint32_t options,
        const char *src, int32_t srcLength,
        char *dest, int32_t destCapacity, Edits *edits,
        UErrorCode &errorCode) {
    return ucasemap_mapUTF8(
        ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
        dest, destCapacity,
        src, srcLength,
        ucasemap_internalUTF8ToUpper, edits, errorCode);
}

/** Buffer overload: case-folds src into dest and returns the result of the buffer-based ucasemap_mapUTF8(). */
int32_t CaseMap::utf8Fold(
        uint32_t options,
        const char *src, int32_t srcLength,
        char *dest, int32_t destCapacity, Edits *edits,
        UErrorCode &errorCode) {
    return ucasemap_mapUTF8(
        UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
        dest, destCapacity,
        src, srcLength,
        ucasemap_internalUTF8Fold, edits, errorCode);
}

U_NAMESPACE_END