1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2005-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: ucasemap.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2005may06 14 * created by: Markus W. Scherer 15 * 16 * Case mapping service object and functions using it. 17 */ 18 19 #include "unicode/utypes.h" 20 #include "unicode/uloc.h" 21 #include "unicode/ustring.h" 22 #include "unicode/ucasemap.h" 23 #if !UCONFIG_NO_BREAK_ITERATION 24 #include "unicode/ubrk.h" 25 #include "unicode/utext.h" 26 #endif 27 #include "cmemory.h" 28 #include "cstring.h" 29 #include "ucase.h" 30 #include "ustr_imp.h" 31 32 /* UCaseMap service object -------------------------------------------------- */ 33 34 U_CAPI UCaseMap * U_EXPORT2 35 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) { 36 UCaseMap *csm; 37 38 if(U_FAILURE(*pErrorCode)) { 39 return NULL; 40 } 41 42 csm=(UCaseMap *)uprv_malloc(sizeof(UCaseMap)); 43 if(csm==NULL) { 44 return NULL; 45 } 46 uprv_memset(csm, 0, sizeof(UCaseMap)); 47 48 csm->csp=ucase_getSingleton(); 49 ucasemap_setLocale(csm, locale, pErrorCode); 50 if(U_FAILURE(*pErrorCode)) { 51 uprv_free(csm); 52 return NULL; 53 } 54 55 csm->options=options; 56 return csm; 57 } 58 59 U_CAPI void U_EXPORT2 60 ucasemap_close(UCaseMap *csm) { 61 if(csm!=NULL) { 62 #if !UCONFIG_NO_BREAK_ITERATION 63 ubrk_close(csm->iter); 64 #endif 65 uprv_free(csm); 66 } 67 } 68 69 U_CAPI const char * U_EXPORT2 70 ucasemap_getLocale(const UCaseMap *csm) { 71 return csm->locale; 72 } 73 74 U_CAPI uint32_t U_EXPORT2 75 ucasemap_getOptions(const UCaseMap *csm) { 76 return csm->options; 77 } 78 79 U_CAPI void U_EXPORT2 80 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) { 81 int32_t length; 82 83 if(U_FAILURE(*pErrorCode)) { 84 return; 85 } 86 87 length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode); 88 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) { 89 *pErrorCode=U_ZERO_ERROR; 90 /* we only really need the language code for case mappings */ 91 length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode); 92 } 93 if(length==sizeof(csm->locale)) { 94 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 95 } 96 csm->locCache=0; 97 if(U_SUCCESS(*pErrorCode)) { 98 ucase_getCaseLocale(csm->locale, &csm->locCache); 99 } else { 100 csm->locale[0]=0; 101 } 102 } 103 104 U_CAPI void U_EXPORT2 105 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) { 106 csm->options=options; 107 } 108 109 #if !UCONFIG_NO_BREAK_ITERATION 110 111 U_CAPI const UBreakIterator * U_EXPORT2 112 ucasemap_getBreakIterator(const UCaseMap *csm) { 113 return csm->iter; 114 } 115 116 U_CAPI void U_EXPORT2 117 ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode *pErrorCode) { 118 ubrk_close(csm->iter); 119 csm->iter=iterToAdopt; 120 } 121 122 #endif 123 124 /* UTF-8 string case mappings ----------------------------------------------- */ 125 126 /* TODO(markus): Move to a new, separate utf8case.c file. */ 127 128 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */ 129 static U_INLINE int32_t 130 appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity, 131 int32_t result, const UChar *s) { 132 UChar32 c; 133 int32_t length, destLength; 134 UErrorCode errorCode; 135 136 /* decode the result */ 137 if(result<0) { 138 /* (not) original code point */ 139 c=~result; 140 length=-1; 141 } else if(result<=UCASE_MAX_STRING_LENGTH) { 142 c=U_SENTINEL; 143 length=result; 144 } else { 145 c=result; 146 length=-1; 147 } 148 149 if(destIndex<destCapacity) { 150 /* append the result */ 151 if(length<0) { 152 /* code point */ 153 UBool isError=FALSE; 154 U8_APPEND(dest, destIndex, destCapacity, c, isError); 155 if(isError) { 156 /* overflow, nothing written */ 157 destIndex+=U8_LENGTH(c); 158 } 159 } else { 160 /* string */ 161 errorCode=U_ZERO_ERROR; 162 u_strToUTF8( 163 (char *)(dest+destIndex), destCapacity-destIndex, &destLength, 164 s, length, 165 &errorCode); 166 destIndex+=destLength; 167 /* we might have an overflow, but we know the actual length */ 168 } 169 } else { 170 /* preflight */ 171 if(length<0) { 172 destIndex+=U8_LENGTH(c); 173 } else { 174 errorCode=U_ZERO_ERROR; 175 u_strToUTF8( 176 NULL, 0, &destLength, 177 s, length, 178 &errorCode); 179 destIndex+=destLength; 180 } 181 } 182 return destIndex; 183 } 184 185 static UChar32 U_CALLCONV 186 utf8_caseContextIterator(void *context, int8_t dir) { 187 UCaseContext *csc=(UCaseContext *)context; 188 UChar32 c; 189 190 if(dir<0) { 191 /* reset for backward iteration */ 192 csc->index=csc->cpStart; 193 csc->dir=dir; 194 } else if(dir>0) { 195 /* reset for forward iteration */ 196 csc->index=csc->cpLimit; 197 csc->dir=dir; 198 } else { 199 /* continue current iteration direction */ 200 dir=csc->dir; 201 } 202 203 if(dir<0) { 204 if(csc->start<csc->index) { 205 U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c); 206 return c; 207 } 208 } else { 209 if(csc->index<csc->limit) { 210 U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c); 211 return c; 212 } 213 } 214 return U_SENTINEL; 215 } 216 217 /* 218 * Case-maps [srcStart..srcLimit[ but takes 219 * context [0..srcLength[ into account. 220 */ 221 static int32_t 222 _caseMap(const UCaseMap *csm, UCaseMapFull *map, 223 uint8_t *dest, int32_t destCapacity, 224 const uint8_t *src, UCaseContext *csc, 225 int32_t srcStart, int32_t srcLimit, 226 UErrorCode *pErrorCode) { 227 const UChar *s; 228 UChar32 c, c2 = 0; 229 int32_t srcIndex, destIndex; 230 int32_t locCache; 231 232 locCache=csm->locCache; 233 234 /* case mapping loop */ 235 srcIndex=srcStart; 236 destIndex=0; 237 while(srcIndex<srcLimit) { 238 csc->cpStart=srcIndex; 239 U8_NEXT(src, srcIndex, srcLimit, c); 240 csc->cpLimit=srcIndex; 241 if(c<0) { 242 int32_t i=csc->cpStart; 243 while(destIndex<destCapacity && i<srcIndex) { 244 dest[destIndex++]=src[i++]; 245 } 246 continue; 247 } 248 c=map(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &locCache); 249 if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) { 250 /* fast path version of appendResult() for ASCII results */ 251 dest[destIndex++]=(uint8_t)c2; 252 } else { 253 destIndex=appendResult(dest, destIndex, destCapacity, c, s); 254 } 255 } 256 257 if(destIndex>destCapacity) { 258 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 259 } 260 return destIndex; 261 } 262 263 #if !UCONFIG_NO_BREAK_ITERATION 264 265 /* 266 * Internal titlecasing function. 267 */ 268 static int32_t 269 _toTitle(UCaseMap *csm, 270 uint8_t *dest, int32_t destCapacity, 271 const uint8_t *src, UCaseContext *csc, 272 int32_t srcLength, 273 UErrorCode *pErrorCode) { 274 UText utext=UTEXT_INITIALIZER; 275 const UChar *s; 276 UChar32 c; 277 int32_t prev, titleStart, titleLimit, idx, destIndex, length; 278 UBool isFirstIndex; 279 280 utext_openUTF8(&utext, (const char *)src, srcLength, pErrorCode); 281 if(U_FAILURE(*pErrorCode)) { 282 return 0; 283 } 284 if(csm->iter==NULL) { 285 csm->iter=ubrk_open(UBRK_WORD, csm->locale, 286 NULL, 0, 287 pErrorCode); 288 } 289 ubrk_setUText(csm->iter, &utext, pErrorCode); 290 if(U_FAILURE(*pErrorCode)) { 291 utext_close(&utext); 292 return 0; 293 } 294 295 /* set up local variables */ 296 destIndex=0; 297 prev=0; 298 isFirstIndex=TRUE; 299 300 /* titlecasing loop */ 301 while(prev<srcLength) { 302 /* find next index where to titlecase */ 303 if(isFirstIndex) { 304 isFirstIndex=FALSE; 305 idx=ubrk_first(csm->iter); 306 } else { 307 idx=ubrk_next(csm->iter); 308 } 309 if(idx==UBRK_DONE || idx>srcLength) { 310 idx=srcLength; 311 } 312 313 /* 314 * Unicode 4 & 5 section 3.13 Default Case Operations: 315 * 316 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex 317 * #29, "Text Boundaries." Between each pair of word boundaries, find the first 318 * cased character F. If F exists, map F to default_title(F); then map each 319 * subsequent character C to default_lower(C). 320 * 321 * In this implementation, segment [prev..index[ into 3 parts: 322 * a) uncased characters (copy as-is) [prev..titleStart[ 323 * b) first case letter (titlecase) [titleStart..titleLimit[ 324 * c) subsequent characters (lowercase) [titleLimit..index[ 325 */ 326 if(prev<idx) { 327 /* find and copy uncased characters [prev..titleStart[ */ 328 titleStart=titleLimit=prev; 329 U8_NEXT(src, titleLimit, idx, c); 330 if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) { 331 /* Adjust the titlecasing index (titleStart) to the next cased character. */ 332 for(;;) { 333 titleStart=titleLimit; 334 if(titleLimit==idx) { 335 /* 336 * only uncased characters in [prev..index[ 337 * stop with titleStart==titleLimit==index 338 */ 339 break; 340 } 341 U8_NEXT(src, titleLimit, idx, c); 342 if(UCASE_NONE!=ucase_getType(csm->csp, c)) { 343 break; /* cased letter at [titleStart..titleLimit[ */ 344 } 345 } 346 length=titleStart-prev; 347 if(length>0) { 348 if((destIndex+length)<=destCapacity) { 349 uprv_memcpy(dest+destIndex, src+prev, length); 350 } 351 destIndex+=length; 352 } 353 } 354 355 if(titleStart<titleLimit) { 356 /* titlecase c which is from [titleStart..titleLimit[ */ 357 csc->cpStart=titleStart; 358 csc->cpLimit=titleLimit; 359 c=ucase_toFullTitle(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &csm->locCache); 360 destIndex=appendResult(dest, destIndex, destCapacity, c, s); 361 362 363 /* Special case Dutch IJ titlecasing */ 364 if ( titleStart+1 < idx && 365 ucase_getCaseLocale(csm->locale,&csm->locCache) == UCASE_LOC_DUTCH && 366 ( src[titleStart] == 0x0049 || src[titleStart] == 0x0069 ) && 367 ( src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A )) { 368 c=0x004A; 369 destIndex=appendResult(dest, destIndex, destCapacity, c, s); 370 titleLimit++; 371 } 372 /* lowercase [titleLimit..index[ */ 373 if(titleLimit<idx) { 374 if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) { 375 /* Normal operation: Lowercase the rest of the word. */ 376 destIndex+= 377 _caseMap( 378 csm, ucase_toFullLower, 379 dest+destIndex, destCapacity-destIndex, 380 src, csc, 381 titleLimit, idx, 382 pErrorCode); 383 } else { 384 /* Optionally just copy the rest of the word unchanged. */ 385 length=idx-titleLimit; 386 if((destIndex+length)<=destCapacity) { 387 uprv_memcpy(dest+destIndex, src+titleLimit, length); 388 } 389 destIndex+=length; 390 } 391 } 392 } 393 } 394 395 prev=idx; 396 } 397 398 if(destIndex>destCapacity) { 399 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 400 } 401 utext_close(&utext); 402 return destIndex; 403 } 404 405 #endif 406 407 static int32_t 408 utf8_foldCase(const UCaseProps *csp, 409 uint8_t *dest, int32_t destCapacity, 410 const uint8_t *src, int32_t srcLength, 411 uint32_t options, 412 UErrorCode *pErrorCode) { 413 int32_t srcIndex, destIndex; 414 415 const UChar *s; 416 UChar32 c, c2; 417 int32_t start; 418 419 /* case mapping loop */ 420 srcIndex=destIndex=0; 421 while(srcIndex<srcLength) { 422 start=srcIndex; 423 U8_NEXT(src, srcIndex, srcLength, c); 424 if(c<0) { 425 while(destIndex<destCapacity && start<srcIndex) { 426 dest[destIndex++]=src[start++]; 427 } 428 continue; 429 } 430 c=ucase_toFullFolding(csp, c, &s, options); 431 if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) { 432 /* fast path version of appendResult() for ASCII results */ 433 dest[destIndex++]=(uint8_t)c2; 434 } else { 435 destIndex=appendResult(dest, destIndex, destCapacity, c, s); 436 } 437 } 438 439 if(destIndex>destCapacity) { 440 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 441 } 442 return destIndex; 443 } 444 445 /* 446 * Implement argument checking and buffer handling 447 * for string case mapping as a common function. 448 */ 449 450 /* common internal function for public API functions */ 451 452 static int32_t 453 caseMap(const UCaseMap *csm, 454 uint8_t *dest, int32_t destCapacity, 455 const uint8_t *src, int32_t srcLength, 456 int32_t toWhichCase, 457 UErrorCode *pErrorCode) { 458 int32_t destLength; 459 460 /* check argument values */ 461 if(U_FAILURE(*pErrorCode)) { 462 return 0; 463 } 464 if( destCapacity<0 || 465 (dest==NULL && destCapacity>0) || 466 src==NULL || 467 srcLength<-1 468 ) { 469 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 470 return 0; 471 } 472 473 /* get the string length */ 474 if(srcLength==-1) { 475 srcLength=(int32_t)uprv_strlen((const char *)src); 476 } 477 478 /* check for overlapping source and destination */ 479 if( dest!=NULL && 480 ((src>=dest && src<(dest+destCapacity)) || 481 (dest>=src && dest<(src+srcLength))) 482 ) { 483 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 484 return 0; 485 } 486 487 destLength=0; 488 489 if(toWhichCase==FOLD_CASE) { 490 destLength=utf8_foldCase(csm->csp, dest, destCapacity, src, srcLength, 491 csm->options, pErrorCode); 492 } else { 493 UCaseContext csc={ NULL }; 494 495 csc.p=(void *)src; 496 csc.limit=srcLength; 497 498 if(toWhichCase==TO_LOWER) { 499 destLength=_caseMap(csm, ucase_toFullLower, 500 dest, destCapacity, 501 src, &csc, 502 0, srcLength, 503 pErrorCode); 504 } else if(toWhichCase==TO_UPPER) { 505 destLength=_caseMap(csm, ucase_toFullUpper, 506 dest, destCapacity, 507 src, &csc, 508 0, srcLength, 509 pErrorCode); 510 } else /* if(toWhichCase==TO_TITLE) */ { 511 #if UCONFIG_NO_BREAK_ITERATION 512 *pErrorCode=U_UNSUPPORTED_ERROR; 513 #else 514 /* UCaseMap is actually non-const in toTitle() APIs. */ 515 UCaseMap *tmp = (UCaseMap *)csm; 516 destLength=_toTitle(tmp, dest, destCapacity, 517 src, &csc, srcLength, 518 pErrorCode); 519 #endif 520 } 521 } 522 523 return u_terminateChars((char *)dest, destCapacity, destLength, pErrorCode); 524 } 525 526 /* public API functions */ 527 528 U_CAPI int32_t U_EXPORT2 529 ucasemap_utf8ToLower(const UCaseMap *csm, 530 char *dest, int32_t destCapacity, 531 const char *src, int32_t srcLength, 532 UErrorCode *pErrorCode) { 533 return caseMap(csm, 534 (uint8_t *)dest, destCapacity, 535 (const uint8_t *)src, srcLength, 536 TO_LOWER, pErrorCode); 537 } 538 539 U_CAPI int32_t U_EXPORT2 540 ucasemap_utf8ToUpper(const UCaseMap *csm, 541 char *dest, int32_t destCapacity, 542 const char *src, int32_t srcLength, 543 UErrorCode *pErrorCode) { 544 return caseMap(csm, 545 (uint8_t *)dest, destCapacity, 546 (const uint8_t *)src, srcLength, 547 TO_UPPER, pErrorCode); 548 } 549 550 #if !UCONFIG_NO_BREAK_ITERATION 551 552 U_CAPI int32_t U_EXPORT2 553 ucasemap_utf8ToTitle(UCaseMap *csm, 554 char *dest, int32_t destCapacity, 555 const char *src, int32_t srcLength, 556 UErrorCode *pErrorCode) { 557 return caseMap(csm, 558 (uint8_t *)dest, destCapacity, 559 (const uint8_t *)src, srcLength, 560 TO_TITLE, pErrorCode); 561 } 562 563 #endif 564 565 U_CAPI int32_t U_EXPORT2 566 ucasemap_utf8FoldCase(const UCaseMap *csm, 567 char *dest, int32_t destCapacity, 568 const char *src, int32_t srcLength, 569 UErrorCode *pErrorCode) { 570 return caseMap(csm, 571 (uint8_t *)dest, destCapacity, 572 (const uint8_t *)src, srcLength, 573 FOLD_CASE, pErrorCode); 574 } 575