1 /* 2 ******************************************************************************** 3 * Copyright (C) 1996-2008, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************** 6 * 7 * File UCHAR.C 8 * 9 * Modification History: 10 * 11 * Date Name Description 12 * 04/02/97 aliu Creation. 13 * 4/15/99 Madhu Updated all the function definitions for C Implementation 14 * 5/20/99 Madhu Added the function u_getVersion() 15 * 8/19/1999 srl Upgraded scripts to Unicode3.0 16 * 11/11/1999 weiv added u_isalnum(), cleaned comments 17 * 01/11/2000 helena Renamed u_getVersion to u_getUnicodeVersion. 18 * 06/20/2000 helena OS/400 port changes; mostly typecast. 19 ****************************************************************************** 20 */ 21 22 #include "unicode/utypes.h" 23 #include "unicode/uchar.h" 24 #include "unicode/uscript.h" 25 #include "unicode/udata.h" 26 #include "umutex.h" 27 #include "cmemory.h" 28 #include "ucln_cmn.h" 29 #include "utrie2.h" 30 #include "udataswp.h" 31 #include "unormimp.h" /* JAMO_L_BASE etc. */ 32 #include "uprops.h" 33 34 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 35 36 /* dynamically loaded Unicode character properties -------------------------- */ 37 38 #define UCHAR_HARDCODE_DATA 1 39 40 #if UCHAR_HARDCODE_DATA 41 42 /* uchar_props_data.c is machine-generated by genprops --csource */ 43 #include "uchar_props_data.c" 44 45 #else 46 47 /* 48 * loaded uprops.dat - 49 * for a description of the file format, see icu/source/tools/genprops/store.c 50 */ 51 static const char DATA_NAME[] = "uprops"; 52 static const char DATA_TYPE[] = "icu"; 53 54 static UDataMemory *propsData=NULL; 55 static UErrorCode dataErrorCode=U_ZERO_ERROR; 56 57 static uint8_t formatVersion[4]={ 0, 0, 0, 0 }; 58 static UVersionInfo dataVersion={ 0, 0, 0, 0 }; 59 60 static UTrie propsTrie={ 0 }, propsVectorsTrie={ 0 }; 61 static const uint32_t *pData32=NULL, *propsVectors=NULL; 62 static int32_t countPropsVectors=0, propsVectorsColumns=0; 63 64 static int8_t havePropsData=0; /* == 0 -> Data has not been loaded. 65 * < 0 -> Error occured attempting to load data. 66 * > 0 -> Data has been successfully loaded. 67 */ 68 69 /* index values loaded from uprops.dat */ 70 static int32_t indexes[UPROPS_INDEX_COUNT]; 71 72 static UBool U_CALLCONV 73 isAcceptable(void *context, 74 const char *type, const char *name, 75 const UDataInfo *pInfo) { 76 if( 77 pInfo->size>=20 && 78 pInfo->isBigEndian==U_IS_BIG_ENDIAN && 79 pInfo->charsetFamily==U_CHARSET_FAMILY && 80 pInfo->dataFormat[0]==0x55 && /* dataFormat="UPro" */ 81 pInfo->dataFormat[1]==0x50 && 82 pInfo->dataFormat[2]==0x72 && 83 pInfo->dataFormat[3]==0x6f && 84 pInfo->formatVersion[0]==4 && 85 pInfo->formatVersion[2]==UTRIE_SHIFT && 86 pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT 87 ) { 88 uprv_memcpy(formatVersion, pInfo->formatVersion, 4); 89 uprv_memcpy(dataVersion, pInfo->dataVersion, 4); 90 return TRUE; 91 } else { 92 return FALSE; 93 } 94 } 95 96 static UBool U_CALLCONV uchar_cleanup(void) 97 { 98 if (propsData) { 99 udata_close(propsData); 100 propsData=NULL; 101 } 102 pData32=NULL; 103 propsVectors=NULL; 104 countPropsVectors=0; 105 uprv_memset(dataVersion, 0, U_MAX_VERSION_LENGTH); 106 dataErrorCode=U_ZERO_ERROR; 107 havePropsData=0; 108 109 return TRUE; 110 } 111 112 struct UCharProps { 113 UDataMemory *propsData; 114 UTrie propsTrie, propsVectorsTrie; 115 const uint32_t *pData32; 116 }; 117 typedef struct UCharProps UCharProps; 118 119 /* open uprops.icu */ 120 static void 121 _openProps(UCharProps *ucp, UErrorCode *pErrorCode) { 122 const uint32_t *p; 123 int32_t length; 124 125 ucp->propsData=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode); 126 if(U_FAILURE(*pErrorCode)) { 127 return; 128 } 129 130 ucp->pData32=p=(const uint32_t *)udata_getMemory(ucp->propsData); 131 132 /* unserialize the trie; it is directly after the int32_t indexes[UPROPS_INDEX_COUNT] */ 133 length=(int32_t)p[UPROPS_PROPS32_INDEX]*4; 134 length=utrie_unserialize(&ucp->propsTrie, (const uint8_t *)(p+UPROPS_INDEX_COUNT), length-64, pErrorCode); 135 if(U_FAILURE(*pErrorCode)) { 136 return; 137 } 138 139 /* unserialize the properties vectors trie */ 140 length=(int32_t)(p[UPROPS_ADDITIONAL_VECTORS_INDEX]-p[UPROPS_ADDITIONAL_TRIE_INDEX])*4; 141 if(length>0) { 142 length=utrie_unserialize(&ucp->propsVectorsTrie, (const uint8_t *)(p+p[UPROPS_ADDITIONAL_TRIE_INDEX]), length, pErrorCode); 143 } 144 if(length<=0 || U_FAILURE(*pErrorCode)) { 145 /* 146 * length==0: 147 * Allow the properties vectors trie to be missing - 148 * also requires propsVectorsColumns=indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX] 149 * to be zero so that this trie is never accessed. 150 */ 151 uprv_memset(&ucp->propsVectorsTrie, 0, sizeof(ucp->propsVectorsTrie)); 152 } 153 } 154 155 #endif 156 157 #if !UCHAR_HARDCODE_DATA 158 static int8_t 159 uprv_loadPropsData(UErrorCode *pErrorCode) { 160 /* load Unicode character properties data from file if necessary */ 161 162 /* 163 * This lazy intialization with double-checked locking (without mutex protection for 164 * haveNormData==0) is transiently unsafe under certain circumstances. 165 * Check the readme and use u_init() if necessary. 166 */ 167 if(havePropsData==0) { 168 UCharProps ucp={ NULL }; 169 170 if(U_FAILURE(*pErrorCode)) { 171 return havePropsData; 172 } 173 174 /* open the data outside the mutex block */ 175 _openProps(&ucp, pErrorCode); 176 177 if(U_SUCCESS(*pErrorCode)) { 178 /* in the mutex block, set the data for this process */ 179 umtx_lock(NULL); 180 if(propsData==NULL) { 181 propsData=ucp.propsData; 182 ucp.propsData=NULL; 183 pData32=ucp.pData32; 184 ucp.pData32=NULL; 185 uprv_memcpy(&propsTrie, &ucp.propsTrie, sizeof(propsTrie)); 186 uprv_memcpy(&propsVectorsTrie, &ucp.propsVectorsTrie, sizeof(propsVectorsTrie)); 187 } 188 189 /* initialize some variables */ 190 uprv_memcpy(indexes, pData32, sizeof(indexes)); 191 192 /* additional properties */ 193 if(indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]!=0) { 194 propsVectors=pData32+indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]; 195 countPropsVectors=indexes[UPROPS_RESERVED_INDEX]-indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]; 196 propsVectorsColumns=indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]; 197 } 198 199 havePropsData=1; 200 umtx_unlock(NULL); 201 } else { 202 dataErrorCode=*pErrorCode; 203 havePropsData=-1; 204 } 205 ucln_common_registerCleanup(UCLN_COMMON_UCHAR, uchar_cleanup); 206 207 /* if a different thread set it first, then close the extra data */ 208 udata_close(ucp.propsData); /* NULL if it was set correctly */ 209 } 210 211 return havePropsData; 212 } 213 214 static int8_t 215 loadPropsData(void) { 216 UErrorCode errorCode = U_ZERO_ERROR; 217 int8_t retVal = uprv_loadPropsData(&errorCode); 218 return retVal; 219 } 220 221 #endif 222 223 /* constants and macros for access to the data ------------------------------ */ 224 225 /* getting a uint32_t properties word from the data */ 226 #if UCHAR_HARDCODE_DATA 227 228 #define GET_PROPS(c, result) ((result)=UTRIE2_GET16(&propsTrie, c)); 229 230 #else 231 232 #define HAVE_DATA (havePropsData>0 || loadPropsData()>0) 233 #define GET_PROPS_UNSAFE(c, result) \ 234 UTRIE_GET16(&propsTrie, c, result); 235 #define GET_PROPS(c, result) \ 236 if(HAVE_DATA) { \ 237 GET_PROPS_UNSAFE(c, result); \ 238 } else { \ 239 (result)=0; \ 240 } 241 242 #endif 243 244 U_CFUNC UBool 245 uprv_haveProperties(UErrorCode *pErrorCode) { 246 if(U_FAILURE(*pErrorCode)) { 247 return FALSE; 248 } 249 #if !UCHAR_HARDCODE_DATA 250 if(havePropsData==0) { 251 uprv_loadPropsData(pErrorCode); 252 } 253 if(havePropsData<0) { 254 *pErrorCode=dataErrorCode; 255 return FALSE; 256 } 257 #endif 258 return TRUE; 259 } 260 261 /* API functions ------------------------------------------------------------ */ 262 263 /* Gets the Unicode character's general category.*/ 264 U_CAPI int8_t U_EXPORT2 265 u_charType(UChar32 c) { 266 uint32_t props; 267 GET_PROPS(c, props); 268 return (int8_t)GET_CATEGORY(props); 269 } 270 271 /* Enumerate all code points with their general categories. */ 272 struct _EnumTypeCallback { 273 UCharEnumTypeRange *enumRange; 274 const void *context; 275 }; 276 277 static uint32_t U_CALLCONV 278 _enumTypeValue(const void *context, uint32_t value) { 279 return GET_CATEGORY(value); 280 } 281 282 static UBool U_CALLCONV 283 _enumTypeRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { 284 /* just cast the value to UCharCategory */ 285 return ((struct _EnumTypeCallback *)context)-> 286 enumRange(((struct _EnumTypeCallback *)context)->context, 287 start, end+1, (UCharCategory)value); 288 } 289 290 U_CAPI void U_EXPORT2 291 u_enumCharTypes(UCharEnumTypeRange *enumRange, const void *context) { 292 struct _EnumTypeCallback callback; 293 294 if(enumRange==NULL 295 #if !UCHAR_HARDCODE_DATA 296 || !HAVE_DATA 297 #endif 298 ) { 299 return; 300 } 301 302 callback.enumRange=enumRange; 303 callback.context=context; 304 utrie2_enum(&propsTrie, _enumTypeValue, _enumTypeRange, &callback); 305 } 306 307 /* Checks if ch is a lower case letter.*/ 308 U_CAPI UBool U_EXPORT2 309 u_islower(UChar32 c) { 310 uint32_t props; 311 GET_PROPS(c, props); 312 return (UBool)(GET_CATEGORY(props)==U_LOWERCASE_LETTER); 313 } 314 315 /* Checks if ch is an upper case letter.*/ 316 U_CAPI UBool U_EXPORT2 317 u_isupper(UChar32 c) { 318 uint32_t props; 319 GET_PROPS(c, props); 320 return (UBool)(GET_CATEGORY(props)==U_UPPERCASE_LETTER); 321 } 322 323 /* Checks if ch is a title case letter; usually upper case letters.*/ 324 U_CAPI UBool U_EXPORT2 325 u_istitle(UChar32 c) { 326 uint32_t props; 327 GET_PROPS(c, props); 328 return (UBool)(GET_CATEGORY(props)==U_TITLECASE_LETTER); 329 } 330 331 /* Checks if ch is a decimal digit. */ 332 U_CAPI UBool U_EXPORT2 333 u_isdigit(UChar32 c) { 334 uint32_t props; 335 GET_PROPS(c, props); 336 return (UBool)(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER); 337 } 338 339 U_CAPI UBool U_EXPORT2 340 u_isxdigit(UChar32 c) { 341 uint32_t props; 342 343 /* check ASCII and Fullwidth ASCII a-fA-F */ 344 if( 345 (c<=0x66 && c>=0x41 && (c<=0x46 || c>=0x61)) || 346 (c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41)) 347 ) { 348 return TRUE; 349 } 350 351 GET_PROPS(c, props); 352 return (UBool)(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER); 353 } 354 355 /* Checks if the Unicode character is a letter.*/ 356 U_CAPI UBool U_EXPORT2 357 u_isalpha(UChar32 c) { 358 uint32_t props; 359 GET_PROPS(c, props); 360 return (UBool)((CAT_MASK(props)&U_GC_L_MASK)!=0); 361 } 362 363 U_CAPI UBool U_EXPORT2 364 u_isUAlphabetic(UChar32 c) { 365 return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_ALPHABETIC))!=0; 366 } 367 368 /* Checks if c is a letter or a decimal digit */ 369 U_CAPI UBool U_EXPORT2 370 u_isalnum(UChar32 c) { 371 uint32_t props; 372 GET_PROPS(c, props); 373 return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_ND_MASK))!=0); 374 } 375 376 /** 377 * Checks if c is alphabetic, or a decimal digit; implements UCHAR_POSIX_ALNUM. 378 * @internal 379 */ 380 U_CFUNC UBool 381 u_isalnumPOSIX(UChar32 c) { 382 return (UBool)(u_isUAlphabetic(c) || u_isdigit(c)); 383 } 384 385 /* Checks if ch is a unicode character with assigned character type.*/ 386 U_CAPI UBool U_EXPORT2 387 u_isdefined(UChar32 c) { 388 uint32_t props; 389 GET_PROPS(c, props); 390 return (UBool)(GET_CATEGORY(props)!=0); 391 } 392 393 /* Checks if the Unicode character is a base form character that can take a diacritic.*/ 394 U_CAPI UBool U_EXPORT2 395 u_isbase(UChar32 c) { 396 uint32_t props; 397 GET_PROPS(c, props); 398 return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_N_MASK|U_GC_MC_MASK|U_GC_ME_MASK))!=0); 399 } 400 401 /* Checks if the Unicode character is a control character.*/ 402 U_CAPI UBool U_EXPORT2 403 u_iscntrl(UChar32 c) { 404 uint32_t props; 405 GET_PROPS(c, props); 406 return (UBool)((CAT_MASK(props)&(U_GC_CC_MASK|U_GC_CF_MASK|U_GC_ZL_MASK|U_GC_ZP_MASK))!=0); 407 } 408 409 U_CAPI UBool U_EXPORT2 410 u_isISOControl(UChar32 c) { 411 return (uint32_t)c<=0x9f && (c<=0x1f || c>=0x7f); 412 } 413 414 /* Some control characters that are used as space. */ 415 #define IS_THAT_CONTROL_SPACE(c) \ 416 (c<=0x9f && ((c>=TAB && c<=CR) || (c>=0x1c && c <=0x1f) || c==NL)) 417 418 /* Checks if the Unicode character is a space character.*/ 419 U_CAPI UBool U_EXPORT2 420 u_isspace(UChar32 c) { 421 uint32_t props; 422 GET_PROPS(c, props); 423 return (UBool)((CAT_MASK(props)&U_GC_Z_MASK)!=0 || IS_THAT_CONTROL_SPACE(c)); 424 } 425 426 U_CAPI UBool U_EXPORT2 427 u_isJavaSpaceChar(UChar32 c) { 428 uint32_t props; 429 GET_PROPS(c, props); 430 return (UBool)((CAT_MASK(props)&U_GC_Z_MASK)!=0); 431 } 432 433 /* Checks if the Unicode character is a whitespace character.*/ 434 U_CAPI UBool U_EXPORT2 435 u_isWhitespace(UChar32 c) { 436 uint32_t props; 437 GET_PROPS(c, props); 438 return (UBool)( 439 ((CAT_MASK(props)&U_GC_Z_MASK)!=0 && 440 c!=NBSP && c!=FIGURESP && c!=NNBSP) || /* exclude no-break spaces */ 441 IS_THAT_CONTROL_SPACE(c) 442 ); 443 } 444 445 U_CAPI UBool U_EXPORT2 446 u_isblank(UChar32 c) { 447 if((uint32_t)c<=0x9f) { 448 return c==9 || c==0x20; /* TAB or SPACE */ 449 } else { 450 /* Zs */ 451 uint32_t props; 452 GET_PROPS(c, props); 453 return (UBool)(GET_CATEGORY(props)==U_SPACE_SEPARATOR); 454 } 455 } 456 457 U_CAPI UBool U_EXPORT2 458 u_isUWhiteSpace(UChar32 c) { 459 return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_WHITE_SPACE))!=0; 460 } 461 462 /* Checks if the Unicode character is printable.*/ 463 U_CAPI UBool U_EXPORT2 464 u_isprint(UChar32 c) { 465 uint32_t props; 466 GET_PROPS(c, props); 467 /* comparing ==0 returns FALSE for the categories mentioned */ 468 return (UBool)((CAT_MASK(props)&U_GC_C_MASK)==0); 469 } 470 471 /** 472 * Checks if c is in \p{graph}\p{blank} - \p{cntrl}. 473 * Implements UCHAR_POSIX_PRINT. 474 * @internal 475 */ 476 U_CFUNC UBool 477 u_isprintPOSIX(UChar32 c) { 478 uint32_t props; 479 GET_PROPS(c, props); 480 /* 481 * The only cntrl character in graph+blank is TAB (in blank). 482 * Here we implement (blank-TAB)=Zs instead of calling u_isblank(). 483 */ 484 return (UBool)((GET_CATEGORY(props)==U_SPACE_SEPARATOR) || u_isgraphPOSIX(c)); 485 } 486 487 U_CAPI UBool U_EXPORT2 488 u_isgraph(UChar32 c) { 489 uint32_t props; 490 GET_PROPS(c, props); 491 /* comparing ==0 returns FALSE for the categories mentioned */ 492 return (UBool)((CAT_MASK(props)& 493 (U_GC_CC_MASK|U_GC_CF_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK)) 494 ==0); 495 } 496 497 /** 498 * Checks if c is in 499 * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}] 500 * with space=\p{Whitespace} and Control=Cc. 501 * Implements UCHAR_POSIX_GRAPH. 502 * @internal 503 */ 504 U_CFUNC UBool 505 u_isgraphPOSIX(UChar32 c) { 506 uint32_t props; 507 GET_PROPS(c, props); 508 /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */ 509 /* comparing ==0 returns FALSE for the categories mentioned */ 510 return (UBool)((CAT_MASK(props)& 511 (U_GC_CC_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK)) 512 ==0); 513 } 514 515 U_CAPI UBool U_EXPORT2 516 u_ispunct(UChar32 c) { 517 uint32_t props; 518 GET_PROPS(c, props); 519 return (UBool)((CAT_MASK(props)&U_GC_P_MASK)!=0); 520 } 521 522 /* Checks if the Unicode character can start a Unicode identifier.*/ 523 U_CAPI UBool U_EXPORT2 524 u_isIDStart(UChar32 c) { 525 /* same as u_isalpha() */ 526 uint32_t props; 527 GET_PROPS(c, props); 528 return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_NL_MASK))!=0); 529 } 530 531 /* Checks if the Unicode character can be a Unicode identifier part other than starting the 532 identifier.*/ 533 U_CAPI UBool U_EXPORT2 534 u_isIDPart(UChar32 c) { 535 uint32_t props; 536 GET_PROPS(c, props); 537 return (UBool)( 538 (CAT_MASK(props)& 539 (U_GC_ND_MASK|U_GC_NL_MASK| 540 U_GC_L_MASK| 541 U_GC_PC_MASK|U_GC_MC_MASK|U_GC_MN_MASK) 542 )!=0 || 543 u_isIDIgnorable(c)); 544 } 545 546 /*Checks if the Unicode character can be ignorable in a Java or Unicode identifier.*/ 547 U_CAPI UBool U_EXPORT2 548 u_isIDIgnorable(UChar32 c) { 549 if(c<=0x9f) { 550 return u_isISOControl(c) && !IS_THAT_CONTROL_SPACE(c); 551 } else { 552 uint32_t props; 553 GET_PROPS(c, props); 554 return (UBool)(GET_CATEGORY(props)==U_FORMAT_CHAR); 555 } 556 } 557 558 /*Checks if the Unicode character can start a Java identifier.*/ 559 U_CAPI UBool U_EXPORT2 560 u_isJavaIDStart(UChar32 c) { 561 uint32_t props; 562 GET_PROPS(c, props); 563 return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_SC_MASK|U_GC_PC_MASK))!=0); 564 } 565 566 /*Checks if the Unicode character can be a Java identifier part other than starting the 567 * identifier. 568 */ 569 U_CAPI UBool U_EXPORT2 570 u_isJavaIDPart(UChar32 c) { 571 uint32_t props; 572 GET_PROPS(c, props); 573 return (UBool)( 574 (CAT_MASK(props)& 575 (U_GC_ND_MASK|U_GC_NL_MASK| 576 U_GC_L_MASK| 577 U_GC_SC_MASK|U_GC_PC_MASK| 578 U_GC_MC_MASK|U_GC_MN_MASK) 579 )!=0 || 580 u_isIDIgnorable(c)); 581 } 582 583 U_CAPI int32_t U_EXPORT2 584 u_charDigitValue(UChar32 c) { 585 uint32_t props; 586 GET_PROPS(c, props); 587 588 if(GET_NUMERIC_TYPE(props)==1) { 589 return GET_NUMERIC_VALUE(props); 590 } else { 591 return -1; 592 } 593 } 594 595 U_CAPI double U_EXPORT2 596 u_getNumericValue(UChar32 c) { 597 uint32_t props, numericType, numericValue; 598 GET_PROPS(c, props); 599 numericType=GET_NUMERIC_TYPE(props); 600 601 if(numericType==0 || numericType>=UPROPS_NT_COUNT) { 602 return U_NO_NUMERIC_VALUE; 603 } 604 605 numericValue=GET_NUMERIC_VALUE(props); 606 607 if(numericType<U_NT_COUNT) { 608 /* normal type, the value is stored directly */ 609 return numericValue; 610 } else if(numericType==UPROPS_NT_FRACTION) { 611 /* fraction value */ 612 int32_t numerator; 613 uint32_t denominator; 614 615 numerator=(int32_t)numericValue>>UPROPS_FRACTION_NUM_SHIFT; 616 denominator=(numericValue&UPROPS_FRACTION_DEN_MASK)+UPROPS_FRACTION_DEN_OFFSET; 617 618 if(numerator==0) { 619 numerator=-1; 620 } 621 return (double)numerator/(double)denominator; 622 } else /* numericType==UPROPS_NT_LARGE */ { 623 /* large value with exponent */ 624 double numValue; 625 int32_t mant, exp; 626 627 mant=(int32_t)numericValue>>UPROPS_LARGE_MANT_SHIFT; 628 exp=(int32_t)numericValue&UPROPS_LARGE_EXP_MASK; 629 if(mant==0) { 630 mant=1; 631 exp+=UPROPS_LARGE_EXP_OFFSET_EXTRA; 632 } else if(mant>9) { 633 return U_NO_NUMERIC_VALUE; /* reserved mantissa value */ 634 } else { 635 exp+=UPROPS_LARGE_EXP_OFFSET; 636 } 637 638 numValue=mant; 639 640 /* multiply by 10^exp without math.h */ 641 while(exp>=4) { 642 numValue*=10000.; 643 exp-=4; 644 } 645 switch(exp) { 646 case 3: 647 numValue*=1000.; 648 break; 649 case 2: 650 numValue*=100.; 651 break; 652 case 1: 653 numValue*=10.; 654 break; 655 case 0: 656 default: 657 break; 658 } 659 660 return numValue; 661 } 662 } 663 664 /* ICU 3.4: bidi/shaping properties moved to ubidi_props.c */ 665 666 /* ICU 2.1: u_getCombiningClass() moved to unorm.cpp */ 667 668 U_CAPI int32_t U_EXPORT2 669 u_digit(UChar32 ch, int8_t radix) { 670 int8_t value; 671 if((uint8_t)(radix-2)<=(36-2)) { 672 value=(int8_t)u_charDigitValue(ch); 673 if(value<0) { 674 /* ch is not a decimal digit, try latin letters */ 675 if(ch>=0x61 && ch<=0x7A) { 676 value=(int8_t)(ch-0x57); /* ch - 'a' + 10 */ 677 } else if(ch>=0x41 && ch<=0x5A) { 678 value=(int8_t)(ch-0x37); /* ch - 'A' + 10 */ 679 } else if(ch>=0xFF41 && ch<=0xFF5A) { 680 value=(int8_t)(ch-0xFF37); /* fullwidth ASCII a-z */ 681 } else if(ch>=0xFF21 && ch<=0xFF3A) { 682 value=(int8_t)(ch-0xFF17); /* fullwidth ASCII A-Z */ 683 } 684 } 685 } else { 686 value=-1; /* invalid radix */ 687 } 688 return (int8_t)((value<radix) ? value : -1); 689 } 690 691 U_CAPI UChar32 U_EXPORT2 692 u_forDigit(int32_t digit, int8_t radix) { 693 if((uint8_t)(radix-2)>(36-2) || (uint32_t)digit>=(uint32_t)radix) { 694 return 0; 695 } else if(digit<10) { 696 return (UChar32)(0x30+digit); 697 } else { 698 return (UChar32)((0x61-10)+digit); 699 } 700 } 701 702 /* miscellaneous, and support for uprops.c ---------------------------------- */ 703 704 U_CAPI void U_EXPORT2 705 u_getUnicodeVersion(UVersionInfo versionArray) { 706 if(versionArray!=NULL) { 707 uprv_memcpy(versionArray, dataVersion, U_MAX_VERSION_LENGTH); 708 } 709 } 710 711 U_CFUNC uint32_t 712 u_getUnicodeProperties(UChar32 c, int32_t column) { 713 uint16_t vecIndex; 714 715 if(column==-1) { 716 uint32_t props; 717 GET_PROPS(c, props); 718 return props; 719 } else if( 720 #if !UCHAR_HARDCODE_DATA 721 !HAVE_DATA || countPropsVectors==0 || 722 #endif 723 column<0 || column>=propsVectorsColumns 724 ) { 725 return 0; 726 } else { 727 vecIndex=UTRIE2_GET16(&propsVectorsTrie, c); 728 return propsVectors[vecIndex+column]; 729 } 730 } 731 732 U_CFUNC int32_t 733 uprv_getMaxValues(int32_t column) { 734 #if !UCHAR_HARDCODE_DATA 735 if(HAVE_DATA) { 736 #endif 737 switch(column) { 738 case 0: 739 return indexes[UPROPS_MAX_VALUES_INDEX]; 740 case 2: 741 return indexes[UPROPS_MAX_VALUES_2_INDEX]; 742 default: 743 return 0; 744 } 745 #if !UCHAR_HARDCODE_DATA 746 } else { 747 return 0; 748 } 749 #endif 750 } 751 752 /* 753 * get Hangul Syllable Type 754 * implemented here so that uchar.c (uhst_addPropertyStarts()) 755 * does not depend on uprops.c (u_getIntPropertyValue(c, UCHAR_HANGUL_SYLLABLE_TYPE)) 756 */ 757 U_CFUNC UHangulSyllableType 758 uchar_getHST(UChar32 c) { 759 /* purely algorithmic; hardcode known characters, check for assigned new ones */ 760 if(c<JAMO_L_BASE) { 761 /* U_HST_NOT_APPLICABLE */ 762 } else if(c<=0x11ff) { 763 /* Jamo range */ 764 if(c<=0x115f) { 765 /* Jamo L range, HANGUL CHOSEONG ... */ 766 if(c==0x115f || c<=0x1159 || u_charType(c)==U_OTHER_LETTER) { 767 return U_HST_LEADING_JAMO; 768 } 769 } else if(c<=0x11a7) { 770 /* Jamo V range, HANGUL JUNGSEONG ... */ 771 if(c<=0x11a2 || u_charType(c)==U_OTHER_LETTER) { 772 return U_HST_VOWEL_JAMO; 773 } 774 } else { 775 /* Jamo T range */ 776 if(c<=0x11f9 || u_charType(c)==U_OTHER_LETTER) { 777 return U_HST_TRAILING_JAMO; 778 } 779 } 780 } else if((c-=HANGUL_BASE)<0) { 781 /* U_HST_NOT_APPLICABLE */ 782 } else if(c<HANGUL_COUNT) { 783 /* Hangul syllable */ 784 return c%JAMO_T_COUNT==0 ? U_HST_LV_SYLLABLE : U_HST_LVT_SYLLABLE; 785 } 786 return U_HST_NOT_APPLICABLE; 787 } 788 789 U_CAPI void U_EXPORT2 790 u_charAge(UChar32 c, UVersionInfo versionArray) { 791 if(versionArray!=NULL) { 792 uint32_t version=u_getUnicodeProperties(c, 0)>>UPROPS_AGE_SHIFT; 793 versionArray[0]=(uint8_t)(version>>4); 794 versionArray[1]=(uint8_t)(version&0xf); 795 versionArray[2]=versionArray[3]=0; 796 } 797 } 798 799 U_CAPI UScriptCode U_EXPORT2 800 uscript_getScript(UChar32 c, UErrorCode *pErrorCode) { 801 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 802 return USCRIPT_INVALID_CODE; 803 } 804 if((uint32_t)c>0x10ffff) { 805 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 806 return USCRIPT_INVALID_CODE; 807 } 808 809 return (UScriptCode)(u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_MASK); 810 } 811 812 U_CAPI UBlockCode U_EXPORT2 813 ublock_getCode(UChar32 c) { 814 return (UBlockCode)((u_getUnicodeProperties(c, 0)&UPROPS_BLOCK_MASK)>>UPROPS_BLOCK_SHIFT); 815 } 816 817 /* property starts for UnicodeSet ------------------------------------------- */ 818 819 /* for Hangul_Syllable_Type */ 820 U_CFUNC void U_EXPORT2 821 uhst_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { 822 UChar32 c; 823 int32_t value, value2; 824 825 if(U_FAILURE(*pErrorCode)) { 826 return; 827 } 828 829 #if !UCHAR_HARDCODE_DATA 830 if(!HAVE_DATA) { 831 *pErrorCode=dataErrorCode; 832 return; 833 } 834 #endif 835 836 /* add code points with hardcoded properties, plus the ones following them */ 837 838 /* 839 * Add Jamo type boundaries for UCHAR_HANGUL_SYLLABLE_TYPE. 840 * First, we add fixed boundaries for the blocks of Jamos. 841 * Then we check in loops to see where the current Unicode version 842 * actually stops assigning such Jamos. We start each loop 843 * at the end of the per-Jamo-block assignments in Unicode 4 or earlier. 844 * (These have not changed since Unicode 2.) 845 */ 846 sa->add(sa->set, 0x1100); 847 value=U_HST_LEADING_JAMO; 848 for(c=0x115a; c<=0x115f; ++c) { 849 value2=uchar_getHST(c); 850 if(value!=value2) { 851 value=value2; 852 sa->add(sa->set, c); 853 } 854 } 855 856 sa->add(sa->set, 0x1160); 857 value=U_HST_VOWEL_JAMO; 858 for(c=0x11a3; c<=0x11a7; ++c) { 859 value2=uchar_getHST(c); 860 if(value!=value2) { 861 value=value2; 862 sa->add(sa->set, c); 863 } 864 } 865 866 sa->add(sa->set, 0x11a8); 867 value=U_HST_TRAILING_JAMO; 868 for(c=0x11fa; c<=0x11ff; ++c) { 869 value2=uchar_getHST(c); 870 if(value!=value2) { 871 value=value2; 872 sa->add(sa->set, c); 873 } 874 } 875 876 /* Add Hangul type boundaries for UCHAR_HANGUL_SYLLABLE_TYPE. */ 877 for(c=HANGUL_BASE; c<(HANGUL_BASE+HANGUL_COUNT); c+=JAMO_T_COUNT) { 878 sa->add(sa->set, c); 879 sa->add(sa->set, c+1); 880 } 881 sa->add(sa->set, c); 882 } 883 884 static UBool U_CALLCONV 885 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { 886 /* add the start code point to the USet */ 887 const USetAdder *sa=(const USetAdder *)context; 888 sa->add(sa->set, start); 889 return TRUE; 890 } 891 892 #define USET_ADD_CP_AND_NEXT(sa, cp) sa->add(sa->set, cp); sa->add(sa->set, cp+1) 893 894 U_CFUNC void U_EXPORT2 895 uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { 896 if(U_FAILURE(*pErrorCode)) { 897 return; 898 } 899 900 #if !UCHAR_HARDCODE_DATA 901 if(!HAVE_DATA) { 902 *pErrorCode=dataErrorCode; 903 return; 904 } 905 #endif 906 907 /* add the start code point of each same-value range of the main trie */ 908 utrie2_enum(&propsTrie, NULL, _enumPropertyStartsRange, sa); 909 910 /* add code points with hardcoded properties, plus the ones following them */ 911 912 /* add for u_isblank() */ 913 USET_ADD_CP_AND_NEXT(sa, TAB); 914 915 /* add for IS_THAT_CONTROL_SPACE() */ 916 sa->add(sa->set, CR+1); /* range TAB..CR */ 917 sa->add(sa->set, 0x1c); 918 sa->add(sa->set, 0x1f+1); 919 USET_ADD_CP_AND_NEXT(sa, NL); 920 921 /* add for u_isIDIgnorable() what was not added above */ 922 sa->add(sa->set, DEL); /* range DEL..NBSP-1, NBSP added below */ 923 sa->add(sa->set, HAIRSP); 924 sa->add(sa->set, RLM+1); 925 sa->add(sa->set, INHSWAP); 926 sa->add(sa->set, NOMDIG+1); 927 USET_ADD_CP_AND_NEXT(sa, ZWNBSP); 928 929 /* add no-break spaces for u_isWhitespace() what was not added above */ 930 USET_ADD_CP_AND_NEXT(sa, NBSP); 931 USET_ADD_CP_AND_NEXT(sa, FIGURESP); 932 USET_ADD_CP_AND_NEXT(sa, NNBSP); 933 934 /* add for u_digit() */ 935 sa->add(sa->set, U_a); 936 sa->add(sa->set, U_z+1); 937 sa->add(sa->set, U_A); 938 sa->add(sa->set, U_Z+1); 939 sa->add(sa->set, U_FW_a); 940 sa->add(sa->set, U_FW_z+1); 941 sa->add(sa->set, U_FW_A); 942 sa->add(sa->set, U_FW_Z+1); 943 944 /* add for u_isxdigit() */ 945 sa->add(sa->set, U_f+1); 946 sa->add(sa->set, U_F+1); 947 sa->add(sa->set, U_FW_f+1); 948 sa->add(sa->set, U_FW_F+1); 949 950 /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */ 951 sa->add(sa->set, WJ); /* range WJ..NOMDIG */ 952 sa->add(sa->set, 0xfff0); 953 sa->add(sa->set, 0xfffb+1); 954 sa->add(sa->set, 0xe0000); 955 sa->add(sa->set, 0xe0fff+1); 956 957 /* add for UCHAR_GRAPHEME_BASE and others */ 958 USET_ADD_CP_AND_NEXT(sa, CGJ); 959 } 960 961 U_CFUNC void U_EXPORT2 962 upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { 963 if(U_FAILURE(*pErrorCode)) { 964 return; 965 } 966 967 #if !UCHAR_HARDCODE_DATA 968 if(!HAVE_DATA) { 969 *pErrorCode=dataErrorCode; 970 return; 971 } 972 #endif 973 974 /* add the start code point of each same-value range of the properties vectors trie */ 975 if(propsVectorsColumns>0) { 976 /* if propsVectorsColumns==0 then the properties vectors trie may not be there at all */ 977 utrie2_enum(&propsVectorsTrie, NULL, _enumPropertyStartsRange, sa); 978 } 979 } 980