1 /* 2 ******************************************************************************** 3 * Copyright (C) 1996-2010, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************** 6 * 7 * File UCHAR.C 8 * 9 * Modification History: 10 * 11 * Date Name Description 12 * 04/02/97 aliu Creation. 13 * 4/15/99 Madhu Updated all the function definitions for C Implementation 14 * 5/20/99 Madhu Added the function u_getVersion() 15 * 8/19/1999 srl Upgraded scripts to Unicode3.0 16 * 11/11/1999 weiv added u_isalnum(), cleaned comments 17 * 01/11/2000 helena Renamed u_getVersion to u_getUnicodeVersion. 18 * 06/20/2000 helena OS/400 port changes; mostly typecast. 19 ****************************************************************************** 20 */ 21 22 #include "unicode/utypes.h" 23 #include "unicode/uchar.h" 24 #include "unicode/uscript.h" 25 #include "unicode/udata.h" 26 #include "umutex.h" 27 #include "cmemory.h" 28 #include "ucln_cmn.h" 29 #include "utrie2.h" 30 #include "udataswp.h" 31 #include "uprops.h" 32 33 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 34 35 /* dynamically loaded Unicode character properties -------------------------- */ 36 37 #define UCHAR_HARDCODE_DATA 1 38 39 #if UCHAR_HARDCODE_DATA 40 41 /* uchar_props_data.c is machine-generated by genprops --csource */ 42 #include "uchar_props_data.c" 43 44 #else 45 46 /* 47 * loaded uprops.dat - 48 * for a description of the file format, see icu/source/tools/genprops/store.c 49 */ 50 static const char DATA_NAME[] = "uprops"; 51 static const char DATA_TYPE[] = "icu"; 52 53 static UDataMemory *propsData=NULL; 54 static UErrorCode dataErrorCode=U_ZERO_ERROR; 55 56 static uint8_t formatVersion[4]={ 0, 0, 0, 0 }; 57 static UVersionInfo dataVersion={ 0, 0, 0, 0 }; 58 59 static UTrie propsTrie={ 0 }, propsVectorsTrie={ 0 }; 60 static const uint32_t *pData32=NULL, *propsVectors=NULL; 61 static int32_t countPropsVectors=0, propsVectorsColumns=0; 62 63 static int8_t havePropsData=0; /* == 0 -> Data has not been loaded. 64 * < 0 -> Error occured attempting to load data. 65 * > 0 -> Data has been successfully loaded. 66 */ 67 68 /* index values loaded from uprops.dat */ 69 static int32_t indexes[UPROPS_INDEX_COUNT]; 70 71 static UBool U_CALLCONV 72 isAcceptable(void *context, 73 const char *type, const char *name, 74 const UDataInfo *pInfo) { 75 if( 76 pInfo->size>=20 && 77 pInfo->isBigEndian==U_IS_BIG_ENDIAN && 78 pInfo->charsetFamily==U_CHARSET_FAMILY && 79 pInfo->dataFormat[0]==0x55 && /* dataFormat="UPro" */ 80 pInfo->dataFormat[1]==0x50 && 81 pInfo->dataFormat[2]==0x72 && 82 pInfo->dataFormat[3]==0x6f && 83 pInfo->formatVersion[0]==4 && 84 pInfo->formatVersion[2]==UTRIE_SHIFT && 85 pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT 86 ) { 87 uprv_memcpy(formatVersion, pInfo->formatVersion, 4); 88 uprv_memcpy(dataVersion, pInfo->dataVersion, 4); 89 return TRUE; 90 } else { 91 return FALSE; 92 } 93 } 94 95 static UBool U_CALLCONV uchar_cleanup(void) 96 { 97 if (propsData) { 98 udata_close(propsData); 99 propsData=NULL; 100 } 101 pData32=NULL; 102 propsVectors=NULL; 103 countPropsVectors=0; 104 uprv_memset(dataVersion, 0, U_MAX_VERSION_LENGTH); 105 dataErrorCode=U_ZERO_ERROR; 106 havePropsData=0; 107 108 return TRUE; 109 } 110 111 struct UCharProps { 112 UDataMemory *propsData; 113 UTrie propsTrie, propsVectorsTrie; 114 const uint32_t *pData32; 115 }; 116 typedef struct UCharProps UCharProps; 117 118 /* open uprops.icu */ 119 static void 120 _openProps(UCharProps *ucp, UErrorCode *pErrorCode) { 121 const uint32_t *p; 122 int32_t length; 123 124 ucp->propsData=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode); 125 if(U_FAILURE(*pErrorCode)) { 126 return; 127 } 128 129 ucp->pData32=p=(const uint32_t *)udata_getMemory(ucp->propsData); 130 131 /* unserialize the trie; it is directly after the int32_t indexes[UPROPS_INDEX_COUNT] */ 132 length=(int32_t)p[UPROPS_PROPS32_INDEX]*4; 133 length=utrie_unserialize(&ucp->propsTrie, (const uint8_t *)(p+UPROPS_INDEX_COUNT), length-64, pErrorCode); 134 if(U_FAILURE(*pErrorCode)) { 135 return; 136 } 137 138 /* unserialize the properties vectors trie */ 139 length=(int32_t)(p[UPROPS_ADDITIONAL_VECTORS_INDEX]-p[UPROPS_ADDITIONAL_TRIE_INDEX])*4; 140 if(length>0) { 141 length=utrie_unserialize(&ucp->propsVectorsTrie, (const uint8_t *)(p+p[UPROPS_ADDITIONAL_TRIE_INDEX]), length, pErrorCode); 142 } 143 if(length<=0 || U_FAILURE(*pErrorCode)) { 144 /* 145 * length==0: 146 * Allow the properties vectors trie to be missing - 147 * also requires propsVectorsColumns=indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX] 148 * to be zero so that this trie is never accessed. 149 */ 150 uprv_memset(&ucp->propsVectorsTrie, 0, sizeof(ucp->propsVectorsTrie)); 151 } 152 } 153 154 #endif 155 156 #if !UCHAR_HARDCODE_DATA 157 static int8_t 158 uprv_loadPropsData(UErrorCode *pErrorCode) { 159 /* load Unicode character properties data from file if necessary */ 160 161 /* 162 * This lazy intialization with double-checked locking (without mutex protection for 163 * haveNormData==0) is transiently unsafe under certain circumstances. 164 * Check the readme and use u_init() if necessary. 165 */ 166 if(havePropsData==0) { 167 UCharProps ucp={ NULL }; 168 169 if(U_FAILURE(*pErrorCode)) { 170 return havePropsData; 171 } 172 173 /* open the data outside the mutex block */ 174 _openProps(&ucp, pErrorCode); 175 176 if(U_SUCCESS(*pErrorCode)) { 177 /* in the mutex block, set the data for this process */ 178 umtx_lock(NULL); 179 if(propsData==NULL) { 180 propsData=ucp.propsData; 181 ucp.propsData=NULL; 182 pData32=ucp.pData32; 183 ucp.pData32=NULL; 184 uprv_memcpy(&propsTrie, &ucp.propsTrie, sizeof(propsTrie)); 185 uprv_memcpy(&propsVectorsTrie, &ucp.propsVectorsTrie, sizeof(propsVectorsTrie)); 186 } 187 188 /* initialize some variables */ 189 uprv_memcpy(indexes, pData32, sizeof(indexes)); 190 191 /* additional properties */ 192 if(indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]!=0) { 193 propsVectors=pData32+indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]; 194 countPropsVectors=indexes[UPROPS_RESERVED_INDEX]-indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]; 195 propsVectorsColumns=indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]; 196 } 197 198 havePropsData=1; 199 umtx_unlock(NULL); 200 } else { 201 dataErrorCode=*pErrorCode; 202 havePropsData=-1; 203 } 204 ucln_common_registerCleanup(UCLN_COMMON_UCHAR, uchar_cleanup); 205 206 /* if a different thread set it first, then close the extra data */ 207 udata_close(ucp.propsData); /* NULL if it was set correctly */ 208 } 209 210 return havePropsData; 211 } 212 213 static int8_t 214 loadPropsData(void) { 215 UErrorCode errorCode = U_ZERO_ERROR; 216 int8_t retVal = uprv_loadPropsData(&errorCode); 217 return retVal; 218 } 219 220 #endif 221 222 /* constants and macros for access to the data ------------------------------ */ 223 224 /* getting a uint32_t properties word from the data */ 225 #if UCHAR_HARDCODE_DATA 226 227 #define GET_PROPS(c, result) ((result)=UTRIE2_GET16(&propsTrie, c)); 228 229 #else 230 231 #define HAVE_DATA (havePropsData>0 || loadPropsData()>0) 232 #define GET_PROPS_UNSAFE(c, result) \ 233 UTRIE_GET16(&propsTrie, c, result); 234 #define GET_PROPS(c, result) \ 235 if(HAVE_DATA) { \ 236 GET_PROPS_UNSAFE(c, result); \ 237 } else { \ 238 (result)=0; \ 239 } 240 241 #endif 242 243 U_CFUNC UBool 244 uprv_haveProperties(UErrorCode *pErrorCode) { 245 if(U_FAILURE(*pErrorCode)) { 246 return FALSE; 247 } 248 #if !UCHAR_HARDCODE_DATA 249 if(havePropsData==0) { 250 uprv_loadPropsData(pErrorCode); 251 } 252 if(havePropsData<0) { 253 *pErrorCode=dataErrorCode; 254 return FALSE; 255 } 256 #endif 257 return TRUE; 258 } 259 260 /* API functions ------------------------------------------------------------ */ 261 262 /* Gets the Unicode character's general category.*/ 263 U_CAPI int8_t U_EXPORT2 264 u_charType(UChar32 c) { 265 uint32_t props; 266 GET_PROPS(c, props); 267 return (int8_t)GET_CATEGORY(props); 268 } 269 270 /* Enumerate all code points with their general categories. */ 271 struct _EnumTypeCallback { 272 UCharEnumTypeRange *enumRange; 273 const void *context; 274 }; 275 276 static uint32_t U_CALLCONV 277 _enumTypeValue(const void *context, uint32_t value) { 278 return GET_CATEGORY(value); 279 } 280 281 static UBool U_CALLCONV 282 _enumTypeRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { 283 /* just cast the value to UCharCategory */ 284 return ((struct _EnumTypeCallback *)context)-> 285 enumRange(((struct _EnumTypeCallback *)context)->context, 286 start, end+1, (UCharCategory)value); 287 } 288 289 U_CAPI void U_EXPORT2 290 u_enumCharTypes(UCharEnumTypeRange *enumRange, const void *context) { 291 struct _EnumTypeCallback callback; 292 293 if(enumRange==NULL 294 #if !UCHAR_HARDCODE_DATA 295 || !HAVE_DATA 296 #endif 297 ) { 298 return; 299 } 300 301 callback.enumRange=enumRange; 302 callback.context=context; 303 utrie2_enum(&propsTrie, _enumTypeValue, _enumTypeRange, &callback); 304 } 305 306 /* Checks if ch is a lower case letter.*/ 307 U_CAPI UBool U_EXPORT2 308 u_islower(UChar32 c) { 309 uint32_t props; 310 GET_PROPS(c, props); 311 return (UBool)(GET_CATEGORY(props)==U_LOWERCASE_LETTER); 312 } 313 314 /* Checks if ch is an upper case letter.*/ 315 U_CAPI UBool U_EXPORT2 316 u_isupper(UChar32 c) { 317 uint32_t props; 318 GET_PROPS(c, props); 319 return (UBool)(GET_CATEGORY(props)==U_UPPERCASE_LETTER); 320 } 321 322 /* Checks if ch is a title case letter; usually upper case letters.*/ 323 U_CAPI UBool U_EXPORT2 324 u_istitle(UChar32 c) { 325 uint32_t props; 326 GET_PROPS(c, props); 327 return (UBool)(GET_CATEGORY(props)==U_TITLECASE_LETTER); 328 } 329 330 /* Checks if ch is a decimal digit. */ 331 U_CAPI UBool U_EXPORT2 332 u_isdigit(UChar32 c) { 333 uint32_t props; 334 GET_PROPS(c, props); 335 return (UBool)(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER); 336 } 337 338 U_CAPI UBool U_EXPORT2 339 u_isxdigit(UChar32 c) { 340 uint32_t props; 341 342 /* check ASCII and Fullwidth ASCII a-fA-F */ 343 if( 344 (c<=0x66 && c>=0x41 && (c<=0x46 || c>=0x61)) || 345 (c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41)) 346 ) { 347 return TRUE; 348 } 349 350 GET_PROPS(c, props); 351 return (UBool)(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER); 352 } 353 354 /* Checks if the Unicode character is a letter.*/ 355 U_CAPI UBool U_EXPORT2 356 u_isalpha(UChar32 c) { 357 uint32_t props; 358 GET_PROPS(c, props); 359 return (UBool)((CAT_MASK(props)&U_GC_L_MASK)!=0); 360 } 361 362 U_CAPI UBool U_EXPORT2 363 u_isUAlphabetic(UChar32 c) { 364 return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_ALPHABETIC))!=0; 365 } 366 367 /* Checks if c is a letter or a decimal digit */ 368 U_CAPI UBool U_EXPORT2 369 u_isalnum(UChar32 c) { 370 uint32_t props; 371 GET_PROPS(c, props); 372 return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_ND_MASK))!=0); 373 } 374 375 /** 376 * Checks if c is alphabetic, or a decimal digit; implements UCHAR_POSIX_ALNUM. 377 * @internal 378 */ 379 U_CFUNC UBool 380 u_isalnumPOSIX(UChar32 c) { 381 return (UBool)(u_isUAlphabetic(c) || u_isdigit(c)); 382 } 383 384 /* Checks if ch is a unicode character with assigned character type.*/ 385 U_CAPI UBool U_EXPORT2 386 u_isdefined(UChar32 c) { 387 uint32_t props; 388 GET_PROPS(c, props); 389 return (UBool)(GET_CATEGORY(props)!=0); 390 } 391 392 /* Checks if the Unicode character is a base form character that can take a diacritic.*/ 393 U_CAPI UBool U_EXPORT2 394 u_isbase(UChar32 c) { 395 uint32_t props; 396 GET_PROPS(c, props); 397 return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_N_MASK|U_GC_MC_MASK|U_GC_ME_MASK))!=0); 398 } 399 400 /* Checks if the Unicode character is a control character.*/ 401 U_CAPI UBool U_EXPORT2 402 u_iscntrl(UChar32 c) { 403 uint32_t props; 404 GET_PROPS(c, props); 405 return (UBool)((CAT_MASK(props)&(U_GC_CC_MASK|U_GC_CF_MASK|U_GC_ZL_MASK|U_GC_ZP_MASK))!=0); 406 } 407 408 U_CAPI UBool U_EXPORT2 409 u_isISOControl(UChar32 c) { 410 return (uint32_t)c<=0x9f && (c<=0x1f || c>=0x7f); 411 } 412 413 /* Some control characters that are used as space. */ 414 #define IS_THAT_CONTROL_SPACE(c) \ 415 (c<=0x9f && ((c>=TAB && c<=CR) || (c>=0x1c && c <=0x1f) || c==NL)) 416 417 /* Java has decided that U+0085 New Line is not whitespace any more. */ 418 #define IS_THAT_ASCII_CONTROL_SPACE(c) \ 419 (c<=0x1f && c>=TAB && (c<=CR || c>=0x1c)) 420 421 /* Checks if the Unicode character is a space character.*/ 422 U_CAPI UBool U_EXPORT2 423 u_isspace(UChar32 c) { 424 uint32_t props; 425 GET_PROPS(c, props); 426 return (UBool)((CAT_MASK(props)&U_GC_Z_MASK)!=0 || IS_THAT_CONTROL_SPACE(c)); 427 } 428 429 U_CAPI UBool U_EXPORT2 430 u_isJavaSpaceChar(UChar32 c) { 431 uint32_t props; 432 GET_PROPS(c, props); 433 return (UBool)((CAT_MASK(props)&U_GC_Z_MASK)!=0); 434 } 435 436 /* Checks if the Unicode character is a whitespace character.*/ 437 U_CAPI UBool U_EXPORT2 438 u_isWhitespace(UChar32 c) { 439 uint32_t props; 440 GET_PROPS(c, props); 441 return (UBool)( 442 ((CAT_MASK(props)&U_GC_Z_MASK)!=0 && 443 c!=NBSP && c!=FIGURESP && c!=NNBSP) || /* exclude no-break spaces */ 444 IS_THAT_ASCII_CONTROL_SPACE(c) 445 ); 446 } 447 448 U_CAPI UBool U_EXPORT2 449 u_isblank(UChar32 c) { 450 if((uint32_t)c<=0x9f) { 451 return c==9 || c==0x20; /* TAB or SPACE */ 452 } else { 453 /* Zs */ 454 uint32_t props; 455 GET_PROPS(c, props); 456 return (UBool)(GET_CATEGORY(props)==U_SPACE_SEPARATOR); 457 } 458 } 459 460 U_CAPI UBool U_EXPORT2 461 u_isUWhiteSpace(UChar32 c) { 462 return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_WHITE_SPACE))!=0; 463 } 464 465 /* Checks if the Unicode character is printable.*/ 466 U_CAPI UBool U_EXPORT2 467 u_isprint(UChar32 c) { 468 uint32_t props; 469 GET_PROPS(c, props); 470 /* comparing ==0 returns FALSE for the categories mentioned */ 471 return (UBool)((CAT_MASK(props)&U_GC_C_MASK)==0); 472 } 473 474 /** 475 * Checks if c is in \p{graph}\p{blank} - \p{cntrl}. 476 * Implements UCHAR_POSIX_PRINT. 477 * @internal 478 */ 479 U_CFUNC UBool 480 u_isprintPOSIX(UChar32 c) { 481 uint32_t props; 482 GET_PROPS(c, props); 483 /* 484 * The only cntrl character in graph+blank is TAB (in blank). 485 * Here we implement (blank-TAB)=Zs instead of calling u_isblank(). 486 */ 487 return (UBool)((GET_CATEGORY(props)==U_SPACE_SEPARATOR) || u_isgraphPOSIX(c)); 488 } 489 490 U_CAPI UBool U_EXPORT2 491 u_isgraph(UChar32 c) { 492 uint32_t props; 493 GET_PROPS(c, props); 494 /* comparing ==0 returns FALSE for the categories mentioned */ 495 return (UBool)((CAT_MASK(props)& 496 (U_GC_CC_MASK|U_GC_CF_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK)) 497 ==0); 498 } 499 500 /** 501 * Checks if c is in 502 * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}] 503 * with space=\p{Whitespace} and Control=Cc. 504 * Implements UCHAR_POSIX_GRAPH. 505 * @internal 506 */ 507 U_CFUNC UBool 508 u_isgraphPOSIX(UChar32 c) { 509 uint32_t props; 510 GET_PROPS(c, props); 511 /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */ 512 /* comparing ==0 returns FALSE for the categories mentioned */ 513 return (UBool)((CAT_MASK(props)& 514 (U_GC_CC_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK)) 515 ==0); 516 } 517 518 U_CAPI UBool U_EXPORT2 519 u_ispunct(UChar32 c) { 520 uint32_t props; 521 GET_PROPS(c, props); 522 return (UBool)((CAT_MASK(props)&U_GC_P_MASK)!=0); 523 } 524 525 /* Checks if the Unicode character can start a Unicode identifier.*/ 526 U_CAPI UBool U_EXPORT2 527 u_isIDStart(UChar32 c) { 528 /* same as u_isalpha() */ 529 uint32_t props; 530 GET_PROPS(c, props); 531 return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_NL_MASK))!=0); 532 } 533 534 /* Checks if the Unicode character can be a Unicode identifier part other than starting the 535 identifier.*/ 536 U_CAPI UBool U_EXPORT2 537 u_isIDPart(UChar32 c) { 538 uint32_t props; 539 GET_PROPS(c, props); 540 return (UBool)( 541 (CAT_MASK(props)& 542 (U_GC_ND_MASK|U_GC_NL_MASK| 543 U_GC_L_MASK| 544 U_GC_PC_MASK|U_GC_MC_MASK|U_GC_MN_MASK) 545 )!=0 || 546 u_isIDIgnorable(c)); 547 } 548 549 /*Checks if the Unicode character can be ignorable in a Java or Unicode identifier.*/ 550 U_CAPI UBool U_EXPORT2 551 u_isIDIgnorable(UChar32 c) { 552 if(c<=0x9f) { 553 return u_isISOControl(c) && !IS_THAT_ASCII_CONTROL_SPACE(c); 554 } else { 555 uint32_t props; 556 GET_PROPS(c, props); 557 return (UBool)(GET_CATEGORY(props)==U_FORMAT_CHAR); 558 } 559 } 560 561 /*Checks if the Unicode character can start a Java identifier.*/ 562 U_CAPI UBool U_EXPORT2 563 u_isJavaIDStart(UChar32 c) { 564 uint32_t props; 565 GET_PROPS(c, props); 566 return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_SC_MASK|U_GC_PC_MASK))!=0); 567 } 568 569 /*Checks if the Unicode character can be a Java identifier part other than starting the 570 * identifier. 571 */ 572 U_CAPI UBool U_EXPORT2 573 u_isJavaIDPart(UChar32 c) { 574 uint32_t props; 575 GET_PROPS(c, props); 576 return (UBool)( 577 (CAT_MASK(props)& 578 (U_GC_ND_MASK|U_GC_NL_MASK| 579 U_GC_L_MASK| 580 U_GC_SC_MASK|U_GC_PC_MASK| 581 U_GC_MC_MASK|U_GC_MN_MASK) 582 )!=0 || 583 u_isIDIgnorable(c)); 584 } 585 586 U_CAPI int32_t U_EXPORT2 587 u_charDigitValue(UChar32 c) { 588 uint32_t props; 589 int32_t value; 590 GET_PROPS(c, props); 591 value=(int32_t)GET_NUMERIC_TYPE_VALUE(props)-UPROPS_NTV_DECIMAL_START; 592 if(value<=9) { 593 return value; 594 } else { 595 return -1; 596 } 597 } 598 599 U_CAPI double U_EXPORT2 600 u_getNumericValue(UChar32 c) { 601 uint32_t props; 602 int32_t ntv; 603 GET_PROPS(c, props); 604 ntv=(int32_t)GET_NUMERIC_TYPE_VALUE(props); 605 606 if(ntv==UPROPS_NTV_NONE) { 607 return U_NO_NUMERIC_VALUE; 608 } else if(ntv<UPROPS_NTV_DIGIT_START) { 609 /* decimal digit */ 610 return ntv-UPROPS_NTV_DECIMAL_START; 611 } else if(ntv<UPROPS_NTV_NUMERIC_START) { 612 /* other digit */ 613 return ntv-UPROPS_NTV_DIGIT_START; 614 } else if(ntv<UPROPS_NTV_FRACTION_START) { 615 /* small integer */ 616 return ntv-UPROPS_NTV_NUMERIC_START; 617 } else if(ntv<UPROPS_NTV_LARGE_START) { 618 /* fraction */ 619 int32_t numerator=(ntv>>4)-12; 620 int32_t denominator=(ntv&0xf)+1; 621 return (double)numerator/denominator; 622 } else if(ntv<UPROPS_NTV_RESERVED_START) { 623 /* large, single-significant-digit integer */ 624 double numValue; 625 int32_t mant=(ntv>>5)-14; 626 int32_t exp=(ntv&0x1f)+2; 627 numValue=mant; 628 629 /* multiply by 10^exp without math.h */ 630 while(exp>=4) { 631 numValue*=10000.; 632 exp-=4; 633 } 634 switch(exp) { 635 case 3: 636 numValue*=1000.; 637 break; 638 case 2: 639 numValue*=100.; 640 break; 641 case 1: 642 numValue*=10.; 643 break; 644 case 0: 645 default: 646 break; 647 } 648 649 return numValue; 650 } else { 651 /* reserved */ 652 return U_NO_NUMERIC_VALUE; 653 } 654 } 655 656 U_CAPI int32_t U_EXPORT2 657 u_digit(UChar32 ch, int8_t radix) { 658 int8_t value; 659 if((uint8_t)(radix-2)<=(36-2)) { 660 value=(int8_t)u_charDigitValue(ch); 661 if(value<0) { 662 /* ch is not a decimal digit, try latin letters */ 663 if(ch>=0x61 && ch<=0x7A) { 664 value=(int8_t)(ch-0x57); /* ch - 'a' + 10 */ 665 } else if(ch>=0x41 && ch<=0x5A) { 666 value=(int8_t)(ch-0x37); /* ch - 'A' + 10 */ 667 } else if(ch>=0xFF41 && ch<=0xFF5A) { 668 value=(int8_t)(ch-0xFF37); /* fullwidth ASCII a-z */ 669 } else if(ch>=0xFF21 && ch<=0xFF3A) { 670 value=(int8_t)(ch-0xFF17); /* fullwidth ASCII A-Z */ 671 } 672 } 673 } else { 674 value=-1; /* invalid radix */ 675 } 676 return (int8_t)((value<radix) ? value : -1); 677 } 678 679 U_CAPI UChar32 U_EXPORT2 680 u_forDigit(int32_t digit, int8_t radix) { 681 if((uint8_t)(radix-2)>(36-2) || (uint32_t)digit>=(uint32_t)radix) { 682 return 0; 683 } else if(digit<10) { 684 return (UChar32)(0x30+digit); 685 } else { 686 return (UChar32)((0x61-10)+digit); 687 } 688 } 689 690 /* miscellaneous, and support for uprops.c ---------------------------------- */ 691 692 U_CAPI void U_EXPORT2 693 u_getUnicodeVersion(UVersionInfo versionArray) { 694 if(versionArray!=NULL) { 695 uprv_memcpy(versionArray, dataVersion, U_MAX_VERSION_LENGTH); 696 } 697 } 698 699 U_CFUNC uint32_t 700 u_getUnicodeProperties(UChar32 c, int32_t column) { 701 uint16_t vecIndex; 702 703 if(column==-1) { 704 uint32_t props; 705 GET_PROPS(c, props); 706 return props; 707 } else if( 708 #if !UCHAR_HARDCODE_DATA 709 !HAVE_DATA || countPropsVectors==0 || 710 #endif 711 column<0 || column>=propsVectorsColumns 712 ) { 713 return 0; 714 } else { 715 vecIndex=UTRIE2_GET16(&propsVectorsTrie, c); 716 return propsVectors[vecIndex+column]; 717 } 718 } 719 720 U_CFUNC int32_t 721 uprv_getMaxValues(int32_t column) { 722 #if !UCHAR_HARDCODE_DATA 723 if(HAVE_DATA) { 724 #endif 725 switch(column) { 726 case 0: 727 return indexes[UPROPS_MAX_VALUES_INDEX]; 728 case 2: 729 return indexes[UPROPS_MAX_VALUES_2_INDEX]; 730 default: 731 return 0; 732 } 733 #if !UCHAR_HARDCODE_DATA 734 } else { 735 return 0; 736 } 737 #endif 738 } 739 740 U_CAPI void U_EXPORT2 741 u_charAge(UChar32 c, UVersionInfo versionArray) { 742 if(versionArray!=NULL) { 743 uint32_t version=u_getUnicodeProperties(c, 0)>>UPROPS_AGE_SHIFT; 744 versionArray[0]=(uint8_t)(version>>4); 745 versionArray[1]=(uint8_t)(version&0xf); 746 versionArray[2]=versionArray[3]=0; 747 } 748 } 749 750 U_CAPI UScriptCode U_EXPORT2 751 uscript_getScript(UChar32 c, UErrorCode *pErrorCode) { 752 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 753 return USCRIPT_INVALID_CODE; 754 } 755 if((uint32_t)c>0x10ffff) { 756 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 757 return USCRIPT_INVALID_CODE; 758 } 759 760 return (UScriptCode)(u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_MASK); 761 } 762 763 U_CAPI UBlockCode U_EXPORT2 764 ublock_getCode(UChar32 c) { 765 return (UBlockCode)((u_getUnicodeProperties(c, 0)&UPROPS_BLOCK_MASK)>>UPROPS_BLOCK_SHIFT); 766 } 767 768 /* property starts for UnicodeSet ------------------------------------------- */ 769 770 static UBool U_CALLCONV 771 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { 772 /* add the start code point to the USet */ 773 const USetAdder *sa=(const USetAdder *)context; 774 sa->add(sa->set, start); 775 return TRUE; 776 } 777 778 #define USET_ADD_CP_AND_NEXT(sa, cp) sa->add(sa->set, cp); sa->add(sa->set, cp+1) 779 780 U_CFUNC void U_EXPORT2 781 uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { 782 if(U_FAILURE(*pErrorCode)) { 783 return; 784 } 785 786 #if !UCHAR_HARDCODE_DATA 787 if(!HAVE_DATA) { 788 *pErrorCode=dataErrorCode; 789 return; 790 } 791 #endif 792 793 /* add the start code point of each same-value range of the main trie */ 794 utrie2_enum(&propsTrie, NULL, _enumPropertyStartsRange, sa); 795 796 /* add code points with hardcoded properties, plus the ones following them */ 797 798 /* add for u_isblank() */ 799 USET_ADD_CP_AND_NEXT(sa, TAB); 800 801 /* add for IS_THAT_CONTROL_SPACE() */ 802 sa->add(sa->set, CR+1); /* range TAB..CR */ 803 sa->add(sa->set, 0x1c); 804 sa->add(sa->set, 0x1f+1); 805 USET_ADD_CP_AND_NEXT(sa, NL); 806 807 /* add for u_isIDIgnorable() what was not added above */ 808 sa->add(sa->set, DEL); /* range DEL..NBSP-1, NBSP added below */ 809 sa->add(sa->set, HAIRSP); 810 sa->add(sa->set, RLM+1); 811 sa->add(sa->set, INHSWAP); 812 sa->add(sa->set, NOMDIG+1); 813 USET_ADD_CP_AND_NEXT(sa, ZWNBSP); 814 815 /* add no-break spaces for u_isWhitespace() what was not added above */ 816 USET_ADD_CP_AND_NEXT(sa, NBSP); 817 USET_ADD_CP_AND_NEXT(sa, FIGURESP); 818 USET_ADD_CP_AND_NEXT(sa, NNBSP); 819 820 /* add for u_digit() */ 821 sa->add(sa->set, U_a); 822 sa->add(sa->set, U_z+1); 823 sa->add(sa->set, U_A); 824 sa->add(sa->set, U_Z+1); 825 sa->add(sa->set, U_FW_a); 826 sa->add(sa->set, U_FW_z+1); 827 sa->add(sa->set, U_FW_A); 828 sa->add(sa->set, U_FW_Z+1); 829 830 /* add for u_isxdigit() */ 831 sa->add(sa->set, U_f+1); 832 sa->add(sa->set, U_F+1); 833 sa->add(sa->set, U_FW_f+1); 834 sa->add(sa->set, U_FW_F+1); 835 836 /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */ 837 sa->add(sa->set, WJ); /* range WJ..NOMDIG */ 838 sa->add(sa->set, 0xfff0); 839 sa->add(sa->set, 0xfffb+1); 840 sa->add(sa->set, 0xe0000); 841 sa->add(sa->set, 0xe0fff+1); 842 843 /* add for UCHAR_GRAPHEME_BASE and others */ 844 USET_ADD_CP_AND_NEXT(sa, CGJ); 845 } 846 847 U_CFUNC void U_EXPORT2 848 upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { 849 if(U_FAILURE(*pErrorCode)) { 850 return; 851 } 852 853 #if !UCHAR_HARDCODE_DATA 854 if(!HAVE_DATA) { 855 *pErrorCode=dataErrorCode; 856 return; 857 } 858 #endif 859 860 /* add the start code point of each same-value range of the properties vectors trie */ 861 if(propsVectorsColumns>0) { 862 /* if propsVectorsColumns==0 then the properties vectors trie may not be there at all */ 863 utrie2_enum(&propsVectorsTrie, NULL, _enumPropertyStartsRange, sa); 864 } 865 } 866