/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode database.

   Data was extracted from the UnicodeData.txt file.
   The current version number is reported in the unidata_version constant.

   Written by Marc-Andre Lemburg (mal (at) lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik (at) pythonware.com)
   Modified by Martin v. Löwis (martin (at) v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#define PY_SSIZE_T_CLEAN

#include "Python.h"
#include "ucnhash.h"
#include "structmember.h"

/*[clinic input]
module unicodedata
class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/

/* character properties */

typedef struct {
    const unsigned char category;           /* index into
                                               _PyUnicode_CategoryNames */
    const unsigned char combining;          /* combining class value 0 - 255 */
    const unsigned char bidirectional;      /* index into
                                               _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;           /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;   /* index into
                                               _PyUnicode_EastAsianWidth */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;

typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    const double numeric_changed;
} change_record;

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"

static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)
{
    int index;
    if (code >= 0x110000)
        index = 0;
    else {
        index = index1[(code>>SHIFT)];
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
    }

    return &_PyUnicode_Database_Records[index];
}

/* ------------- Previous-version API ------------------------------------- */
typedef struct previous_version {
    PyObject_HEAD
    const char *name;
    const change_record* (*getrecord)(Py_UCS4);
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;

#include "clinic/unicodedata.c.h"

#define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))

static PyMemberDef DB_members[] = {
    {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
    {NULL}
};

/* forward declaration */
static PyTypeObject UCD_Type;
#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)

static PyObject*
new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
                     Py_UCS4 (*normalization)(Py_UCS4))
{
    PreviousDBVersion *self;
    self = PyObject_New(PreviousDBVersion, &UCD_Type);
    if (self == NULL)
        return NULL;
    self->name = name;
    self->getrecord = getrecord;
    self->normalization = normalization;
    return (PyObject*)self;
}


/* --- Module API --------------------------------------------------------- */

/*[clinic input]
unicodedata.UCD.decimal

    self: self
    chr: int(accept={str})
    default: object=NULL
    /

Converts a Unicode character into its equivalent decimal value.

Returns the decimal value assigned to the character chr as integer.
If no such value is defined, default is returned, or, if not given,
ValueError is raised.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_decimal_impl(PyObject *self, int chr,
                             PyObject *default_value)
/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
{
    int have_old = 0;
    long rc;
    Py_UCS4 c = (Py_UCS4)chr;

    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TODECIMAL(c);
    if (rc < 0) {
        if (default_value == NULL) {
            PyErr_SetString(PyExc_ValueError,
                            "not a decimal");
            return NULL;
        }
        else {
            Py_INCREF(default_value);
            return default_value;
        }
    }
    return PyLong_FromLong(rc);
}

/*[clinic input]
unicodedata.UCD.digit

    self: self
    chr: int(accept={str})
    default: object=NULL
    /

Converts a Unicode character into its equivalent digit value.

Returns the digit value assigned to the character chr as integer.
If no such value is defined, default is returned, or, if not given,
ValueError is raised.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
/*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
{
    long rc;
    Py_UCS4 c = (Py_UCS4)chr;
    rc = Py_UNICODE_TODIGIT(c);
    if (rc < 0) {
        if (default_value == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a digit");
            return NULL;
        }
        else {
            Py_INCREF(default_value);
            return default_value;
        }
    }
    return PyLong_FromLong(rc);
}

/*[clinic input]
unicodedata.UCD.numeric

    self: self
    chr: int(accept={str})
    default: object=NULL
    /

Converts a Unicode character into its equivalent numeric value.

Returns the numeric value assigned to the character chr as float.
If no such value is defined, default is returned, or, if not given,
ValueError is raised.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_numeric_impl(PyObject *self, int chr,
                             PyObject *default_value)
/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
{
    int have_old = 0;
    double rc;
    Py_UCS4 c = (Py_UCS4)chr;

    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1.0;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TONUMERIC(c);
    if (rc == -1.0) {
        if (default_value == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a numeric character");
            return NULL;
        }
        else {
            Py_INCREF(default_value);
            return default_value;
        }
    }
    return PyFloat_FromDouble(rc);
}

/*[clinic input]
unicodedata.UCD.category

    self: self
    chr: int(accept={str})
    /

Returns the general category assigned to the character chr as string.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_category_impl(PyObject *self, int chr)
/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
{
    int index;
    Py_UCS4 c = (Py_UCS4)chr;
    index = (int) _getrecord_ex(c)->category;
    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed != 0xFF)
            index = old->category_changed;
    }
    return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
}

/*[clinic input]
unicodedata.UCD.bidirectional

    self: self
    chr: int(accept={str})
    /

Returns the bidirectional class assigned to the character chr as string.

If no such value is defined, an empty string is returned.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
{
    int index;
    Py_UCS4 c = (Py_UCS4)chr;
    index = (int) _getrecord_ex(c)->bidirectional;
    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->bidir_changed != 0xFF)
            index = old->bidir_changed;
    }
    return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
}

/*[clinic input]
unicodedata.UCD.combining -> int

    self: self
    chr: int(accept={str})
    /

Returns the canonical combining class assigned to the character chr as integer.

Returns 0 if no combining class is defined.
[clinic start generated code]*/

static int
unicodedata_UCD_combining_impl(PyObject *self, int chr)
/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
{
    int index;
    Py_UCS4 c = (Py_UCS4)chr;
    index = (int) _getrecord_ex(c)->combining;
    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return index;
}

/*[clinic input]
unicodedata.UCD.mirrored -> int

    self: self
    chr: int(accept={str})
    /

Returns the mirrored property assigned to the character chr as integer.

Returns 1 if the character has been identified as a "mirrored"
character in bidirectional text, 0 otherwise.
[clinic start generated code]*/

static int
unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
{
    int index;
    Py_UCS4 c = (Py_UCS4)chr;
    index = (int) _getrecord_ex(c)->mirrored;
    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->mirrored_changed != 0xFF)
            index = old->mirrored_changed;
    }
    return index;
}

/*[clinic input]
unicodedata.UCD.east_asian_width

    self: self
    chr: int(accept={str})
    /

Returns the east asian width assigned to the character chr as string.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
{
    int index;
    Py_UCS4 c = (Py_UCS4)chr;
    index = (int) _getrecord_ex(c)->east_asian_width;
    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->east_asian_width_changed != 0xFF)
            index = old->east_asian_width_changed;
    }
    return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
}

/*[clinic input]
unicodedata.UCD.decomposition

    self: self
    chr: int(accept={str})
    /

Returns the character decomposition mapping assigned to the character chr as string.

An empty string is returned in case no such mapping is defined.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
{
    char decomp[256];
    int code, index, count;
    size_t i;
    unsigned int prefix_index;
    Py_UCS4 c = (Py_UCS4)chr;

    code = (int)c;

    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            return PyUnicode_FromString(""); /* unassigned */
    }

    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                              (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (index into decomp_prefix) */
    count = decomp_data[index] >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* Based on how index is calculated above and decomp_data is generated
       from Tools/unicode/makeunicodedata.py, it should not be possible
       to overflow decomp_prefix. */
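    /* Illustrative example (assuming the record layout produced by
       makeunicodedata.py): for U+00C0 LATIN CAPITAL LETTER A WITH GRAVE the
       record encodes count 2 with the empty (canonical) prefix, and the two
       following decomp_data entries hold 0x0041 and 0x0300, so this function
       returns "0041 0300". */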
    prefix_index = decomp_data[index] & 255;
    assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));

    /* copy prefix */
    i = strlen(decomp_prefix[prefix_index]);
    memcpy(decomp, decomp_prefix[prefix_index], i);

    while (count-- > 0) {
        if (i)
            decomp[i++] = ' ';
        assert(i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
                      decomp_data[++index]);
        i += strlen(decomp + i);
    }
    return PyUnicode_FromStringAndSize(decomp, i);
}

static void
get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
{
    if (code >= 0x110000) {
        *index = 0;
    }
    else if (self && UCD_Check(self) &&
             get_old_record(self, code)->category_changed==0) {
        /* unassigned in old version */
        *index = 0;
    }
    else {
        *index = decomp_index1[(code>>DECOMP_SHIFT)];
        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
                               (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (index into decomp_prefix) */
    *count = decomp_data[*index] >> 8;
    *prefix = decomp_data[*index] & 255;

    (*index)++;
}

#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)

static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UCS4 *output;
    Py_ssize_t i, o, osize;
    int kind;
    void *data;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UCS4 stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_LENGTH(input);
    space = isize;
    /* Overallocate at most 10 characters. */
    if (space > 10) {
        if (space <= PY_SSIZE_T_MAX - 10)
            space += 10;
    }
    else {
        space *= 2;
    }
    osize = space;
    output = PyMem_NEW(Py_UCS4, space);
    if (!output) {
        PyErr_NoMemory();
        return NULL;
    }
    i = o = 0;
    kind = PyUnicode_KIND(input);
    data = PyUnicode_DATA(input);

    while (i < isize) {
        stack[stackptr++] = PyUnicode_READ(kind, data, i++);
        while(stackptr) {
            Py_UCS4 code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                Py_UCS4 *new_output;
                osize += 10;
                space += 10;
                new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
                if (new_output == NULL) {
                    PyMem_Free(output);
                    PyErr_NoMemory();
                    return NULL;
                }
                output = new_output;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                output[o++] = L;
                output[o++] = V;
                space -= 2;
                if (T != TBase) {
                    output[o++] = T;
                    space--;
                }
                continue;
            }
            /* normalization changes */
            if (self && UCD_Check(self)) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
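            /* Note (assumed from the database layout): a nonzero prefix marks
               a compatibility (tagged) mapping, and k is nonzero only for the
               NFKD/NFKC variants, so a canonical-only pass keeps such
               characters unchanged. */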
            if (!count || (prefix && !k)) {
                output[o++] = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order. */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    if (!result)
        return NULL;
    /* result is guaranteed to be ready, as it is compact. */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);

    /* Sort canonically. */
    i = 0;
    prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
        cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
            PyUnicode_WRITE(kind, data, o+1,
                            PyUnicode_READ(kind, data, o));
            PyUnicode_WRITE(kind, data, o, tmp);
            o--;
            if (o < 0)
                break;
            prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    }
    return result;
}

static int
find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
{
    unsigned int index;
    for (index = 0; nfc[index].start; index++) {
        unsigned int start = nfc[index].start;
        if (code < start)
            return -1;
        if (code <= start + nfc[index].count) {
            unsigned int delta = code - start;
            return nfc[index].index + delta;
        }
    }
    return -1;
}

static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    int kind;
    void *data;
    Py_UCS4 *output;
    Py_ssize_t i, i1, o, len;
    int f,l,index,index1,comb;
    Py_UCS4 code;
    Py_ssize_t skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;
    /* result will be "ready". */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);
    len = PyUnicode_GET_LENGTH(result);

    /* We allocate a buffer for the output.
       If we find that we made no changes, we still return
       the NFD result. */
    output = PyMem_NEW(Py_UCS4, len);
    if (!output) {
        PyErr_NoMemory();
        Py_DECREF(result);
        return 0;
    }
    i = o = 0;

  again:
    while (i < len) {
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list. */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        code = PyUnicode_READ(kind, data, i);
        if (LBase <= code && code < (LBase+LCount) &&
            i + 1 < len &&
            VBase <= PyUnicode_READ(kind, data, i+1) &&
            PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
            /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
               and V character is a modern vowel (0x1161 ~ 0x1175). */
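            /* Composition arithmetic: S = SBase + (LIndex*VCount + VIndex)*TCount
               + TIndex.  Illustrative example: U+1100 (LIndex 0) + U+1161
               (VIndex 0) compose to U+AC00; a trailing U+11A8 (TIndex 1) would
               give U+AC01. */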
            int LIndex, VIndex;
            LIndex = code - LBase;
            VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i += 2;
            if (i < len &&
                TBase < PyUnicode_READ(kind, data, i) &&
                PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
                /* check T character is a modern trailing consonant
                   (0x11A8 ~ 0x11C2). */
                code += PyUnicode_READ(kind, data, i)-TBase;
                i++;
            }
            output[o++] = code;
            continue;
        }

        /* code is still input[i] here */
        f = find_nfc_index(self, nfc_first, code);
        if (f == -1) {
            output[o++] = code;
            i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i+1;
        comb = 0;
        /* output base character for now; might be updated later. */
        output[o] = PyUnicode_READ(kind, data, i);
        while (i1 < len) {
            Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
            int comb1 = _getrecord_ex(code1)->combining;
            if (comb) {
                if (comb1 == 0)
                    break;
                if (comb >= comb1) {
                    /* Character is blocked. */
                    i1++;
                    continue;
                }
            }
            l = find_nfc_index(self, nfc_last, code1);
            /* i1 cannot be combined with i. If i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
              not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            index = f*TOTAL_LAST + l;
            index1 = comp_index[index >> COMP_SHIFT];
            code = comp_data[(index1<<COMP_SHIFT)+
                             (index&((1<<COMP_SHIFT)-1))];
            if (code == 0)
                goto not_combinable;

            /* Replace the original character. */
            output[o] = code;
            /* Mark the second character unused. */
            assert(cskipped < 20);
            skipped[cskipped++] = i1;
            i1++;
            f = find_nfc_index(self, nfc_first, output[o]);
            if (f == -1)
                break;
        }
        /* Output character was already written.
           Just advance the indices. */
        o++; i++;
    }
    if (o == len) {
        /* No changes. Return original string. */
        PyMem_Free(output);
        return result;
    }
    Py_DECREF(result);
    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    return result;
}

/* Return 1 if the input is certainly normalized, 0 if it might not be. */
static int
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
{
    Py_ssize_t i, len;
    int kind;
    void *data;
    unsigned char prev_combining = 0, quickcheck_mask;

    /* An older version of the database is requested, quickchecks must be
       disabled. */
    if (self && UCD_Check(self))
        return 0;

    /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
       as described in http://unicode.org/reports/tr15/#Annex8. */
    quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
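    /* Assuming the packing produced by makeunicodedata.py, each record's
       normalization_quick_check byte holds four 2-bit fields: bits 0-1 NFD,
       bits 2-3 NFKD, bits 4-5 NFC, bits 6-7 NFKC; the shift above selects the
       field for the requested form. */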

    i = 0;
    kind = PyUnicode_KIND(input);
    data = PyUnicode_DATA(input);
    len = PyUnicode_GET_LENGTH(input);
    while (i < len) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
        const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
        unsigned char combining = record->combining;
        unsigned char quickcheck = record->normalization_quick_check;

        if (quickcheck & quickcheck_mask)
            return 0; /* this string might need normalization */
        if (combining && prev_combining > combining)
            return 0; /* non-canonical sort order, not normalized */
        prev_combining = combining;
    }
    return 1; /* certainly normalized */
}

/*[clinic input]
unicodedata.UCD.normalize

    self: self
    form: str
    unistr as input: unicode
    /

Return the normal form 'form' for the Unicode string unistr.

Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
                               PyObject *input)
/*[clinic end generated code: output=62d1f8870027efdc input=1744c55f4ab79bf0]*/
{
    if (PyUnicode_GET_LENGTH(input) == 0) {
        /* Special case empty input strings, since resizing
           them later would cause internal errors. */
        Py_INCREF(input);
        return input;
    }

    if (strcmp(form, "NFC") == 0) {
        if (is_normalized(self, input, 1, 0)) {
            Py_INCREF(input);
            return input;
        }
        return nfc_nfkc(self, input, 0);
    }
    if (strcmp(form, "NFKC") == 0) {
        if (is_normalized(self, input, 1, 1)) {
            Py_INCREF(input);
            return input;
        }
        return nfc_nfkc(self, input, 1);
    }
    if (strcmp(form, "NFD") == 0) {
        if (is_normalized(self, input, 0, 0)) {
            Py_INCREF(input);
            return input;
        }
        return nfd_nfkd(self, input, 0);
    }
    if (strcmp(form, "NFKD") == 0) {
        if (is_normalized(self, input, 0, 1)) {
            Py_INCREF(input);
            return input;
        }
        return nfd_nfkd(self, input, 1);
    }
    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    return NULL;
}

/* -------------------------------------------------------------------- */
/* unicode character name tables */

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"

/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */

static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}

static const char * const hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};

/* These ranges need to match makeunicodedata.py:cjk_ranges. */
static int
is_unified_ideograph(Py_UCS4 code)
{
    return
        (0x3400 <= code && code <= 0x4DB5)   || /* CJK Ideograph Extension A */
        (0x4E00 <= code && code <= 0x9FEF)   || /* CJK Ideograph */
        (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
        (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
        (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
        (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
        (0x2CEB0 <= code && code <= 0x2EBEF);   /* CJK Ideograph Extension F */
}

/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences */
#define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
#define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
                          (cp < named_sequences_end))

static int
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
           int with_alias_and_seq)
{
    /* Find the name associated with the given code point.
     * If with_alias_and_seq is 1, check for names in the Private Use Area 15
     * that we are using for aliases and named sequences. */
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code >= 0x110000)
        return 0;

    /* XXX should we just skip all the code points in the PUAs here? */
    if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
        return 0;

    if (self && UCD_Check(self)) {
        /* in 3.2.0 there are no aliases and named sequences */
        const change_record *old;
        if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
            return 0;
        old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned */
            return 0;
        }
    }

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if (is_unified_ideograph(code)) {
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
        return 1;
    }

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                                (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            if (i > buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon.  the last character in the
           word has bit 7 set.  the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}

static int
_cmpname(PyObject *self, int code, const char* name, int namelen)
{
    /* check if code corresponds to the given name */
    int i;
    char buffer[NAME_MAXLEN+1];
    if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
        return 0;
    for (i = 0; i < namelen; i++) {
        if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}

static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
{
    int i, len1;
    *len = -1;
    for (i = 0; i < count; i++) {
        const char *s = hangul_syllables[i][column];
        len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
        if (len1 <= *len)
            continue;
        if (strncmp(str, s, len1) == 0) {
            *len = len1;
            *pos = i;
        }
    }
    if (*len == -1) {
        *len = 0;
    }
}

static int
_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
{
    /* check if named sequences are allowed */
    if (!with_named_seq && IS_NAMED_SEQ(cp))
        return 0;
    /* if the code point is in the PUA range that we use for aliases,
     * convert it to obtain the right code point */
    if (IS_ALIAS(cp))
        *code = name_aliases[cp-aliases_start];
    else
        *code = cp;
    return 1;
}

static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
         int with_named_seq)
{
    /* Return the code point associated with the given name.
     * Named aliases are resolved too (unless self != NULL (i.e. we are using
     * 3.2.0)).  If with_named_seq is 1, returns the PUA code point that we are
     * using for the named sequence, and the caller must then convert it. */
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int len, L = -1, V = -1, T = -1;
        const char *pos = name + 16;
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hexdigits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes.  see the makeunicodedata script for more
       details */
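    /* Open addressing, assumed to mirror makeunicodedata.py: the first probe
       is at (~h) & mask, later probes step by a hash-derived increment, and
       when the increment overflows the table size it is folded with code_poly
       to keep it in range and nonzero. */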

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;
    if (_cmpname(self, v, name, namelen))
        return _check_alias_and_seq(v, code, with_named_seq);
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(self, v, name, namelen))
            return _check_alias_and_seq(v, code, with_named_seq);
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}

static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};

/* -------------------------------------------------------------------- */
/* Python bindings */

/*[clinic input]
unicodedata.UCD.name

    self: self
    chr: int(accept={str})
    default: object=NULL
    /

Returns the name assigned to the character chr as a string.

If no name is defined, default is returned, or, if not given,
ValueError is raised.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
{
    char name[NAME_MAXLEN+1];
    Py_UCS4 c = (Py_UCS4)chr;

    if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
        if (default_value == NULL) {
            PyErr_SetString(PyExc_ValueError, "no such name");
            return NULL;
        }
        else {
            Py_INCREF(default_value);
            return default_value;
        }
    }

    return PyUnicode_FromString(name);
}

/*[clinic input]
unicodedata.UCD.lookup

    self: self
    name: str(accept={str, robuffer}, zeroes=True)
    /

Look up character by name.

If a character with the given name is found, return the
corresponding character.  If not found, KeyError is raised.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
                            Py_ssize_clean_t name_length)
/*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
{
    Py_UCS4 code;
    unsigned int index;
    if (name_length > NAME_MAXLEN) {
        PyErr_SetString(PyExc_KeyError, "name too long");
        return NULL;
    }

    if (!_getcode(self, name, (int)name_length, &code, 1)) {
        PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
        return NULL;
    }
    /* check if code is in the PUA range that we use for named sequences
       and convert it */
    if (IS_NAMED_SEQ(code)) {
        index = code-named_sequences_start;
        return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
                                         named_sequences[index].seq,
                                         named_sequences[index].seqlen);
    }
    return PyUnicode_FromOrdinal(code);
}
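
/* Illustrative note: a lookup that resolves into the named-sequences PUA
   range (for example "KEYCAP NUMBER SIGN", assuming it is present in the
   installed database) is expanded above into its multi-code-point sequence
   rather than returned as a single character. */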

/* XXX Add doc strings. */

static PyMethodDef unicodedata_functions[] = {
    UNICODEDATA_UCD_DECIMAL_METHODDEF
    UNICODEDATA_UCD_DIGIT_METHODDEF
    UNICODEDATA_UCD_NUMERIC_METHODDEF
    UNICODEDATA_UCD_CATEGORY_METHODDEF
    UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
    UNICODEDATA_UCD_COMBINING_METHODDEF
    UNICODEDATA_UCD_MIRRORED_METHODDEF
    UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
    UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
    UNICODEDATA_UCD_NAME_METHODDEF
    UNICODEDATA_UCD_LOOKUP_METHODDEF
    UNICODEDATA_UCD_NORMALIZE_METHODDEF
    {NULL, NULL}                /* sentinel */
};

static PyTypeObject UCD_Type = {
    /* The ob_type field must be initialized in the module init function
     * to be portable to Windows without using C++. */
    PyVarObject_HEAD_INIT(NULL, 0)
    "unicodedata.UCD",          /*tp_name*/
    sizeof(PreviousDBVersion),  /*tp_basicsize*/
    0,                          /*tp_itemsize*/
    /* methods */
    (destructor)PyObject_Del,   /*tp_dealloc*/
    0,                          /*tp_print*/
    0,                          /*tp_getattr*/
    0,                          /*tp_setattr*/
    0,                          /*tp_reserved*/
    0,                          /*tp_repr*/
    0,                          /*tp_as_number*/
    0,                          /*tp_as_sequence*/
    0,                          /*tp_as_mapping*/
    0,                          /*tp_hash*/
    0,                          /*tp_call*/
    0,                          /*tp_str*/
    PyObject_GenericGetAttr,    /*tp_getattro*/
    0,                          /*tp_setattro*/
    0,                          /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,         /*tp_flags*/
    0,                          /*tp_doc*/
    0,                          /*tp_traverse*/
    0,                          /*tp_clear*/
    0,                          /*tp_richcompare*/
    0,                          /*tp_weaklistoffset*/
    0,                          /*tp_iter*/
    0,                          /*tp_iternext*/
    unicodedata_functions,      /*tp_methods*/
    DB_members,                 /*tp_members*/
    0,                          /*tp_getset*/
    0,                          /*tp_base*/
    0,                          /*tp_dict*/
    0,                          /*tp_descr_get*/
    0,                          /*tp_descr_set*/
    0,                          /*tp_dictoffset*/
    0,                          /*tp_init*/
    0,                          /*tp_alloc*/
    0,                          /*tp_new*/
    0,                          /*tp_free*/
    0,                          /*tp_is_gc*/
};

PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
" UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format " UNIDATA_VERSION ".");

static struct PyModuleDef unicodedatamodule = {
    PyModuleDef_HEAD_INIT,
    "unicodedata",
    unicodedata_docstring,
    -1,
    unicodedata_functions,
    NULL,
    NULL,
    NULL,
    NULL
};

PyMODINIT_FUNC
PyInit_unicodedata(void)
{
    PyObject *m, *v;

    Py_TYPE(&UCD_Type) = &PyType_Type;

    m = PyModule_Create(&unicodedatamodule);
    if (!m)
        return NULL;

    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
    Py_INCREF(&UCD_Type);
    PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);

    /* Previous versions */
    v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
    if (v != NULL)
        PyModule_AddObject(m, "ucd_3_2_0", v);

    /* Export C API */
    v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
    if (v != NULL)
        PyModule_AddObject(m, "ucnhash_CAPI", v);
    return m;
}

/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/