/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode database.

   Data was extracted from the UnicodeData.txt file.
   The current version number is reported in the unidata_version constant.

   Written by Marc-Andre Lemburg (mal (at) lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik (at) pythonware.com)
   Modified by Martin v. Lwis (martin (at) v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#define PY_SSIZE_T_CLEAN

#include "Python.h"
#include "ucnhash.h"
#include "structmember.h"

/*[clinic input]
module unicodedata
class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/

/* character properties */

/* One per-code-point property record; the tables indexed by the fields
   below are generated by Tools/unicode/makeunicodedata.py. */
typedef struct {
    const unsigned char category;           /* index into
                                               _PyUnicode_CategoryNames */
    const unsigned char combining;          /* combining class value 0 - 255 */
    const unsigned char bidirectional;      /* index into
                                               _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;           /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;   /* index into
                                               _PyUnicode_EastAsianWidth */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;

/* Delta between the current database and an older Unicode version;
   0xFF in a *_changed byte means "unchanged". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    const double numeric_changed;
} change_record;

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"

/* Look up the property record for a code point via the two-level
   (SHIFT-split) index generated by makeunicodedata.py.  Out-of-range
   code points map to record 0, the "unassigned" record. */
static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)
{
    int index;
    if (code >= 0x110000)
        index = 0;
    else {
        index = index1[(code>>SHIFT)];
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
    }

    return &_PyUnicode_Database_Records[index];
}

/* ------------- Previous-version API ------------------------------------- */
/* A UCD instance exposing the database as of an older Unicode version;
   the callbacks patch current lookups with historical data. */
typedef struct previous_version {
    PyObject_HEAD
    const char *name;                            /* version string, e.g. "3.2.0" */
    const change_record* (*getrecord)(Py_UCS4);  /* per-code-point deltas */
    Py_UCS4 (*normalization)(Py_UCS4);           /* old normalization overrides */
} PreviousDBVersion;

#include "clinic/unicodedata.c.h"

#define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))

static PyMemberDef DB_members[] = {
        {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
        {NULL}
};

/* forward declaration */
static PyTypeObject UCD_Type;
#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)

/* Create a PreviousDBVersion wrapping the given version name and lookup
   callbacks.  Returns a new reference, or NULL on allocation failure.
   NOTE(review): `name` is stored without copying -- assumed to be a
   static string; confirm at the call sites. */
static PyObject*
new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
                     Py_UCS4 (*normalization)(Py_UCS4))
{
        PreviousDBVersion *self;
        self = PyObject_New(PreviousDBVersion, &UCD_Type);
        if (self == NULL)
            return NULL;
        self->name = name;
        self->getrecord = getrecord;
        self->normalization = normalization;
        return (PyObject*)self;
}


/* --- Module API --------------------------------------------------------- */

/*[clinic input]
unicodedata.UCD.decimal

    self: self
    chr: int(accept={str})
    default: object=NULL
    /

Converts a Unicode character into its equivalent decimal value.

Returns the decimal value assigned to the character chr as integer.
If no such value is defined, default is returned, or, if not given,
ValueError is raised.
120 [clinic start generated code]*/ 121 122 static PyObject * 123 unicodedata_UCD_decimal_impl(PyObject *self, int chr, 124 PyObject *default_value) 125 /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/ 126 { 127 int have_old = 0; 128 long rc; 129 Py_UCS4 c = (Py_UCS4)chr; 130 131 if (self && UCD_Check(self)) { 132 const change_record *old = get_old_record(self, c); 133 if (old->category_changed == 0) { 134 /* unassigned */ 135 have_old = 1; 136 rc = -1; 137 } 138 else if (old->decimal_changed != 0xFF) { 139 have_old = 1; 140 rc = old->decimal_changed; 141 } 142 } 143 144 if (!have_old) 145 rc = Py_UNICODE_TODECIMAL(c); 146 if (rc < 0) { 147 if (default_value == NULL) { 148 PyErr_SetString(PyExc_ValueError, 149 "not a decimal"); 150 return NULL; 151 } 152 else { 153 Py_INCREF(default_value); 154 return default_value; 155 } 156 } 157 return PyLong_FromLong(rc); 158 } 159 160 /*[clinic input] 161 unicodedata.UCD.digit 162 163 self: self 164 chr: int(accept={str}) 165 default: object=NULL 166 / 167 168 Converts a Unicode character into its equivalent digit value. 169 170 Returns the digit value assigned to the character chr as integer. 171 If no such value is defined, default is returned, or, if not given, 172 ValueError is raised. 
173 [clinic start generated code]*/ 174 175 static PyObject * 176 unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value) 177 /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/ 178 { 179 long rc; 180 Py_UCS4 c = (Py_UCS4)chr; 181 rc = Py_UNICODE_TODIGIT(c); 182 if (rc < 0) { 183 if (default_value == NULL) { 184 PyErr_SetString(PyExc_ValueError, "not a digit"); 185 return NULL; 186 } 187 else { 188 Py_INCREF(default_value); 189 return default_value; 190 } 191 } 192 return PyLong_FromLong(rc); 193 } 194 195 /*[clinic input] 196 unicodedata.UCD.numeric 197 198 self: self 199 chr: int(accept={str}) 200 default: object=NULL 201 / 202 203 Converts a Unicode character into its equivalent numeric value. 204 205 Returns the numeric value assigned to the character chr as float. 206 If no such value is defined, default is returned, or, if not given, 207 ValueError is raised. 208 [clinic start generated code]*/ 209 210 static PyObject * 211 unicodedata_UCD_numeric_impl(PyObject *self, int chr, 212 PyObject *default_value) 213 /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/ 214 { 215 int have_old = 0; 216 double rc; 217 Py_UCS4 c = (Py_UCS4)chr; 218 219 if (self && UCD_Check(self)) { 220 const change_record *old = get_old_record(self, c); 221 if (old->category_changed == 0) { 222 /* unassigned */ 223 have_old = 1; 224 rc = -1.0; 225 } 226 else if (old->decimal_changed != 0xFF) { 227 have_old = 1; 228 rc = old->decimal_changed; 229 } 230 } 231 232 if (!have_old) 233 rc = Py_UNICODE_TONUMERIC(c); 234 if (rc == -1.0) { 235 if (default_value == NULL) { 236 PyErr_SetString(PyExc_ValueError, "not a numeric character"); 237 return NULL; 238 } 239 else { 240 Py_INCREF(default_value); 241 return default_value; 242 } 243 } 244 return PyFloat_FromDouble(rc); 245 } 246 247 /*[clinic input] 248 unicodedata.UCD.category 249 250 self: self 251 chr: int(accept={str}) 252 / 253 254 Returns the general category 
assigned to the character chr as string. 255 [clinic start generated code]*/ 256 257 static PyObject * 258 unicodedata_UCD_category_impl(PyObject *self, int chr) 259 /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/ 260 { 261 int index; 262 Py_UCS4 c = (Py_UCS4)chr; 263 index = (int) _getrecord_ex(c)->category; 264 if (self && UCD_Check(self)) { 265 const change_record *old = get_old_record(self, c); 266 if (old->category_changed != 0xFF) 267 index = old->category_changed; 268 } 269 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]); 270 } 271 272 /*[clinic input] 273 unicodedata.UCD.bidirectional 274 275 self: self 276 chr: int(accept={str}) 277 / 278 279 Returns the bidirectional class assigned to the character chr as string. 280 281 If no such value is defined, an empty string is returned. 282 [clinic start generated code]*/ 283 284 static PyObject * 285 unicodedata_UCD_bidirectional_impl(PyObject *self, int chr) 286 /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/ 287 { 288 int index; 289 Py_UCS4 c = (Py_UCS4)chr; 290 index = (int) _getrecord_ex(c)->bidirectional; 291 if (self && UCD_Check(self)) { 292 const change_record *old = get_old_record(self, c); 293 if (old->category_changed == 0) 294 index = 0; /* unassigned */ 295 else if (old->bidir_changed != 0xFF) 296 index = old->bidir_changed; 297 } 298 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]); 299 } 300 301 /*[clinic input] 302 unicodedata.UCD.combining -> int 303 304 self: self 305 chr: int(accept={str}) 306 / 307 308 Returns the canonical combining class assigned to the character chr as integer. 309 310 Returns 0 if no combining class is defined. 
311 [clinic start generated code]*/ 312 313 static int 314 unicodedata_UCD_combining_impl(PyObject *self, int chr) 315 /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/ 316 { 317 int index; 318 Py_UCS4 c = (Py_UCS4)chr; 319 index = (int) _getrecord_ex(c)->combining; 320 if (self && UCD_Check(self)) { 321 const change_record *old = get_old_record(self, c); 322 if (old->category_changed == 0) 323 index = 0; /* unassigned */ 324 } 325 return index; 326 } 327 328 /*[clinic input] 329 unicodedata.UCD.mirrored -> int 330 331 self: self 332 chr: int(accept={str}) 333 / 334 335 Returns the mirrored property assigned to the character chr as integer. 336 337 Returns 1 if the character has been identified as a "mirrored" 338 character in bidirectional text, 0 otherwise. 339 [clinic start generated code]*/ 340 341 static int 342 unicodedata_UCD_mirrored_impl(PyObject *self, int chr) 343 /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/ 344 { 345 int index; 346 Py_UCS4 c = (Py_UCS4)chr; 347 index = (int) _getrecord_ex(c)->mirrored; 348 if (self && UCD_Check(self)) { 349 const change_record *old = get_old_record(self, c); 350 if (old->category_changed == 0) 351 index = 0; /* unassigned */ 352 else if (old->mirrored_changed != 0xFF) 353 index = old->mirrored_changed; 354 } 355 return index; 356 } 357 358 /*[clinic input] 359 unicodedata.UCD.east_asian_width 360 361 self: self 362 chr: int(accept={str}) 363 / 364 365 Returns the east asian width assigned to the character chr as string. 
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
{
    int index;
    Py_UCS4 c = (Py_UCS4)chr;
    index = (int) _getrecord_ex(c)->east_asian_width;
    if (self && UCD_Check(self)) {
        /* old-version snapshot: apply recorded deltas */
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->east_asian_width_changed != 0xFF)
            index = old->east_asian_width_changed;
    }
    return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
}

/*[clinic input]
unicodedata.UCD.decomposition

    self: self
    chr: int(accept={str})
    /

Returns the character decomposition mapping assigned to the character chr as string.

An empty string is returned in case no such mapping is defined.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
{
    char decomp[256];
    int code, index, count;
    size_t i;
    unsigned int prefix_index;
    Py_UCS4 c = (Py_UCS4)chr;

    code = (int)c;

    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            return PyUnicode_FromString(""); /* unassigned */
    }

    /* two-level trie lookup into the decomposition data, analogous to
       _getrecord_ex(); index 0 means "no decomposition" */
    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                             (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (from*/
    count = decomp_data[index] >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* Based on how index is calculated above and decomp_data is generated
       from Tools/unicode/makeunicodedata.py, it should not be possible
       to overflow decomp_prefix. */
    prefix_index = decomp_data[index] & 255;
    assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));

    /* copy prefix */
    i = strlen(decomp_prefix[prefix_index]);
    memcpy(decomp, decomp_prefix[prefix_index], i);

    /* append the decomposition code points as space-separated %04X hex */
    while (count-- > 0) {
        if (i)
            decomp[i++] = ' ';
        assert(i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
                      decomp_data[++index]);
        i += strlen(decomp + i);
    }
    return PyUnicode_FromStringAndSize(decomp, i);
}

/* Fetch the decomposition record for `code`: *index is left pointing at
   the first decomposition code point, *count is the number of code
   points, *prefix the compatibility-tag index (0 = canonical). */
static void
get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
{
    if (code >= 0x110000) {
        *index = 0;
    } else if (self && UCD_Check(self) &&
               get_old_record(self, code)->category_changed==0) {
        /* unassigned in old version */
        *index = 0;
    }
    else {
        *index = decomp_index1[(code>>DECOMP_SHIFT)];
        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
                               (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (from*/
    *count = decomp_data[*index] >> 8;
    *prefix = decomp_data[*index] & 255;

    (*index)++;
}

/* Hangul syllable constants (The Unicode Standard, ch. 3.12) */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)

/* Compute NFD (k==0) or NFKD (k==1) of `input`, returning a new
   4-byte-kind string: recursively decompose via a small work stack,
   then apply the canonical-ordering bubble pass. */
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UCS4 *output;
    Py_ssize_t i, o, osize;
    int kind;
    void *data;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UCS4 stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_LENGTH(input);
    space = isize;
    /* Overallocate at most 10 characters. */
    if (space > 10) {
        if (space <= PY_SSIZE_T_MAX - 10)
            space += 10;
    }
    else {
        space *= 2;
    }
    osize = space;
    output = PyMem_NEW(Py_UCS4, space);
    if (!output) {
        PyErr_NoMemory();
        return NULL;
    }
    i = o = 0;
    kind = PyUnicode_KIND(input);
    data = PyUnicode_DATA(input);

    while (i < isize) {
        stack[stackptr++] = PyUnicode_READ(kind, data, i++);
        while(stackptr) {
            Py_UCS4 code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                Py_UCS4 *new_output;
                osize += 10;
                space += 10;
                new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
                if (new_output == NULL) {
                    PyMem_Free(output);
                    PyErr_NoMemory();
                    return NULL;
                }
                output = new_output;
            }
            /* Hangul Decomposition (arithmetic, not table-driven). */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                output[o++] = L;
                output[o++] = V;
                space -= 2;
                if (T != TBase) {
                    output[o++] = T;
                    space --;
                }
                continue;
            }
            /* normalization changes (old-version overrides) */
            if (self && UCD_Check(self)) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                output[o++] = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order, so it is re-examined left-to-right. */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    if (!result)
        return NULL;
    /* result is guaranteed to be ready, as it is compact. */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);

    /* Sort canonically: bubble out-of-order combining marks backwards;
       starters (class 0) act as barriers. */
    i = 0;
    prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
        cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
            PyUnicode_WRITE(kind, data, o+1,
                            PyUnicode_READ(kind, data, o));
            PyUnicode_WRITE(kind, data, o, tmp);
            o--;
            if (o < 0)
                break;
            prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    }
    return result;
}

/* Map a code point to its index in the NFC first/last tables, or -1 if
   it does not occur as a first/last character of a canonical pair. */
static int
find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
{
    unsigned int index;
    for (index = 0; nfc[index].start; index++) {
        unsigned int start = nfc[index].start;
        if (code < start)
            return -1;
        if (code <= start + nfc[index].count) {
            unsigned int delta = code - start;
            return nfc[index].index + delta;
        }
    }
    return -1;
}

/* Compute NFC (k==0) or NFKC (k==1): decompose via nfd_nfkd(), then
   recompose canonical pairs and Hangul syllables in place. */
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    int kind;
    void *data;
    Py_UCS4 *output;
    Py_ssize_t i, i1, o, len;
    int f,l,index,index1,comb;
    Py_UCS4 code;
    Py_ssize_t skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;
    /* result will be
"ready". */ 651 kind = PyUnicode_KIND(result); 652 data = PyUnicode_DATA(result); 653 len = PyUnicode_GET_LENGTH(result); 654 655 /* We allocate a buffer for the output. 656 If we find that we made no changes, we still return 657 the NFD result. */ 658 output = PyMem_NEW(Py_UCS4, len); 659 if (!output) { 660 PyErr_NoMemory(); 661 Py_DECREF(result); 662 return 0; 663 } 664 i = o = 0; 665 666 again: 667 while (i < len) { 668 for (index = 0; index < cskipped; index++) { 669 if (skipped[index] == i) { 670 /* *i character is skipped. 671 Remove from list. */ 672 skipped[index] = skipped[cskipped-1]; 673 cskipped--; 674 i++; 675 goto again; /* continue while */ 676 } 677 } 678 /* Hangul Composition. We don't need to check for <LV,T> 679 pairs, since we always have decomposed data. */ 680 code = PyUnicode_READ(kind, data, i); 681 if (LBase <= code && code < (LBase+LCount) && 682 i + 1 < len && 683 VBase <= PyUnicode_READ(kind, data, i+1) && 684 PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) { 685 int LIndex, VIndex; 686 LIndex = code - LBase; 687 VIndex = PyUnicode_READ(kind, data, i+1) - VBase; 688 code = SBase + (LIndex*VCount+VIndex)*TCount; 689 i+=2; 690 if (i < len && 691 TBase <= PyUnicode_READ(kind, data, i) && 692 PyUnicode_READ(kind, data, i) <= (TBase+TCount)) { 693 code += PyUnicode_READ(kind, data, i)-TBase; 694 i++; 695 } 696 output[o++] = code; 697 continue; 698 } 699 700 /* code is still input[i] here */ 701 f = find_nfc_index(self, nfc_first, code); 702 if (f == -1) { 703 output[o++] = code; 704 i++; 705 continue; 706 } 707 /* Find next unblocked character. */ 708 i1 = i+1; 709 comb = 0; 710 /* output base character for now; might be updated later. */ 711 output[o] = PyUnicode_READ(kind, data, i); 712 while (i1 < len) { 713 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1); 714 int comb1 = _getrecord_ex(code1)->combining; 715 if (comb) { 716 if (comb1 == 0) 717 break; 718 if (comb >= comb1) { 719 /* Character is blocked. 
*/ 720 i1++; 721 continue; 722 } 723 } 724 l = find_nfc_index(self, nfc_last, code1); 725 /* i1 cannot be combined with i. If i1 726 is a starter, we don't need to look further. 727 Otherwise, record the combining class. */ 728 if (l == -1) { 729 not_combinable: 730 if (comb1 == 0) 731 break; 732 comb = comb1; 733 i1++; 734 continue; 735 } 736 index = f*TOTAL_LAST + l; 737 index1 = comp_index[index >> COMP_SHIFT]; 738 code = comp_data[(index1<<COMP_SHIFT)+ 739 (index&((1<<COMP_SHIFT)-1))]; 740 if (code == 0) 741 goto not_combinable; 742 743 /* Replace the original character. */ 744 output[o] = code; 745 /* Mark the second character unused. */ 746 assert(cskipped < 20); 747 skipped[cskipped++] = i1; 748 i1++; 749 f = find_nfc_index(self, nfc_first, output[o]); 750 if (f == -1) 751 break; 752 } 753 /* Output character was already written. 754 Just advance the indices. */ 755 o++; i++; 756 } 757 if (o == len) { 758 /* No changes. Return original string. */ 759 PyMem_Free(output); 760 return result; 761 } 762 Py_DECREF(result); 763 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, 764 output, o); 765 PyMem_Free(output); 766 return result; 767 } 768 769 /* Return 1 if the input is certainly normalized, 0 if it might not be. */ 770 static int 771 is_normalized(PyObject *self, PyObject *input, int nfc, int k) 772 { 773 Py_ssize_t i, len; 774 int kind; 775 void *data; 776 unsigned char prev_combining = 0, quickcheck_mask; 777 778 /* An older version of the database is requested, quickchecks must be 779 disabled. */ 780 if (self && UCD_Check(self)) 781 return 0; 782 783 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No, 784 as described in http://unicode.org/reports/tr15/#Annex8. */ 785 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 
2 : 0));

    i = 0;
    kind = PyUnicode_KIND(input);
    data = PyUnicode_DATA(input);
    len = PyUnicode_GET_LENGTH(input);
    while (i < len) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
        const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
        unsigned char combining = record->combining;
        unsigned char quickcheck = record->normalization_quick_check;

        if (quickcheck & quickcheck_mask)
            return 0; /* this string might need normalization */
        if (combining && prev_combining > combining)
            return 0; /* non-canonical sort order, not normalized */
        prev_combining = combining;
    }
    return 1; /* certainly normalized */
}

/*[clinic input]
unicodedata.UCD.normalize

    self: self
    form: str
    unistr as input: object(subclass_of='&PyUnicode_Type')
    /

Return the normal form 'form' for the Unicode string unistr.

Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
                               PyObject *input)
/*[clinic end generated code: output=62d1f8870027efdc input=cd092e631cf11883]*/
{
    if (PyUnicode_READY(input) == -1)
        return NULL;

    if (PyUnicode_GET_LENGTH(input) == 0) {
        /* Special case empty input strings, since resizing
           them later would cause internal errors. */
        Py_INCREF(input);
        return input;
    }

    /* dispatch on the requested form; each arm first tries the cheap
       quick-check to return the input unchanged */
    if (strcmp(form, "NFC") == 0) {
        if (is_normalized(self, input, 1, 0)) {
            Py_INCREF(input);
            return input;
        }
        return nfc_nfkc(self, input, 0);
    }
    if (strcmp(form, "NFKC") == 0) {
        if (is_normalized(self, input, 1, 1)) {
            Py_INCREF(input);
            return input;
        }
        return nfc_nfkc(self, input, 1);
    }
    if (strcmp(form, "NFD") == 0) {
        if (is_normalized(self, input, 0, 0)) {
            Py_INCREF(input);
            return input;
        }
        return nfd_nfkd(self, input, 0);
    }
    if (strcmp(form, "NFKD") == 0) {
        if (is_normalized(self, input, 0, 1)) {
            Py_INCREF(input);
            return input;
        }
        return nfd_nfkd(self, input, 1);
    }
    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    return NULL;
}

/* -------------------------------------------------------------------- */
/* unicode character name tables */

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"

/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */

/* Case-insensitive 24-bit rolling hash over `s`, matching the hash used
   by makeunicodedata.py to build the name lookup table. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}

/* Romanized jamo names, indexed by [jamo index][0=L choseong,
   1=V jungseong, 2=T jongseong]; used for HANGUL SYLLABLE names. */
static const char * const hangul_syllables[][3] = {
    { "G",  "A",   "" },
    { "GG", "AE",  "G" },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N", },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D" },
    { "BB", "O",   "L" },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",
"LH" },
    { "T",  "WI",  "M" },
    { "P",  "YU",  "B" },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S" },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J" },
    { 0,    0,     "C" },
    { 0,    0,     "K" },
    { 0,    0,     "T" },
    { 0,    0,     "P" },
    { 0,    0,     "H" }
};

/* These ranges need to match makeunicodedata.py:cjk_ranges. */
static int
is_unified_ideograph(Py_UCS4 code)
{
    return
        (0x3400 <= code && code <= 0x4DB5)   || /* CJK Ideograph Extension A */
        (0x4E00 <= code && code <= 0x9FD5)   || /* CJK Ideograph */
        (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
        (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
        (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
        (0x2B820 <= code && code <= 0x2CEA1);   /* CJK Ideograph Extension E */
}

/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences */
#define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
#define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
                          (cp < named_sequences_end))

/* Write the name of `code` into buffer (capacity buflen); returns 1 on
   success, 0 if the code point has no name or the buffer is too small. */
static int
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
           int with_alias_and_seq)
{
    /* Find the name associated with the given code point.
     * If with_alias_and_seq is 1, check for names in the Private Use Area 15
     * that we are using for aliases and named sequences. */
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code >= 0x110000)
        return 0;

    /* XXX should we just skip all the code points in the PUAs here? */
    if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
        return 0;

    if (self && UCD_Check(self)) {
        /* in 3.2.0 there are no aliases and named sequences */
        const change_record *old;
        if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
            return 0;
        old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned */
            return 0;
        }
    }

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable: name is derived arithmetically from the
           jamo indices, not from the phrasebook. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if (is_unified_ideograph(code)) {
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
        return 1;
    }

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                               (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    /* decode the phrasebook entry: a sequence of one- or two-byte
       lexicon word indices, joined by spaces */
    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            if (i > buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon. the last character in the
           word has bit 7 set.
the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}

/* Return 1 if `code` has exactly the (case-insensitive) name `name`. */
static int
_cmpname(PyObject *self, int code, const char* name, int namelen)
{
    /* check if code corresponds to the given name */
    int i;
    char buffer[NAME_MAXLEN+1];
    if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
        return 0;
    for (i = 0; i < namelen; i++) {
        if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}

/* Greedily match the longest jamo name in hangul_syllables[*][column]
   at the start of str; store its length in *len (0 if none matched)
   and its jamo index in *pos. */
static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
{
    int i, len1;
    *len = -1;
    for (i = 0; i < count; i++) {
        const char *s = hangul_syllables[i][column];
        len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
        if (len1 <= *len)
            continue;
        if (strncmp(str, s, len1) == 0) {
            *len = len1;
            *pos = i;
        }
    }
    if (*len == -1) {
        *len = 0;
    }
}

/* Resolve a hash hit: reject named sequences when not allowed, and map
   alias PUA code points to their real code point. */
static int
_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
{
    /* check if named sequences are allowed */
    if (!with_named_seq && IS_NAMED_SEQ(cp))
        return 0;
    /* if the code point is in the PUA range that we use for aliases,
     * convert it to obtain the right code point */
    if (IS_ALIAS(cp))
        *code = name_aliases[cp-aliases_start];
    else
        *code = cp;
    return 1;
}

static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
         int with_named_seq)
{
    /* Return the code point associated with the given name.
     * Named aliases are resolved too (unless self != NULL (i.e. we are using
     * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
     * using for the named sequence, and the caller must then convert it. */
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int len, L = -1, V = -1, T = -1;
        const char *pos = name + 16;
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hexdigits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes. see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;
    if (_cmpname(self, v, name, namelen))
        return _check_alias_and_seq(v, code, with_named_seq);
    /* open addressing: probe with a perturbed increment until an empty
       slot (miss) or a matching name (hit) is found */
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(self, v, name, namelen))
            return _check_alias_and_seq(v, code, with_named_seq);
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}

/* ucnhash C API capsule: lets the compiler/codecs resolve \N{...} */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};

/* -------------------------------------------------------------------- */
/* Python bindings */

/*[clinic input]
unicodedata.UCD.name

    self: self
    chr: int(accept={str})
    default: object=NULL
    /

Returns the name assigned to the character chr as a string.

If no name is defined, default is returned, or, if not given,
ValueError is raised.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
{
    char name[NAME_MAXLEN+1];
    Py_UCS4 c = (Py_UCS4)chr;

    if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
        if (default_value == NULL) {
            PyErr_SetString(PyExc_ValueError, "no such name");
            return NULL;
        }
        else {
            Py_INCREF(default_value);
            return default_value;
        }
    }

    return PyUnicode_FromString(name);
}

/*[clinic input]
unicodedata.UCD.lookup

    self: self
    name: str(accept={str, robuffer}, zeroes=True)
    /

Look up character by name.
1226 1227 If a character with the given name is found, return the 1228 corresponding character. If not found, KeyError is raised. 1229 [clinic start generated code]*/ 1230 1231 static PyObject * 1232 unicodedata_UCD_lookup_impl(PyObject *self, const char *name, 1233 Py_ssize_clean_t name_length) 1234 /*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/ 1235 { 1236 Py_UCS4 code; 1237 unsigned int index; 1238 if (name_length > NAME_MAXLEN) { 1239 PyErr_SetString(PyExc_KeyError, "name too long"); 1240 return NULL; 1241 } 1242 1243 if (!_getcode(self, name, (int)name_length, &code, 1)) { 1244 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name); 1245 return NULL; 1246 } 1247 /* check if code is in the PUA range that we use for named sequences 1248 and convert it */ 1249 if (IS_NAMED_SEQ(code)) { 1250 index = code-named_sequences_start; 1251 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, 1252 named_sequences[index].seq, 1253 named_sequences[index].seqlen); 1254 } 1255 return PyUnicode_FromOrdinal(code); 1256 } 1257 1258 /* XXX Add doc strings. */ 1259 1260 static PyMethodDef unicodedata_functions[] = { 1261 UNICODEDATA_UCD_DECIMAL_METHODDEF 1262 UNICODEDATA_UCD_DIGIT_METHODDEF 1263 UNICODEDATA_UCD_NUMERIC_METHODDEF 1264 UNICODEDATA_UCD_CATEGORY_METHODDEF 1265 UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF 1266 UNICODEDATA_UCD_COMBINING_METHODDEF 1267 UNICODEDATA_UCD_MIRRORED_METHODDEF 1268 UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF 1269 UNICODEDATA_UCD_DECOMPOSITION_METHODDEF 1270 UNICODEDATA_UCD_NAME_METHODDEF 1271 UNICODEDATA_UCD_LOOKUP_METHODDEF 1272 UNICODEDATA_UCD_NORMALIZE_METHODDEF 1273 {NULL, NULL} /* sentinel */ 1274 }; 1275 1276 static PyTypeObject UCD_Type = { 1277 /* The ob_type field must be initialized in the module init function 1278 * to be portable to Windows without using C++. 
*/ 1279 PyVarObject_HEAD_INIT(NULL, 0) 1280 "unicodedata.UCD", /*tp_name*/ 1281 sizeof(PreviousDBVersion), /*tp_basicsize*/ 1282 0, /*tp_itemsize*/ 1283 /* methods */ 1284 (destructor)PyObject_Del, /*tp_dealloc*/ 1285 0, /*tp_print*/ 1286 0, /*tp_getattr*/ 1287 0, /*tp_setattr*/ 1288 0, /*tp_reserved*/ 1289 0, /*tp_repr*/ 1290 0, /*tp_as_number*/ 1291 0, /*tp_as_sequence*/ 1292 0, /*tp_as_mapping*/ 1293 0, /*tp_hash*/ 1294 0, /*tp_call*/ 1295 0, /*tp_str*/ 1296 PyObject_GenericGetAttr,/*tp_getattro*/ 1297 0, /*tp_setattro*/ 1298 0, /*tp_as_buffer*/ 1299 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 1300 0, /*tp_doc*/ 1301 0, /*tp_traverse*/ 1302 0, /*tp_clear*/ 1303 0, /*tp_richcompare*/ 1304 0, /*tp_weaklistoffset*/ 1305 0, /*tp_iter*/ 1306 0, /*tp_iternext*/ 1307 unicodedata_functions, /*tp_methods*/ 1308 DB_members, /*tp_members*/ 1309 0, /*tp_getset*/ 1310 0, /*tp_base*/ 1311 0, /*tp_dict*/ 1312 0, /*tp_descr_get*/ 1313 0, /*tp_descr_set*/ 1314 0, /*tp_dictoffset*/ 1315 0, /*tp_init*/ 1316 0, /*tp_alloc*/ 1317 0, /*tp_new*/ 1318 0, /*tp_free*/ 1319 0, /*tp_is_gc*/ 1320 }; 1321 1322 PyDoc_STRVAR(unicodedata_docstring, 1323 "This module provides access to the Unicode Character Database which\n\ 1324 defines character properties for all Unicode characters. 
The data in\n\ 1325 this database is based on the UnicodeData.txt file version\n\ 1326 " UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\ 1327 \n\ 1328 The module uses the same names and symbols as defined by the\n\ 1329 UnicodeData File Format " UNIDATA_VERSION "."); 1330 1331 static struct PyModuleDef unicodedatamodule = { 1332 PyModuleDef_HEAD_INIT, 1333 "unicodedata", 1334 unicodedata_docstring, 1335 -1, 1336 unicodedata_functions, 1337 NULL, 1338 NULL, 1339 NULL, 1340 NULL 1341 }; 1342 1343 PyMODINIT_FUNC 1344 PyInit_unicodedata(void) 1345 { 1346 PyObject *m, *v; 1347 1348 Py_TYPE(&UCD_Type) = &PyType_Type; 1349 1350 m = PyModule_Create(&unicodedatamodule); 1351 if (!m) 1352 return NULL; 1353 1354 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION); 1355 Py_INCREF(&UCD_Type); 1356 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type); 1357 1358 /* Previous versions */ 1359 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0); 1360 if (v != NULL) 1361 PyModule_AddObject(m, "ucd_3_2_0", v); 1362 1363 /* Export C API */ 1364 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL); 1365 if (v != NULL) 1366 PyModule_AddObject(m, "ucnhash_CAPI", v); 1367 return m; 1368 } 1369 1370 /* 1371 Local variables: 1372 c-basic-offset: 4 1373 indent-tabs-mode: nil 1374 End: 1375 */ 1376