1 /* ------------------------------------------------------------------------ 2 3 unicodedata -- Provides access to the Unicode 5.2 data base. 4 5 Data was extracted from the Unicode 5.2 UnicodeData.txt file. 6 7 Written by Marc-Andre Lemburg (mal (at) lemburg.com). 8 Modified for Python 2.0 by Fredrik Lundh (fredrik (at) pythonware.com) 9 Modified by Martin v. Lwis (martin (at) v.loewis.de) 10 11 Copyright (c) Corporation for National Research Initiatives. 12 13 ------------------------------------------------------------------------ */ 14 15 #include "Python.h" 16 #include "ucnhash.h" 17 #include "structmember.h" 18 19 /* character properties */ 20 21 typedef struct { 22 const unsigned char category; /* index into 23 _PyUnicode_CategoryNames */ 24 const unsigned char combining; /* combining class value 0 - 255 */ 25 const unsigned char bidirectional; /* index into 26 _PyUnicode_BidirectionalNames */ 27 const unsigned char mirrored; /* true if mirrored in bidir mode */ 28 const unsigned char east_asian_width; /* index into 29 _PyUnicode_EastAsianWidth */ 30 const unsigned char normalization_quick_check; /* see is_normalized() */ 31 } _PyUnicode_DatabaseRecord; 32 33 typedef struct change_record { 34 /* sequence of fields should be the same as in merge_old_version */ 35 const unsigned char bidir_changed; 36 const unsigned char category_changed; 37 const unsigned char decimal_changed; 38 const unsigned char mirrored_changed; 39 const double numeric_changed; 40 } change_record; 41 42 /* data file generated by Tools/unicode/makeunicodedata.py */ 43 #include "unicodedata_db.h" 44 45 static const _PyUnicode_DatabaseRecord* 46 _getrecord_ex(Py_UCS4 code) 47 { 48 int index; 49 if (code >= 0x110000) 50 index = 0; 51 else { 52 index = index1[(code>>SHIFT)]; 53 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; 54 } 55 56 return &_PyUnicode_Database_Records[index]; 57 } 58 59 /* ------------- Previous-version API ------------------------------------- */ 60 typedef struct previous_version { 61 PyObject_HEAD 62 const char *name; 63 const change_record* (*getrecord)(Py_UCS4); 64 Py_UCS4 (*normalization)(Py_UCS4); 65 } PreviousDBVersion; 66 67 #define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v)) 68 69 static PyMemberDef DB_members[] = { 70 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY}, 71 {NULL} 72 }; 73 74 /* forward declaration */ 75 static PyTypeObject UCD_Type; 76 77 static PyObject* 78 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4), 79 Py_UCS4 (*normalization)(Py_UCS4)) 80 { 81 PreviousDBVersion *self; 82 self = PyObject_New(PreviousDBVersion, &UCD_Type); 83 if (self == NULL) 84 return NULL; 85 self->name = name; 86 self->getrecord = getrecord; 87 self->normalization = normalization; 88 return (PyObject*)self; 89 } 90 91 92 static Py_UCS4 getuchar(PyUnicodeObject *obj) 93 { 94 Py_UNICODE *v = PyUnicode_AS_UNICODE(obj); 95 96 if (PyUnicode_GET_SIZE(obj) == 1) 97 return *v; 98 #ifndef Py_UNICODE_WIDE 99 else if ((PyUnicode_GET_SIZE(obj) == 2) && 100 (0xD800 <= v[0] && v[0] <= 0xDBFF) && 101 (0xDC00 <= v[1] && v[1] <= 0xDFFF)) 102 return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000; 103 #endif 104 PyErr_SetString(PyExc_TypeError, 105 "need a single Unicode character as parameter"); 106 return (Py_UCS4)-1; 107 } 108 109 /* --- Module API --------------------------------------------------------- */ 110 111 PyDoc_STRVAR(unicodedata_decimal__doc__, 112 "decimal(unichr[, default])\n\ 113 \n\ 114 Returns the decimal value assigned to the Unicode character unichr\n\ 115 as integer. If no such value is defined, default is returned, or, if\n\ 116 not given, ValueError is raised."); 117 118 static PyObject * 119 unicodedata_decimal(PyObject *self, PyObject *args) 120 { 121 PyUnicodeObject *v; 122 PyObject *defobj = NULL; 123 int have_old = 0; 124 long rc; 125 Py_UCS4 c; 126 127 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj)) 128 return NULL; 129 c = getuchar(v); 130 if (c == (Py_UCS4)-1) 131 return NULL; 132 133 if (self) { 134 const change_record *old = get_old_record(self, c); 135 if (old->category_changed == 0) { 136 /* unassigned */ 137 have_old = 1; 138 rc = -1; 139 } 140 else if (old->decimal_changed != 0xFF) { 141 have_old = 1; 142 rc = old->decimal_changed; 143 } 144 } 145 146 if (!have_old) 147 rc = Py_UNICODE_TODECIMAL(c); 148 if (rc < 0) { 149 if (defobj == NULL) { 150 PyErr_SetString(PyExc_ValueError, 151 "not a decimal"); 152 return NULL; 153 } 154 else { 155 Py_INCREF(defobj); 156 return defobj; 157 } 158 } 159 return PyInt_FromLong(rc); 160 } 161 162 PyDoc_STRVAR(unicodedata_digit__doc__, 163 "digit(unichr[, default])\n\ 164 \n\ 165 Returns the digit value assigned to the Unicode character unichr as\n\ 166 integer. If no such value is defined, default is returned, or, if\n\ 167 not given, ValueError is raised."); 168 169 static PyObject * 170 unicodedata_digit(PyObject *self, PyObject *args) 171 { 172 PyUnicodeObject *v; 173 PyObject *defobj = NULL; 174 long rc; 175 Py_UCS4 c; 176 177 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj)) 178 return NULL; 179 c = getuchar(v); 180 if (c == (Py_UCS4)-1) 181 return NULL; 182 rc = Py_UNICODE_TODIGIT(c); 183 if (rc < 0) { 184 if (defobj == NULL) { 185 PyErr_SetString(PyExc_ValueError, "not a digit"); 186 return NULL; 187 } 188 else { 189 Py_INCREF(defobj); 190 return defobj; 191 } 192 } 193 return PyInt_FromLong(rc); 194 } 195 196 PyDoc_STRVAR(unicodedata_numeric__doc__, 197 "numeric(unichr[, default])\n\ 198 \n\ 199 Returns the numeric value assigned to the Unicode character unichr\n\ 200 as float. If no such value is defined, default is returned, or, if\n\ 201 not given, ValueError is raised."); 202 203 static PyObject * 204 unicodedata_numeric(PyObject *self, PyObject *args) 205 { 206 PyUnicodeObject *v; 207 PyObject *defobj = NULL; 208 int have_old = 0; 209 double rc; 210 Py_UCS4 c; 211 212 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj)) 213 return NULL; 214 c = getuchar(v); 215 if (c == (Py_UCS4)-1) 216 return NULL; 217 218 if (self) { 219 const change_record *old = get_old_record(self, c); 220 if (old->category_changed == 0) { 221 /* unassigned */ 222 have_old = 1; 223 rc = -1.0; 224 } 225 else if (old->decimal_changed != 0xFF) { 226 have_old = 1; 227 rc = old->decimal_changed; 228 } 229 } 230 231 if (!have_old) 232 rc = Py_UNICODE_TONUMERIC(c); 233 if (rc == -1.0) { 234 if (defobj == NULL) { 235 PyErr_SetString(PyExc_ValueError, "not a numeric character"); 236 return NULL; 237 } 238 else { 239 Py_INCREF(defobj); 240 return defobj; 241 } 242 } 243 return PyFloat_FromDouble(rc); 244 } 245 246 PyDoc_STRVAR(unicodedata_category__doc__, 247 "category(unichr)\n\ 248 \n\ 249 Returns the general category assigned to the Unicode character\n\ 250 unichr as string."); 251 252 static PyObject * 253 unicodedata_category(PyObject *self, PyObject *args) 254 { 255 PyUnicodeObject *v; 256 int index; 257 Py_UCS4 c; 258 259 if (!PyArg_ParseTuple(args, "O!:category", 260 &PyUnicode_Type, &v)) 261 return NULL; 262 c = getuchar(v); 263 if (c == (Py_UCS4)-1) 264 return NULL; 265 index = (int) _getrecord_ex(c)->category; 266 if (self) { 267 const change_record *old = get_old_record(self, c); 268 if (old->category_changed != 0xFF) 269 index = old->category_changed; 270 } 271 return PyString_FromString(_PyUnicode_CategoryNames[index]); 272 } 273 274 PyDoc_STRVAR(unicodedata_bidirectional__doc__, 275 "bidirectional(unichr)\n\ 276 \n\ 277 Returns the bidirectional category assigned to the Unicode character\n\ 278 unichr as string. If no such value is defined, an empty string is\n\ 279 returned."); 280 281 static PyObject * 282 unicodedata_bidirectional(PyObject *self, PyObject *args) 283 { 284 PyUnicodeObject *v; 285 int index; 286 Py_UCS4 c; 287 288 if (!PyArg_ParseTuple(args, "O!:bidirectional", 289 &PyUnicode_Type, &v)) 290 return NULL; 291 c = getuchar(v); 292 if (c == (Py_UCS4)-1) 293 return NULL; 294 index = (int) _getrecord_ex(c)->bidirectional; 295 if (self) { 296 const change_record *old = get_old_record(self, c); 297 if (old->category_changed == 0) 298 index = 0; /* unassigned */ 299 else if (old->bidir_changed != 0xFF) 300 index = old->bidir_changed; 301 } 302 return PyString_FromString(_PyUnicode_BidirectionalNames[index]); 303 } 304 305 PyDoc_STRVAR(unicodedata_combining__doc__, 306 "combining(unichr)\n\ 307 \n\ 308 Returns the canonical combining class assigned to the Unicode\n\ 309 character unichr as integer. Returns 0 if no combining class is\n\ 310 defined."); 311 312 static PyObject * 313 unicodedata_combining(PyObject *self, PyObject *args) 314 { 315 PyUnicodeObject *v; 316 int index; 317 Py_UCS4 c; 318 319 if (!PyArg_ParseTuple(args, "O!:combining", 320 &PyUnicode_Type, &v)) 321 return NULL; 322 c = getuchar(v); 323 if (c == (Py_UCS4)-1) 324 return NULL; 325 index = (int) _getrecord_ex(c)->combining; 326 if (self) { 327 const change_record *old = get_old_record(self, c); 328 if (old->category_changed == 0) 329 index = 0; /* unassigned */ 330 } 331 return PyInt_FromLong(index); 332 } 333 334 PyDoc_STRVAR(unicodedata_mirrored__doc__, 335 "mirrored(unichr)\n\ 336 \n\ 337 Returns the mirrored property assigned to the Unicode character\n\ 338 unichr as integer. Returns 1 if the character has been identified as\n\ 339 a \"mirrored\" character in bidirectional text, 0 otherwise."); 340 341 static PyObject * 342 unicodedata_mirrored(PyObject *self, PyObject *args) 343 { 344 PyUnicodeObject *v; 345 int index; 346 Py_UCS4 c; 347 348 if (!PyArg_ParseTuple(args, "O!:mirrored", 349 &PyUnicode_Type, &v)) 350 return NULL; 351 c = getuchar(v); 352 if (c == (Py_UCS4)-1) 353 return NULL; 354 index = (int) _getrecord_ex(c)->mirrored; 355 if (self) { 356 const change_record *old = get_old_record(self, c); 357 if (old->category_changed == 0) 358 index = 0; /* unassigned */ 359 else if (old->mirrored_changed != 0xFF) 360 index = old->mirrored_changed; 361 } 362 return PyInt_FromLong(index); 363 } 364 365 PyDoc_STRVAR(unicodedata_east_asian_width__doc__, 366 "east_asian_width(unichr)\n\ 367 \n\ 368 Returns the east asian width assigned to the Unicode character\n\ 369 unichr as string."); 370 371 static PyObject * 372 unicodedata_east_asian_width(PyObject *self, PyObject *args) 373 { 374 PyUnicodeObject *v; 375 int index; 376 Py_UCS4 c; 377 378 if (!PyArg_ParseTuple(args, "O!:east_asian_width", 379 &PyUnicode_Type, &v)) 380 return NULL; 381 c = getuchar(v); 382 if (c == (Py_UCS4)-1) 383 return NULL; 384 index = (int) _getrecord_ex(c)->east_asian_width; 385 if (self) { 386 const change_record *old = get_old_record(self, c); 387 if (old->category_changed == 0) 388 index = 0; /* unassigned */ 389 } 390 return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]); 391 } 392 393 PyDoc_STRVAR(unicodedata_decomposition__doc__, 394 "decomposition(unichr)\n\ 395 \n\ 396 Returns the character decomposition mapping assigned to the Unicode\n\ 397 character unichr as string. An empty string is returned in case no\n\ 398 such mapping is defined."); 399 400 static PyObject * 401 unicodedata_decomposition(PyObject *self, PyObject *args) 402 { 403 PyUnicodeObject *v; 404 char decomp[256]; 405 int code, index, count, i; 406 unsigned int prefix_index; 407 Py_UCS4 c; 408 409 if (!PyArg_ParseTuple(args, "O!:decomposition", 410 &PyUnicode_Type, &v)) 411 return NULL; 412 c = getuchar(v); 413 if (c == (Py_UCS4)-1) 414 return NULL; 415 416 code = (int)c; 417 418 if (self) { 419 const change_record *old = get_old_record(self, c); 420 if (old->category_changed == 0) 421 return PyString_FromString(""); /* unassigned */ 422 } 423 424 if (code < 0 || code >= 0x110000) 425 index = 0; 426 else { 427 index = decomp_index1[(code>>DECOMP_SHIFT)]; 428 index = decomp_index2[(index<<DECOMP_SHIFT)+ 429 (code&((1<<DECOMP_SHIFT)-1))]; 430 } 431 432 /* high byte is number of hex bytes (usually one or two), low byte 433 is prefix code (from*/ 434 count = decomp_data[index] >> 8; 435 436 /* XXX: could allocate the PyString up front instead 437 (strlen(prefix) + 5 * count + 1 bytes) */ 438 439 /* Based on how index is calculated above and decomp_data is generated 440 from Tools/unicode/makeunicodedata.py, it should not be possible 441 to overflow decomp_prefix. */ 442 prefix_index = decomp_data[index] & 255; 443 assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix))); 444 445 /* copy prefix */ 446 i = strlen(decomp_prefix[prefix_index]); 447 memcpy(decomp, decomp_prefix[prefix_index], i); 448 449 while (count-- > 0) { 450 if (i) 451 decomp[i++] = ' '; 452 assert((size_t)i < sizeof(decomp)); 453 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X", 454 decomp_data[++index]); 455 i += strlen(decomp + i); 456 } 457 458 decomp[i] = '\0'; 459 460 return PyString_FromString(decomp); 461 } 462 463 static void 464 get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count) 465 { 466 if (code >= 0x110000) { 467 *index = 0; 468 } else if (self && get_old_record(self, code)->category_changed==0) { 469 /* unassigned in old version */ 470 *index = 0; 471 } 472 else { 473 *index = decomp_index1[(code>>DECOMP_SHIFT)]; 474 *index = decomp_index2[(*index<<DECOMP_SHIFT)+ 475 (code&((1<<DECOMP_SHIFT)-1))]; 476 } 477 478 /* high byte is number of hex bytes (usually one or two), low byte 479 is prefix code (from*/ 480 *count = decomp_data[*index] >> 8; 481 *prefix = decomp_data[*index] & 255; 482 483 (*index)++; 484 } 485 486 #define SBase 0xAC00 487 #define LBase 0x1100 488 #define VBase 0x1161 489 #define TBase 0x11A7 490 #define LCount 19 491 #define VCount 21 492 #define TCount 28 493 #define NCount (VCount*TCount) 494 #define SCount (LCount*NCount) 495 496 static PyObject* 497 nfd_nfkd(PyObject *self, PyObject *input, int k) 498 { 499 PyObject *result; 500 Py_UNICODE *i, *end, *o; 501 /* Longest decomposition in Unicode 3.2: U+FDFA */ 502 Py_UNICODE stack[20]; 503 Py_ssize_t space, isize; 504 int index, prefix, count, stackptr; 505 unsigned char prev, cur; 506 507 stackptr = 0; 508 isize = PyUnicode_GET_SIZE(input); 509 /* Overallocate atmost 10 characters. */ 510 space = (isize > 10 ? 10 : isize) + isize; 511 result = PyUnicode_FromUnicode(NULL, space); 512 if (!result) 513 return NULL; 514 i = PyUnicode_AS_UNICODE(input); 515 end = i + isize; 516 o = PyUnicode_AS_UNICODE(result); 517 518 while (i < end) { 519 stack[stackptr++] = *i++; 520 while(stackptr) { 521 Py_UNICODE code = stack[--stackptr]; 522 /* Hangul Decomposition adds three characters in 523 a single step, so we need atleast that much room. */ 524 if (space < 3) { 525 Py_ssize_t newsize = PyString_GET_SIZE(result) + 10; 526 space += 10; 527 if (PyUnicode_Resize(&result, newsize) == -1) 528 return NULL; 529 o = PyUnicode_AS_UNICODE(result) + newsize - space; 530 } 531 /* Hangul Decomposition. */ 532 if (SBase <= code && code < (SBase+SCount)) { 533 int SIndex = code - SBase; 534 int L = LBase + SIndex / NCount; 535 int V = VBase + (SIndex % NCount) / TCount; 536 int T = TBase + SIndex % TCount; 537 *o++ = L; 538 *o++ = V; 539 space -= 2; 540 if (T != TBase) { 541 *o++ = T; 542 space --; 543 } 544 continue; 545 } 546 /* normalization changes */ 547 if (self) { 548 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code); 549 if (value != 0) { 550 stack[stackptr++] = value; 551 continue; 552 } 553 } 554 555 /* Other decompositions. */ 556 get_decomp_record(self, code, &index, &prefix, &count); 557 558 /* Copy character if it is not decomposable, or has a 559 compatibility decomposition, but we do NFD. */ 560 if (!count || (prefix && !k)) { 561 *o++ = code; 562 space--; 563 continue; 564 } 565 /* Copy decomposition onto the stack, in reverse 566 order. */ 567 while(count) { 568 code = decomp_data[index + (--count)]; 569 stack[stackptr++] = code; 570 } 571 } 572 } 573 574 /* Drop overallocation. Cannot fail. */ 575 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space); 576 577 /* Sort canonically. */ 578 i = PyUnicode_AS_UNICODE(result); 579 prev = _getrecord_ex(*i)->combining; 580 end = i + PyUnicode_GET_SIZE(result); 581 for (i++; i < end; i++) { 582 cur = _getrecord_ex(*i)->combining; 583 if (prev == 0 || cur == 0 || prev <= cur) { 584 prev = cur; 585 continue; 586 } 587 /* Non-canonical order. Need to switch *i with previous. */ 588 o = i - 1; 589 while (1) { 590 Py_UNICODE tmp = o[1]; 591 o[1] = o[0]; 592 o[0] = tmp; 593 o--; 594 if (o < PyUnicode_AS_UNICODE(result)) 595 break; 596 prev = _getrecord_ex(*o)->combining; 597 if (prev == 0 || prev <= cur) 598 break; 599 } 600 prev = _getrecord_ex(*i)->combining; 601 } 602 return result; 603 } 604 605 static int 606 find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code) 607 { 608 int index; 609 for (index = 0; nfc[index].start; index++) { 610 int start = nfc[index].start; 611 if (code < start) 612 return -1; 613 if (code <= start + nfc[index].count) { 614 int delta = code - start; 615 return nfc[index].index + delta; 616 } 617 } 618 return -1; 619 } 620 621 static PyObject* 622 nfc_nfkc(PyObject *self, PyObject *input, int k) 623 { 624 PyObject *result; 625 Py_UNICODE *i, *i1, *o, *end; 626 int f,l,index,index1,comb; 627 Py_UNICODE code; 628 Py_UNICODE *skipped[20]; 629 int cskipped = 0; 630 631 result = nfd_nfkd(self, input, k); 632 if (!result) 633 return NULL; 634 635 /* We are going to modify result in-place. 636 If nfd_nfkd is changed to sometimes return the input, 637 this code needs to be reviewed. */ 638 assert(result != input); 639 640 i = PyUnicode_AS_UNICODE(result); 641 end = i + PyUnicode_GET_SIZE(result); 642 o = PyUnicode_AS_UNICODE(result); 643 644 again: 645 while (i < end) { 646 for (index = 0; index < cskipped; index++) { 647 if (skipped[index] == i) { 648 /* *i character is skipped. 649 Remove from list. */ 650 skipped[index] = skipped[cskipped-1]; 651 cskipped--; 652 i++; 653 goto again; /* continue while */ 654 } 655 } 656 /* Hangul Composition. We don't need to check for <LV,T> 657 pairs, since we always have decomposed data. */ 658 if (LBase <= *i && *i < (LBase+LCount) && 659 i + 1 < end && 660 VBase <= i[1] && i[1] <= (VBase+VCount)) { 661 int LIndex, VIndex; 662 LIndex = i[0] - LBase; 663 VIndex = i[1] - VBase; 664 code = SBase + (LIndex*VCount+VIndex)*TCount; 665 i+=2; 666 if (i < end && 667 TBase <= *i && *i <= (TBase+TCount)) { 668 code += *i-TBase; 669 i++; 670 } 671 *o++ = code; 672 continue; 673 } 674 675 f = find_nfc_index(self, nfc_first, *i); 676 if (f == -1) { 677 *o++ = *i++; 678 continue; 679 } 680 /* Find next unblocked character. */ 681 i1 = i+1; 682 comb = 0; 683 while (i1 < end) { 684 int comb1 = _getrecord_ex(*i1)->combining; 685 if (comb) { 686 if (comb1 == 0) 687 break; 688 if (comb >= comb1) { 689 /* Character is blocked. */ 690 i1++; 691 continue; 692 } 693 } 694 l = find_nfc_index(self, nfc_last, *i1); 695 /* *i1 cannot be combined with *i. If *i1 696 is a starter, we don't need to look further. 697 Otherwise, record the combining class. */ 698 if (l == -1) { 699 not_combinable: 700 if (comb1 == 0) 701 break; 702 comb = comb1; 703 i1++; 704 continue; 705 } 706 index = f*TOTAL_LAST + l; 707 index1 = comp_index[index >> COMP_SHIFT]; 708 code = comp_data[(index1<<COMP_SHIFT)+ 709 (index&((1<<COMP_SHIFT)-1))]; 710 if (code == 0) 711 goto not_combinable; 712 713 /* Replace the original character. */ 714 *i = code; 715 /* Mark the second character unused. */ 716 assert(cskipped < 20); 717 skipped[cskipped++] = i1; 718 i1++; 719 f = find_nfc_index(self, nfc_first, *i); 720 if (f == -1) 721 break; 722 } 723 *o++ = *i++; 724 } 725 if (o != end) 726 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result)); 727 return result; 728 } 729 730 /* Return 1 if the input is certainly normalized, 0 if it might not be. */ 731 static int 732 is_normalized(PyObject *self, PyObject *input, int nfc, int k) 733 { 734 Py_UNICODE *i, *end; 735 unsigned char prev_combining = 0, quickcheck_mask; 736 737 /* An older version of the database is requested, quickchecks must be 738 disabled. */ 739 if (self != NULL) 740 return 0; 741 742 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No, 743 as described in http://unicode.org/reports/tr15/#Annex8. */ 744 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0)); 745 746 i = PyUnicode_AS_UNICODE(input); 747 end = i + PyUnicode_GET_SIZE(input); 748 while (i < end) { 749 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++); 750 unsigned char combining = record->combining; 751 unsigned char quickcheck = record->normalization_quick_check; 752 753 if (quickcheck & quickcheck_mask) 754 return 0; /* this string might need normalization */ 755 if (combining && prev_combining > combining) 756 return 0; /* non-canonical sort order, not normalized */ 757 prev_combining = combining; 758 } 759 return 1; /* certainly normalized */ 760 } 761 762 PyDoc_STRVAR(unicodedata_normalize__doc__, 763 "normalize(form, unistr)\n\ 764 \n\ 765 Return the normal form 'form' for the Unicode string unistr. Valid\n\ 766 values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'."); 767 768 static PyObject* 769 unicodedata_normalize(PyObject *self, PyObject *args) 770 { 771 char *form; 772 PyObject *input; 773 774 if(!PyArg_ParseTuple(args, "sO!:normalize", 775 &form, &PyUnicode_Type, &input)) 776 return NULL; 777 778 if (PyUnicode_GetSize(input) == 0) { 779 /* Special case empty input strings, since resizing 780 them later would cause internal errors. */ 781 Py_INCREF(input); 782 return input; 783 } 784 785 if (strcmp(form, "NFC") == 0) { 786 if (is_normalized(self, input, 1, 0)) { 787 Py_INCREF(input); 788 return input; 789 } 790 return nfc_nfkc(self, input, 0); 791 } 792 if (strcmp(form, "NFKC") == 0) { 793 if (is_normalized(self, input, 1, 1)) { 794 Py_INCREF(input); 795 return input; 796 } 797 return nfc_nfkc(self, input, 1); 798 } 799 if (strcmp(form, "NFD") == 0) { 800 if (is_normalized(self, input, 0, 0)) { 801 Py_INCREF(input); 802 return input; 803 } 804 return nfd_nfkd(self, input, 0); 805 } 806 if (strcmp(form, "NFKD") == 0) { 807 if (is_normalized(self, input, 0, 1)) { 808 Py_INCREF(input); 809 return input; 810 } 811 return nfd_nfkd(self, input, 1); 812 } 813 PyErr_SetString(PyExc_ValueError, "invalid normalization form"); 814 return NULL; 815 } 816 817 /* -------------------------------------------------------------------- */ 818 /* unicode character name tables */ 819 820 /* data file generated by Tools/unicode/makeunicodedata.py */ 821 #include "unicodename_db.h" 822 823 /* -------------------------------------------------------------------- */ 824 /* database code (cut and pasted from the unidb package) */ 825 826 static unsigned long 827 _gethash(const char *s, int len, int scale) 828 { 829 int i; 830 unsigned long h = 0; 831 unsigned long ix; 832 for (i = 0; i < len; i++) { 833 h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i])); 834 ix = h & 0xff000000; 835 if (ix) 836 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff; 837 } 838 return h; 839 } 840 841 static char *hangul_syllables[][3] = { 842 { "G", "A", "" }, 843 { "GG", "AE", "G" }, 844 { "N", "YA", "GG" }, 845 { "D", "YAE", "GS" }, 846 { "DD", "EO", "N", }, 847 { "R", "E", "NJ" }, 848 { "M", "YEO", "NH" }, 849 { "B", "YE", "D" }, 850 { "BB", "O", "L" }, 851 { "S", "WA", "LG" }, 852 { "SS", "WAE", "LM" }, 853 { "", "OE", "LB" }, 854 { "J", "YO", "LS" }, 855 { "JJ", "U", "LT" }, 856 { "C", "WEO", "LP" }, 857 { "K", "WE", "LH" }, 858 { "T", "WI", "M" }, 859 { "P", "YU", "B" }, 860 { "H", "EU", "BS" }, 861 { 0, "YI", "S" }, 862 { 0, "I", "SS" }, 863 { 0, 0, "NG" }, 864 { 0, 0, "J" }, 865 { 0, 0, "C" }, 866 { 0, 0, "K" }, 867 { 0, 0, "T" }, 868 { 0, 0, "P" }, 869 { 0, 0, "H" } 870 }; 871 872 static int 873 is_unified_ideograph(Py_UCS4 code) 874 { 875 return ( 876 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */ 877 (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph, Unicode 5.2 */ 878 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */ 879 (0x2A700 <= code && code <= 0x2B734)); /* CJK Ideograph Extension C */ 880 } 881 882 static int 883 _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen) 884 { 885 int offset; 886 int i; 887 int word; 888 unsigned char* w; 889 890 if (code >= 0x110000) 891 return 0; 892 893 if (self) { 894 const change_record *old = get_old_record(self, code); 895 if (old->category_changed == 0) { 896 /* unassigned */ 897 return 0; 898 } 899 } 900 901 if (SBase <= code && code < SBase+SCount) { 902 /* Hangul syllable. */ 903 int SIndex = code - SBase; 904 int L = SIndex / NCount; 905 int V = (SIndex % NCount) / TCount; 906 int T = SIndex % TCount; 907 908 if (buflen < 27) 909 /* Worst case: HANGUL SYLLABLE <10chars>. */ 910 return 0; 911 strcpy(buffer, "HANGUL SYLLABLE "); 912 buffer += 16; 913 strcpy(buffer, hangul_syllables[L][0]); 914 buffer += strlen(hangul_syllables[L][0]); 915 strcpy(buffer, hangul_syllables[V][1]); 916 buffer += strlen(hangul_syllables[V][1]); 917 strcpy(buffer, hangul_syllables[T][2]); 918 buffer += strlen(hangul_syllables[T][2]); 919 *buffer = '\0'; 920 return 1; 921 } 922 923 if (is_unified_ideograph(code)) { 924 if (buflen < 28) 925 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */ 926 return 0; 927 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code); 928 return 1; 929 } 930 931 /* get offset into phrasebook */ 932 offset = phrasebook_offset1[(code>>phrasebook_shift)]; 933 offset = phrasebook_offset2[(offset<<phrasebook_shift) + 934 (code&((1<<phrasebook_shift)-1))]; 935 if (!offset) 936 return 0; 937 938 i = 0; 939 940 for (;;) { 941 /* get word index */ 942 word = phrasebook[offset] - phrasebook_short; 943 if (word >= 0) { 944 word = (word << 8) + phrasebook[offset+1]; 945 offset += 2; 946 } else 947 word = phrasebook[offset++]; 948 if (i) { 949 if (i > buflen) 950 return 0; /* buffer overflow */ 951 buffer[i++] = ' '; 952 } 953 /* copy word string from lexicon. the last character in the 954 word has bit 7 set. the last word in a string ends with 955 0x80 */ 956 w = lexicon + lexicon_offset[word]; 957 while (*w < 128) { 958 if (i >= buflen) 959 return 0; /* buffer overflow */ 960 buffer[i++] = *w++; 961 } 962 if (i >= buflen) 963 return 0; /* buffer overflow */ 964 buffer[i++] = *w & 127; 965 if (*w == 128) 966 break; /* end of word */ 967 } 968 969 return 1; 970 } 971 972 static int 973 _cmpname(PyObject *self, int code, const char* name, int namelen) 974 { 975 /* check if code corresponds to the given name */ 976 int i; 977 char buffer[NAME_MAXLEN]; 978 if (!_getucname(self, code, buffer, sizeof(buffer))) 979 return 0; 980 for (i = 0; i < namelen; i++) { 981 if (toupper(Py_CHARMASK(name[i])) != buffer[i]) 982 return 0; 983 } 984 return buffer[namelen] == '\0'; 985 } 986 987 static void 988 find_syllable(const char *str, int *len, int *pos, int count, int column) 989 { 990 int i, len1; 991 *len = -1; 992 for (i = 0; i < count; i++) { 993 char *s = hangul_syllables[i][column]; 994 len1 = strlen(s); 995 if (len1 <= *len) 996 continue; 997 if (strncmp(str, s, len1) == 0) { 998 *len = len1; 999 *pos = i; 1000 } 1001 } 1002 if (*len == -1) { 1003 *len = 0; 1004 } 1005 } 1006 1007 static int 1008 _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code) 1009 { 1010 unsigned int h, v; 1011 unsigned int mask = code_size-1; 1012 unsigned int i, incr; 1013 1014 /* Check for hangul syllables. */ 1015 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) { 1016 int len, L = -1, V = -1, T = -1; 1017 const char *pos = name + 16; 1018 find_syllable(pos, &len, &L, LCount, 0); 1019 pos += len; 1020 find_syllable(pos, &len, &V, VCount, 1); 1021 pos += len; 1022 find_syllable(pos, &len, &T, TCount, 2); 1023 pos += len; 1024 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) { 1025 *code = SBase + (L*VCount+V)*TCount + T; 1026 return 1; 1027 } 1028 /* Otherwise, it's an illegal syllable name. */ 1029 return 0; 1030 } 1031 1032 /* Check for unified ideographs. */ 1033 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) { 1034 /* Four or five hexdigits must follow. */ 1035 v = 0; 1036 name += 22; 1037 namelen -= 22; 1038 if (namelen != 4 && namelen != 5) 1039 return 0; 1040 while (namelen--) { 1041 v *= 16; 1042 if (*name >= '0' && *name <= '9') 1043 v += *name - '0'; 1044 else if (*name >= 'A' && *name <= 'F') 1045 v += *name - 'A' + 10; 1046 else 1047 return 0; 1048 name++; 1049 } 1050 if (!is_unified_ideograph(v)) 1051 return 0; 1052 *code = v; 1053 return 1; 1054 } 1055 1056 /* the following is the same as python's dictionary lookup, with 1057 only minor changes. see the makeunicodedata script for more 1058 details */ 1059 1060 h = (unsigned int) _gethash(name, namelen, code_magic); 1061 i = (~h) & mask; 1062 v = code_hash[i]; 1063 if (!v) 1064 return 0; 1065 if (_cmpname(self, v, name, namelen)) { 1066 *code = v; 1067 return 1; 1068 } 1069 incr = (h ^ (h >> 3)) & mask; 1070 if (!incr) 1071 incr = mask; 1072 for (;;) { 1073 i = (i + incr) & mask; 1074 v = code_hash[i]; 1075 if (!v) 1076 return 0; 1077 if (_cmpname(self, v, name, namelen)) { 1078 *code = v; 1079 return 1; 1080 } 1081 incr = incr << 1; 1082 if (incr > mask) 1083 incr = incr ^ code_poly; 1084 } 1085 } 1086 1087 static const _PyUnicode_Name_CAPI hashAPI = 1088 { 1089 sizeof(_PyUnicode_Name_CAPI), 1090 _getucname, 1091 _getcode 1092 }; 1093 1094 /* -------------------------------------------------------------------- */ 1095 /* Python bindings */ 1096 1097 PyDoc_STRVAR(unicodedata_name__doc__, 1098 "name(unichr[, default])\n\ 1099 Returns the name assigned to the Unicode character unichr as a\n\ 1100 string. If no name is defined, default is returned, or, if not\n\ 1101 given, ValueError is raised."); 1102 1103 static PyObject * 1104 unicodedata_name(PyObject* self, PyObject* args) 1105 { 1106 char name[NAME_MAXLEN]; 1107 Py_UCS4 c; 1108 1109 PyUnicodeObject* v; 1110 PyObject* defobj = NULL; 1111 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj)) 1112 return NULL; 1113 1114 c = getuchar(v); 1115 if (c == (Py_UCS4)-1) 1116 return NULL; 1117 1118 if (!_getucname(self, c, name, sizeof(name))) { 1119 if (defobj == NULL) { 1120 PyErr_SetString(PyExc_ValueError, "no such name"); 1121 return NULL; 1122 } 1123 else { 1124 Py_INCREF(defobj); 1125 return defobj; 1126 } 1127 } 1128 1129 return Py_BuildValue("s", name); 1130 } 1131 1132 PyDoc_STRVAR(unicodedata_lookup__doc__, 1133 "lookup(name)\n\ 1134 \n\ 1135 Look up character by name. If a character with the\n\ 1136 given name is found, return the corresponding Unicode\n\ 1137 character. If not found, KeyError is raised."); 1138 1139 static PyObject * 1140 unicodedata_lookup(PyObject* self, PyObject* args) 1141 { 1142 Py_UCS4 code; 1143 Py_UNICODE str[2]; 1144 1145 char* name; 1146 int namelen; 1147 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen)) 1148 return NULL; 1149 1150 if (!_getcode(self, name, namelen, &code)) { 1151 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", 1152 name); 1153 return NULL; 1154 } 1155 1156 #ifndef Py_UNICODE_WIDE 1157 if (code >= 0x10000) { 1158 str[0] = 0xd800 + ((code - 0x10000) >> 10); 1159 str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff); 1160 return PyUnicode_FromUnicode(str, 2); 1161 } 1162 #endif 1163 str[0] = (Py_UNICODE) code; 1164 return PyUnicode_FromUnicode(str, 1); 1165 } 1166 1167 /* XXX Add doc strings. */ 1168 1169 static PyMethodDef unicodedata_functions[] = { 1170 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__}, 1171 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__}, 1172 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__}, 1173 {"category", unicodedata_category, METH_VARARGS, 1174 unicodedata_category__doc__}, 1175 {"bidirectional", unicodedata_bidirectional, METH_VARARGS, 1176 unicodedata_bidirectional__doc__}, 1177 {"combining", unicodedata_combining, METH_VARARGS, 1178 unicodedata_combining__doc__}, 1179 {"mirrored", unicodedata_mirrored, METH_VARARGS, 1180 unicodedata_mirrored__doc__}, 1181 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS, 1182 unicodedata_east_asian_width__doc__}, 1183 {"decomposition", unicodedata_decomposition, METH_VARARGS, 1184 unicodedata_decomposition__doc__}, 1185 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__}, 1186 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__}, 1187 {"normalize", unicodedata_normalize, METH_VARARGS, 1188 unicodedata_normalize__doc__}, 1189 {NULL, NULL} /* sentinel */ 1190 }; 1191 1192 static PyTypeObject UCD_Type = { 1193 /* The ob_type field must be initialized in the module init function 1194 * to be portable to Windows without using C++. */ 1195 PyVarObject_HEAD_INIT(NULL, 0) 1196 "unicodedata.UCD", /*tp_name*/ 1197 sizeof(PreviousDBVersion), /*tp_basicsize*/ 1198 0, /*tp_itemsize*/ 1199 /* methods */ 1200 (destructor)PyObject_Del, /*tp_dealloc*/ 1201 0, /*tp_print*/ 1202 0, /*tp_getattr*/ 1203 0, /*tp_setattr*/ 1204 0, /*tp_compare*/ 1205 0, /*tp_repr*/ 1206 0, /*tp_as_number*/ 1207 0, /*tp_as_sequence*/ 1208 0, /*tp_as_mapping*/ 1209 0, /*tp_hash*/ 1210 0, /*tp_call*/ 1211 0, /*tp_str*/ 1212 PyObject_GenericGetAttr,/*tp_getattro*/ 1213 0, /*tp_setattro*/ 1214 0, /*tp_as_buffer*/ 1215 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 1216 0, /*tp_doc*/ 1217 0, /*tp_traverse*/ 1218 0, /*tp_clear*/ 1219 0, /*tp_richcompare*/ 1220 0, /*tp_weaklistoffset*/ 1221 0, /*tp_iter*/ 1222 0, /*tp_iternext*/ 1223 unicodedata_functions, /*tp_methods*/ 1224 DB_members, /*tp_members*/ 1225 0, /*tp_getset*/ 1226 0, /*tp_base*/ 1227 0, /*tp_dict*/ 1228 0, /*tp_descr_get*/ 1229 0, /*tp_descr_set*/ 1230 0, /*tp_dictoffset*/ 1231 0, /*tp_init*/ 1232 0, /*tp_alloc*/ 1233 0, /*tp_new*/ 1234 0, /*tp_free*/ 1235 0, /*tp_is_gc*/ 1236 }; 1237 1238 PyDoc_STRVAR(unicodedata_docstring, 1239 "This module provides access to the Unicode Character Database which\n\ 1240 defines character properties for all Unicode characters. The data in\n\ 1241 this database is based on the UnicodeData.txt file version\n\ 1242 5.2.0 which is publically available from ftp://ftp.unicode.org/.\n\ 1243 \n\ 1244 The module uses the same names and symbols as defined by the\n\ 1245 UnicodeData File Format 5.2.0 (see\n\ 1246 http://www.unicode.org/reports/tr44/tr44-4.html)."); 1247 1248 PyMODINIT_FUNC 1249 initunicodedata(void) 1250 { 1251 PyObject *m, *v; 1252 1253 Py_TYPE(&UCD_Type) = &PyType_Type; 1254 1255 m = Py_InitModule3( 1256 "unicodedata", unicodedata_functions, unicodedata_docstring); 1257 if (!m) 1258 return; 1259 1260 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION); 1261 Py_INCREF(&UCD_Type); 1262 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type); 1263 1264 /* Previous versions */ 1265 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0); 1266 if (v != NULL) 1267 PyModule_AddObject(m, "ucd_3_2_0", v); 1268 1269 /* Export C API */ 1270 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL); 1271 if (v != NULL) 1272 PyModule_AddObject(m, "ucnhash_CAPI", v); 1273 } 1274 1275 /* 1276 Local variables: 1277 c-basic-offset: 4 1278 indent-tabs-mode: nil 1279 End: 1280 */ 1281