/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 5.2 data base.

   Data was extracted from the Unicode 5.2 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal (at) lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik (at) pythonware.com)
   Modified by Martin v. Löwis (martin (at) v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#include "Python.h"
#include "ucnhash.h"
#include "structmember.h"

/* character properties */

typedef struct {
    const unsigned char category;         /* index into
                                             _PyUnicode_CategoryNames */
    const unsigned char combining;        /* combining class value 0 - 255 */
    const unsigned char bidirectional;    /* index into
                                             _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;         /* true if mirrored in bidir mode */
    const unsigned char east_asian_width; /* index into
                                             _PyUnicode_EastAsianWidth */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;

typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"

static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)
{
    int index;
    if (code >= 0x110000)
        index = 0;
    else {
        index = index1[(code>>SHIFT)];
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
    }

    return &_PyUnicode_Database_Records[index];
}

/* ------------- Previous-version API ------------------------------------- */
typedef struct previous_version {
    PyObject_HEAD
    const char *name;
    const change_record* (*getrecord)(Py_UCS4);
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;

#define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))

static PyMemberDef DB_members[] = {
    {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
    {NULL}
};

/* forward declaration */
static PyTypeObject UCD_Type;

static PyObject*
new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
                     Py_UCS4 (*normalization)(Py_UCS4))
{
    PreviousDBVersion *self;
    self = PyObject_New(PreviousDBVersion, &UCD_Type);
    if (self == NULL)
        return NULL;
    self->name = name;
    self->getrecord = getrecord;
    self->normalization = normalization;
    return (PyObject*)self;
}


static Py_UCS4 getuchar(PyUnicodeObject *obj)
{
    Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);

    if (PyUnicode_GET_SIZE(obj) == 1)
        return *v;
#ifndef Py_UNICODE_WIDE
    else if ((PyUnicode_GET_SIZE(obj) == 2) &&
             (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
             (0xDC00 <= v[1] && v[1] <= 0xDFFF))
        return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
#endif
    PyErr_SetString(PyExc_TypeError,
                    "need a single Unicode character as parameter");
    return (Py_UCS4)-1;
}
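/* Worked example for the narrow-build surrogate branch above:
   u'\U00010000' arrives as the pair (0xD800, 0xDC00), so
   (((0xD800 & 0x3FF) << 10) | (0xDC00 & 0x3FF)) + 0x10000 == 0x10000. */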
/* --- Module API --------------------------------------------------------- */

PyDoc_STRVAR(unicodedata_decimal__doc__,
"decimal(unichr[, default])\n\
\n\
Returns the decimal value assigned to the Unicode character unichr\n\
as integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TODECIMAL(c);
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError,
                            "not a decimal");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}

PyDoc_STRVAR(unicodedata_digit__doc__,
"digit(unichr[, default])\n\
\n\
Returns the digit value assigned to the Unicode character unichr as\n\
integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_digit(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    rc = Py_UNICODE_TODIGIT(c);
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a digit");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}
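/* Illustration of how decimal(), digit() and numeric() differ at the
   Python level (values as recorded in UnicodeData.txt):

       unicodedata.decimal(u'9')       -> 9
       unicodedata.digit(u'\u00b9')    -> 1     (SUPERSCRIPT ONE; no decimal value)
       unicodedata.numeric(u'\u00bd')  -> 0.5   (VULGAR FRACTION ONE HALF; no digit value)
*/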
PyDoc_STRVAR(unicodedata_numeric__doc__,
"numeric(unichr[, default])\n\
\n\
Returns the numeric value assigned to the Unicode character unichr\n\
as float. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    double rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1.0;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TONUMERIC(c);
    if (rc == -1.0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a numeric character");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyFloat_FromDouble(rc);
}

PyDoc_STRVAR(unicodedata_category__doc__,
"category(unichr)\n\
\n\
Returns the general category assigned to the Unicode character\n\
unichr as string.");

static PyObject *
unicodedata_category(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:category",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->category;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed != 0xFF)
            index = old->category_changed;
    }
    return PyString_FromString(_PyUnicode_CategoryNames[index]);
}

PyDoc_STRVAR(unicodedata_bidirectional__doc__,
"bidirectional(unichr)\n\
\n\
Returns the bidirectional class assigned to the Unicode character\n\
unichr as string. If no such value is defined, an empty string is\n\
returned.");

static PyObject *
unicodedata_bidirectional(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:bidirectional",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->bidirectional;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->bidir_changed != 0xFF)
            index = old->bidir_changed;
    }
    return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
}
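/* Python-level examples for category() and bidirectional() (values per
   UnicodeData.txt 5.2.0):

       unicodedata.category(u'A')            -> 'Lu'
       unicodedata.category(u'\u0660')       -> 'Nd'   (ARABIC-INDIC DIGIT ZERO)
       unicodedata.bidirectional(u'A')       -> 'L'
       unicodedata.bidirectional(u'\u0660')  -> 'AN'
*/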
PyDoc_STRVAR(unicodedata_combining__doc__,
"combining(unichr)\n\
\n\
Returns the canonical combining class assigned to the Unicode\n\
character unichr as integer. Returns 0 if no combining class is\n\
defined.");

static PyObject *
unicodedata_combining(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:combining",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->combining;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyInt_FromLong(index);
}

PyDoc_STRVAR(unicodedata_mirrored__doc__,
"mirrored(unichr)\n\
\n\
Returns the mirrored property assigned to the Unicode character\n\
unichr as integer. Returns 1 if the character has been identified as\n\
a \"mirrored\" character in bidirectional text, 0 otherwise.");

static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:mirrored",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->mirrored;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->mirrored_changed != 0xFF)
            index = old->mirrored_changed;
    }
    return PyInt_FromLong(index);
}

PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
"east_asian_width(unichr)\n\
\n\
Returns the east asian width assigned to the Unicode character\n\
unichr as string.");

static PyObject *
unicodedata_east_asian_width(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:east_asian_width",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->east_asian_width;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
}
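/* Python-level examples for the three property accessors above (values
   per UnicodeData.txt and EastAsianWidth.txt 5.2.0):

       unicodedata.combining(u'\u0301')         -> 230   (COMBINING ACUTE ACCENT)
       unicodedata.mirrored(u'(')               -> 1
       unicodedata.east_asian_width(u'\u4e00')  -> 'W'
       unicodedata.east_asian_width(u'\uff21')  -> 'F'   (FULLWIDTH LATIN CAPITAL LETTER A)
*/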
PyDoc_STRVAR(unicodedata_decomposition__doc__,
"decomposition(unichr)\n\
\n\
Returns the character decomposition mapping assigned to the Unicode\n\
character unichr as string. An empty string is returned in case no\n\
such mapping is defined.");

static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    char decomp[256];
    int code, index, count, i;
    unsigned int prefix_index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:decomposition",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    code = (int)c;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            return PyString_FromString(""); /* unassigned */
    }

    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                              (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (index into decomp_prefix) */
    count = decomp_data[index] >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* Based on how index is calculated above and decomp_data is generated
       from Tools/unicode/makeunicodedata.py, it should not be possible
       to overflow decomp_prefix. */
    prefix_index = decomp_data[index] & 255;
    assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));

    /* copy prefix */
    i = strlen(decomp_prefix[prefix_index]);
    memcpy(decomp, decomp_prefix[prefix_index], i);

    while (count-- > 0) {
        if (i)
            decomp[i++] = ' ';
        assert((size_t)i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
                      decomp_data[++index]);
        i += strlen(decomp + i);
    }

    decomp[i] = '\0';

    return PyString_FromString(decomp);
}

static void
get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
{
    if (code >= 0x110000) {
        *index = 0;
    } else if (self && get_old_record(self, code)->category_changed==0) {
        /* unassigned in old version */
        *index = 0;
    }
    else {
        *index = decomp_index1[(code>>DECOMP_SHIFT)];
        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
                               (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (index into decomp_prefix) */
    *count = decomp_data[*index] >> 8;
    *prefix = decomp_data[*index] & 255;

    (*index)++;
}

#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
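/* Worked example of the Hangul arithmetic used below: NCount is 21*28 = 588
   and SCount is 19*588 = 11172.  For U+AC01 (HANGUL SYLLABLE GAG),
   SIndex = 0xAC01 - SBase = 1, so L = LBase + 1/588 = 0x1100,
   V = VBase + (1 % 588)/28 = 0x1161 and T = TBase + 1 % 28 = 0x11A8;
   the syllable decomposes to <U+1100, U+1161, U+11A8>. */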
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *end, *o;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UNICODE stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_SIZE(input);
    space = isize;
    /* Overallocate at most 10 characters. */
    if (space > 10) {
        if (space <= PY_SSIZE_T_MAX - 10)
            space += 10;
    }
    else {
        space *= 2;
    }
    result = PyUnicode_FromUnicode(NULL, space);
    if (!result)
        return NULL;
    i = PyUnicode_AS_UNICODE(input);
    end = i + isize;
    o = PyUnicode_AS_UNICODE(result);

    while (i < end) {
        stack[stackptr++] = *i++;
        while(stackptr) {
            Py_UNICODE code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
                space += 10;
                if (PyUnicode_Resize(&result, newsize) == -1)
                    return NULL;
                o = PyUnicode_AS_UNICODE(result) + newsize - space;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                *o++ = L;
                *o++ = V;
                space -= 2;
                if (T != TBase) {
                    *o++ = T;
                    space --;
                }
                continue;
            }
            /* normalization changes */
            if (self) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                *o++ = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order. */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    /* Drop overallocation. Cannot fail. */
    PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);

    /* Sort canonically. */
    i = PyUnicode_AS_UNICODE(result);
    prev = _getrecord_ex(*i)->combining;
    end = i + PyUnicode_GET_SIZE(result);
    for (i++; i < end; i++) {
        cur = _getrecord_ex(*i)->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            Py_UNICODE tmp = o[1];
            o[1] = o[0];
            o[0] = tmp;
            o--;
            if (o < PyUnicode_AS_UNICODE(result))
                break;
            prev = _getrecord_ex(*o)->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        prev = _getrecord_ex(*i)->combining;
    }
    return result;
}

static int
find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
{
    int index;
    for (index = 0; nfc[index].start; index++) {
        int start = nfc[index].start;
        if (code < start)
            return -1;
        if (code <= start + nfc[index].count) {
            int delta = code - start;
            return nfc[index].index + delta;
        }
    }
    return -1;
}
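/* find_nfc_index() maps a code point into the nfc_first/nfc_last tables
   generated by makeunicodedata.py; nfc_nfkc() below then looks the pair up
   in comp_data.  For example, the decomposed pair <U+0065, U+0301>
   ('e' + COMBINING ACUTE ACCENT) recomposes to U+00E9 under NFC. */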
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *i1, *o, *end;
    int f,l,index,index1,comb;
    Py_UNICODE code;
    Py_UNICODE *skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;

    /* We are going to modify result in-place.
       If nfd_nfkd is changed to sometimes return the input,
       this code needs to be reviewed. */
    assert(result != input);

    i = PyUnicode_AS_UNICODE(result);
    end = i + PyUnicode_GET_SIZE(result);
    o = PyUnicode_AS_UNICODE(result);

  again:
    while (i < end) {
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list. */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        if (LBase <= *i && *i < (LBase+LCount) &&
            i + 1 < end &&
            VBase <= i[1] && i[1] <= (VBase+VCount)) {
            int LIndex, VIndex;
            LIndex = i[0] - LBase;
            VIndex = i[1] - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i+=2;
            if (i < end &&
                TBase <= *i && *i <= (TBase+TCount)) {
                code += *i-TBase;
                i++;
            }
            *o++ = code;
            continue;
        }

        f = find_nfc_index(self, nfc_first, *i);
        if (f == -1) {
            *o++ = *i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i+1;
        comb = 0;
        while (i1 < end) {
            int comb1 = _getrecord_ex(*i1)->combining;
            if (comb) {
                if (comb1 == 0)
                    break;
                if (comb >= comb1) {
                    /* Character is blocked. */
                    i1++;
                    continue;
                }
            }
            l = find_nfc_index(self, nfc_last, *i1);
            /* *i1 cannot be combined with *i. If *i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
              not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            index = f*TOTAL_LAST + l;
            index1 = comp_index[index >> COMP_SHIFT];
            code = comp_data[(index1<<COMP_SHIFT)+
                             (index&((1<<COMP_SHIFT)-1))];
            if (code == 0)
                goto not_combinable;

            /* Replace the original character. */
            *i = code;
            /* Mark the second character unused. */
            assert(cskipped < 20);
            skipped[cskipped++] = i1;
            i1++;
            f = find_nfc_index(self, nfc_first, *i);
            if (f == -1)
                break;
        }
        *o++ = *i++;
    }
    if (o != end)
        PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
    return result;
}

/* Return 1 if the input is certainly normalized, 0 if it might not be. */
static int
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
{
    Py_UNICODE *i, *end;
    unsigned char prev_combining = 0, quickcheck_mask;

    /* An older version of the database is requested, quickchecks must be
       disabled. */
    if (self != NULL)
        return 0;

    /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
       as described in http://unicode.org/reports/tr15/#Annex8. */
    quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));

    i = PyUnicode_AS_UNICODE(input);
    end = i + PyUnicode_GET_SIZE(input);
    while (i < end) {
        const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
        unsigned char combining = record->combining;
        unsigned char quickcheck = record->normalization_quick_check;

        if (quickcheck & quickcheck_mask)
            return 0; /* this string might need normalization */
        if (combining && prev_combining > combining)
            return 0; /* non-canonical sort order, not normalized */
        prev_combining = combining;
    }
    return 1; /* certainly normalized */
}
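/* Python-level examples of the four normalization forms handled below:

       unicodedata.normalize('NFD',  u'\u00e9')   -> u'e\u0301'
       unicodedata.normalize('NFC',  u'e\u0301')  -> u'\u00e9'
       unicodedata.normalize('NFKD', u'\u2460')   -> u'1'   (CIRCLED DIGIT ONE)
*/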
PyDoc_STRVAR(unicodedata_normalize__doc__,
"normalize(form, unistr)\n\
\n\
Return the normal form 'form' for the Unicode string unistr. Valid\n\
values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");

static PyObject*
unicodedata_normalize(PyObject *self, PyObject *args)
{
    char *form;
    PyObject *input;

    if(!PyArg_ParseTuple(args, "sO!:normalize",
                         &form, &PyUnicode_Type, &input))
        return NULL;

    if (PyUnicode_GetSize(input) == 0) {
        /* Special case empty input strings, since resizing
           them later would cause internal errors. */
        Py_INCREF(input);
        return input;
    }

    if (strcmp(form, "NFC") == 0) {
        if (is_normalized(self, input, 1, 0)) {
            Py_INCREF(input);
            return input;
        }
        return nfc_nfkc(self, input, 0);
    }
    if (strcmp(form, "NFKC") == 0) {
        if (is_normalized(self, input, 1, 1)) {
            Py_INCREF(input);
            return input;
        }
        return nfc_nfkc(self, input, 1);
    }
    if (strcmp(form, "NFD") == 0) {
        if (is_normalized(self, input, 0, 0)) {
            Py_INCREF(input);
            return input;
        }
        return nfd_nfkd(self, input, 0);
    }
    if (strcmp(form, "NFKD") == 0) {
        if (is_normalized(self, input, 0, 1)) {
            Py_INCREF(input);
            return input;
        }
        return nfd_nfkd(self, input, 1);
    }
    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    return NULL;
}

/* -------------------------------------------------------------------- */
/* unicode character name tables */

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"

/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */

static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}

static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};

static int
is_unified_ideograph(Py_UCS4 code)
{
    return (
        (0x3400 <= code && code <= 0x4DB5)   || /* CJK Ideograph Extension A */
        (0x4E00 <= code && code <= 0x9FCB)   || /* CJK Ideograph, Unicode 5.2 */
        (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
        (0x2A700 <= code && code <= 0x2B734));  /* CJK Ideograph Extension C */
}
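/* Hangul syllable and CJK unified ideograph names are generated
   algorithmically rather than stored in the phrasebook; e.g. U+AC00 is
   named "HANGUL SYLLABLE GA" and U+4E00 is "CJK UNIFIED IDEOGRAPH-4E00". */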
static int
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
{
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code >= 0x110000)
        return 0;

    if (self) {
        const change_record *old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned */
            return 0;
        }
    }

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if (is_unified_ideograph(code)) {
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
        return 1;
    }

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                                (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            if (i > buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon. the last character in the
           word has bit 7 set. the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}

static int
_cmpname(PyObject *self, int code, const char* name, int namelen)
{
    /* check if code corresponds to the given name */
    int i;
    char buffer[NAME_MAXLEN];
    if (!_getucname(self, code, buffer, sizeof(buffer)))
        return 0;
    for (i = 0; i < namelen; i++) {
        if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}

static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
{
    int i, len1;
    *len = -1;
    for (i = 0; i < count; i++) {
        char *s = hangul_syllables[i][column];
        len1 = strlen(s);
        if (len1 <= *len)
            continue;
        if (strncmp(str, s, len1) == 0) {
            *len = len1;
            *pos = i;
        }
    }
    if (*len == -1) {
        *len = 0;
    }
}

static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
{
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int len, L = -1, V = -1, T = -1;
        const char *pos = name + 16;
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hexdigits must follow. */
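        /* e.g. for "CJK UNIFIED IDEOGRAPH-4E00" the remaining "4E00"
           parses to v == 0x4E00, which is_unified_ideograph() accepts. */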
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes. see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;
    if (_cmpname(self, v, name, namelen)) {
        *code = v;
        return 1;
    }
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(self, v, name, namelen)) {
            *code = v;
            return 1;
        }
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}

static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};

/* -------------------------------------------------------------------- */
/* Python bindings */

PyDoc_STRVAR(unicodedata_name__doc__,
"name(unichr[, default])\n\
Returns the name assigned to the Unicode character unichr as a\n\
string. If no name is defined, default is returned, or, if not\n\
given, ValueError is raised.");

static PyObject *
unicodedata_name(PyObject* self, PyObject* args)
{
    char name[NAME_MAXLEN];
    Py_UCS4 c;

    PyUnicodeObject* v;
    PyObject* defobj = NULL;
    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
        return NULL;

    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (!_getucname(self, c, name, sizeof(name))) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "no such name");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }

    return Py_BuildValue("s", name);
}

PyDoc_STRVAR(unicodedata_lookup__doc__,
"lookup(name)\n\
\n\
Look up character by name. If a character with the\n\
given name is found, return the corresponding Unicode\n\
character. If not found, KeyError is raised.");

static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
    Py_UCS4 code;
    Py_UNICODE str[2];

    char* name;
    int namelen;
    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

    if (!_getcode(self, name, namelen, &code)) {
        PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
                     name);
        return NULL;
    }

#ifndef Py_UNICODE_WIDE
    if (code >= 0x10000) {
        str[0] = 0xd800 + ((code - 0x10000) >> 10);
        str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
        return PyUnicode_FromUnicode(str, 2);
    }
#endif
    str[0] = (Py_UNICODE) code;
    return PyUnicode_FromUnicode(str, 1);
}
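/* name() and lookup() are inverses at the Python level, e.g.:

       unicodedata.name(u'/')                      -> 'SOLIDUS'
       unicodedata.lookup('LATIN SMALL LETTER A')  -> u'a'
       unicodedata.lookup('HANGUL SYLLABLE GA')    -> u'\uac00'

   On narrow builds, lookup() returns a surrogate pair for characters
   outside the BMP (see the Py_UNICODE_WIDE branch above). */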
/* XXX Add doc strings. */

static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
                 unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
                      unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
                  unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
                 unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
                         unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
                      unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
                  unicodedata_normalize__doc__},
    {NULL, NULL}                /* sentinel */
};

static PyTypeObject UCD_Type = {
    /* The ob_type field must be initialized in the module init function
     * to be portable to Windows without using C++. */
    PyVarObject_HEAD_INIT(NULL, 0)
    "unicodedata.UCD",          /*tp_name*/
    sizeof(PreviousDBVersion),  /*tp_basicsize*/
    0,                          /*tp_itemsize*/
    /* methods */
    (destructor)PyObject_Del,   /*tp_dealloc*/
    0,                          /*tp_print*/
    0,                          /*tp_getattr*/
    0,                          /*tp_setattr*/
    0,                          /*tp_compare*/
    0,                          /*tp_repr*/
    0,                          /*tp_as_number*/
    0,                          /*tp_as_sequence*/
    0,                          /*tp_as_mapping*/
    0,                          /*tp_hash*/
    0,                          /*tp_call*/
    0,                          /*tp_str*/
    PyObject_GenericGetAttr,    /*tp_getattro*/
    0,                          /*tp_setattro*/
    0,                          /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,         /*tp_flags*/
    0,                          /*tp_doc*/
    0,                          /*tp_traverse*/
    0,                          /*tp_clear*/
    0,                          /*tp_richcompare*/
    0,                          /*tp_weaklistoffset*/
    0,                          /*tp_iter*/
    0,                          /*tp_iternext*/
    unicodedata_functions,      /*tp_methods*/
    DB_members,                 /*tp_members*/
    0,                          /*tp_getset*/
    0,                          /*tp_base*/
    0,                          /*tp_dict*/
    0,                          /*tp_descr_get*/
    0,                          /*tp_descr_set*/
    0,                          /*tp_dictoffset*/
    0,                          /*tp_init*/
    0,                          /*tp_alloc*/
    0,                          /*tp_new*/
    0,                          /*tp_free*/
    0,                          /*tp_is_gc*/
};
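/* Instances of UCD_Type expose the same methods as the module, evaluated
   against an older database snapshot; initunicodedata() below publishes one
   such instance as unicodedata.ucd_3_2_0, whose unidata_version attribute
   is the string "3.2.0". */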
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
5.2.0 which is publicly available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format 5.2.0 (see\n\
http://www.unicode.org/reports/tr44/tr44-4.html).");

PyMODINIT_FUNC
initunicodedata(void)
{
    PyObject *m, *v;

    Py_TYPE(&UCD_Type) = &PyType_Type;

    m = Py_InitModule3(
        "unicodedata", unicodedata_functions, unicodedata_docstring);
    if (!m)
        return;

    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
    Py_INCREF(&UCD_Type);
    PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);

    /* Previous versions */
    v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
    if (v != NULL)
        PyModule_AddObject(m, "ucd_3_2_0", v);

    /* Export C API */
    v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
    if (v != NULL)
        PyModule_AddObject(m, "ucnhash_CAPI", v);
}

/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/