Home | History | Annotate | Download | only in Modules
      1 /* ------------------------------------------------------------------------
      2 
      3    unicodedata -- Provides access to the Unicode database.
      4 
      5    Data was extracted from the UnicodeData.txt file.
      6    The current version number is reported in the unidata_version constant.
      7 
      8    Written by Marc-Andre Lemburg (mal (at) lemburg.com).
      9    Modified for Python 2.0 by Fredrik Lundh (fredrik (at) pythonware.com)
   Modified by Martin v. Löwis (martin (at) v.loewis.de)
     11 
     12    Copyright (c) Corporation for National Research Initiatives.
     13 
     14    ------------------------------------------------------------------------ */
     15 
     16 #define PY_SSIZE_T_CLEAN
     17 
     18 #include "Python.h"
     19 #include "ucnhash.h"
     20 #include "structmember.h"
     21 
     22 /*[clinic input]
     23 module unicodedata
     24 class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
     25 [clinic start generated code]*/
     26 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
     27 
     28 /* character properties */
     29 
/* One row of per-character properties.  _getrecord_ex() hands out pointers
   into the _PyUnicode_Database_Records table, whose elements have this type.
   NOTE(review): the field order presumably has to match the initialisers
   emitted into unicodedata_db.h -- confirm before reordering. */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;       /* index into
                                                   _PyUnicode_EastAsianWidth */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
     41 
/* Per-character difference between the current database and an older
   database version, as returned by PreviousDBVersion.getrecord.
   The property accessors in this file treat 0xFF in a *_changed byte as
   "same as the current database" and category_changed == 0 as "unassigned
   in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    /* NOTE(review): not read by any code visible in this part of the file;
       presumably the old numeric value -- confirm against the generator. */
    const double numeric_changed;
} change_record;
     51 
     52 /* data file generated by Tools/unicode/makeunicodedata.py */
     53 #include "unicodedata_db.h"
     54 
     55 static const _PyUnicode_DatabaseRecord*
     56 _getrecord_ex(Py_UCS4 code)
     57 {
     58     int index;
     59     if (code >= 0x110000)
     60         index = 0;
     61     else {
     62         index = index1[(code>>SHIFT)];
     63         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
     64     }
     65 
     66     return &_PyUnicode_Database_Records[index];
     67 }
     68 
     69 /* ------------- Previous-version API ------------------------------------- */
/* Python object describing an older version of the Unicode database.
   Instances are created by new_previous_version(). */
typedef struct previous_version {
    PyObject_HEAD
    /* exposed read-only to Python as "unidata_version" (see DB_members) */
    const char *name;
    /* returns the change record for a code point */
    const change_record* (*getrecord)(Py_UCS4);
    /* returns a replacement code point to decompose instead, or 0 if there
       is none (this is how nfd_nfkd() interprets the result) */
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;
     76 
     77 #include "clinic/unicodedata.c.h"
     78 
/* Fetch the change record of code point `v` from the PreviousDBVersion
   object `self`.  `self` is cast without checking; callers test it with
   UCD_Check() first. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))

/* Attributes exposed on UCD objects: the version string, read-only. */
static PyMemberDef DB_members[] = {
        {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
        {NULL}
};

/* forward declaration */
static PyTypeObject UCD_Type;
/* True only when the type of `o` is exactly UCD_Type (pointer comparison,
   so subclasses do not match). */
#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
     89 
     90 static PyObject*
     91 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
     92                      Py_UCS4 (*normalization)(Py_UCS4))
     93 {
     94         PreviousDBVersion *self;
     95         self = PyObject_New(PreviousDBVersion, &UCD_Type);
     96         if (self == NULL)
     97                 return NULL;
     98         self->name = name;
     99         self->getrecord = getrecord;
    100         self->normalization = normalization;
    101         return (PyObject*)self;
    102 }
    103 
    104 
    105 /* --- Module API --------------------------------------------------------- */
    106 
    107 /*[clinic input]
    108 unicodedata.UCD.decimal
    109 
    110     self: self
    111     chr: int(accept={str})
    112     default: object=NULL
    113     /
    114 
    115 Converts a Unicode character into its equivalent decimal value.
    116 
    117 Returns the decimal value assigned to the character chr as integer.
    118 If no such value is defined, default is returned, or, if not given,
    119 ValueError is raised.
    120 [clinic start generated code]*/
    121 
    122 static PyObject *
    123 unicodedata_UCD_decimal_impl(PyObject *self, int chr,
    124                              PyObject *default_value)
    125 /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
    126 {
    127     int have_old = 0;
    128     long rc;
    129     Py_UCS4 c = (Py_UCS4)chr;
    130 
    131     if (self && UCD_Check(self)) {
    132         const change_record *old = get_old_record(self, c);
    133         if (old->category_changed == 0) {
    134             /* unassigned */
    135             have_old = 1;
    136             rc = -1;
    137         }
    138         else if (old->decimal_changed != 0xFF) {
    139             have_old = 1;
    140             rc = old->decimal_changed;
    141         }
    142     }
    143 
    144     if (!have_old)
    145         rc = Py_UNICODE_TODECIMAL(c);
    146     if (rc < 0) {
    147         if (default_value == NULL) {
    148             PyErr_SetString(PyExc_ValueError,
    149                             "not a decimal");
    150             return NULL;
    151         }
    152         else {
    153             Py_INCREF(default_value);
    154             return default_value;
    155         }
    156     }
    157     return PyLong_FromLong(rc);
    158 }
    159 
    160 /*[clinic input]
    161 unicodedata.UCD.digit
    162 
    163     self: self
    164     chr: int(accept={str})
    165     default: object=NULL
    166     /
    167 
    168 Converts a Unicode character into its equivalent digit value.
    169 
    170 Returns the digit value assigned to the character chr as integer.
    171 If no such value is defined, default is returned, or, if not given,
    172 ValueError is raised.
    173 [clinic start generated code]*/
    174 
    175 static PyObject *
    176 unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
    177 /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
    178 {
    179     long rc;
    180     Py_UCS4 c = (Py_UCS4)chr;
    181     rc = Py_UNICODE_TODIGIT(c);
    182     if (rc < 0) {
    183         if (default_value == NULL) {
    184             PyErr_SetString(PyExc_ValueError, "not a digit");
    185             return NULL;
    186         }
    187         else {
    188             Py_INCREF(default_value);
    189             return default_value;
    190         }
    191     }
    192     return PyLong_FromLong(rc);
    193 }
    194 
    195 /*[clinic input]
    196 unicodedata.UCD.numeric
    197 
    198     self: self
    199     chr: int(accept={str})
    200     default: object=NULL
    201     /
    202 
    203 Converts a Unicode character into its equivalent numeric value.
    204 
    205 Returns the numeric value assigned to the character chr as float.
    206 If no such value is defined, default is returned, or, if not given,
    207 ValueError is raised.
    208 [clinic start generated code]*/
    209 
    210 static PyObject *
    211 unicodedata_UCD_numeric_impl(PyObject *self, int chr,
    212                              PyObject *default_value)
    213 /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
    214 {
    215     int have_old = 0;
    216     double rc;
    217     Py_UCS4 c = (Py_UCS4)chr;
    218 
    219     if (self && UCD_Check(self)) {
    220         const change_record *old = get_old_record(self, c);
    221         if (old->category_changed == 0) {
    222             /* unassigned */
    223             have_old = 1;
    224             rc = -1.0;
    225         }
    226         else if (old->decimal_changed != 0xFF) {
    227             have_old = 1;
    228             rc = old->decimal_changed;
    229         }
    230     }
    231 
    232     if (!have_old)
    233         rc = Py_UNICODE_TONUMERIC(c);
    234     if (rc == -1.0) {
    235         if (default_value == NULL) {
    236             PyErr_SetString(PyExc_ValueError, "not a numeric character");
    237             return NULL;
    238         }
    239         else {
    240             Py_INCREF(default_value);
    241             return default_value;
    242         }
    243     }
    244     return PyFloat_FromDouble(rc);
    245 }
    246 
    247 /*[clinic input]
    248 unicodedata.UCD.category
    249 
    250     self: self
    251     chr: int(accept={str})
    252     /
    253 
    254 Returns the general category assigned to the character chr as string.
    255 [clinic start generated code]*/
    256 
    257 static PyObject *
    258 unicodedata_UCD_category_impl(PyObject *self, int chr)
    259 /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
    260 {
    261     int index;
    262     Py_UCS4 c = (Py_UCS4)chr;
    263     index = (int) _getrecord_ex(c)->category;
    264     if (self && UCD_Check(self)) {
    265         const change_record *old = get_old_record(self, c);
    266         if (old->category_changed != 0xFF)
    267             index = old->category_changed;
    268     }
    269     return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
    270 }
    271 
    272 /*[clinic input]
    273 unicodedata.UCD.bidirectional
    274 
    275     self: self
    276     chr: int(accept={str})
    277     /
    278 
    279 Returns the bidirectional class assigned to the character chr as string.
    280 
    281 If no such value is defined, an empty string is returned.
    282 [clinic start generated code]*/
    283 
    284 static PyObject *
    285 unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
    286 /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
    287 {
    288     int index;
    289     Py_UCS4 c = (Py_UCS4)chr;
    290     index = (int) _getrecord_ex(c)->bidirectional;
    291     if (self && UCD_Check(self)) {
    292         const change_record *old = get_old_record(self, c);
    293         if (old->category_changed == 0)
    294             index = 0; /* unassigned */
    295         else if (old->bidir_changed != 0xFF)
    296             index = old->bidir_changed;
    297     }
    298     return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
    299 }
    300 
    301 /*[clinic input]
    302 unicodedata.UCD.combining -> int
    303 
    304     self: self
    305     chr: int(accept={str})
    306     /
    307 
    308 Returns the canonical combining class assigned to the character chr as integer.
    309 
    310 Returns 0 if no combining class is defined.
    311 [clinic start generated code]*/
    312 
    313 static int
    314 unicodedata_UCD_combining_impl(PyObject *self, int chr)
    315 /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
    316 {
    317     int index;
    318     Py_UCS4 c = (Py_UCS4)chr;
    319     index = (int) _getrecord_ex(c)->combining;
    320     if (self && UCD_Check(self)) {
    321         const change_record *old = get_old_record(self, c);
    322         if (old->category_changed == 0)
    323             index = 0; /* unassigned */
    324     }
    325     return index;
    326 }
    327 
    328 /*[clinic input]
    329 unicodedata.UCD.mirrored -> int
    330 
    331     self: self
    332     chr: int(accept={str})
    333     /
    334 
    335 Returns the mirrored property assigned to the character chr as integer.
    336 
    337 Returns 1 if the character has been identified as a "mirrored"
    338 character in bidirectional text, 0 otherwise.
    339 [clinic start generated code]*/
    340 
    341 static int
    342 unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
    343 /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
    344 {
    345     int index;
    346     Py_UCS4 c = (Py_UCS4)chr;
    347     index = (int) _getrecord_ex(c)->mirrored;
    348     if (self && UCD_Check(self)) {
    349         const change_record *old = get_old_record(self, c);
    350         if (old->category_changed == 0)
    351             index = 0; /* unassigned */
    352         else if (old->mirrored_changed != 0xFF)
    353             index = old->mirrored_changed;
    354     }
    355     return index;
    356 }
    357 
    358 /*[clinic input]
    359 unicodedata.UCD.east_asian_width
    360 
    361     self: self
    362     chr: int(accept={str})
    363     /
    364 
    365 Returns the east asian width assigned to the character chr as string.
    366 [clinic start generated code]*/
    367 
    368 static PyObject *
    369 unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
    370 /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
    371 {
    372     int index;
    373     Py_UCS4 c = (Py_UCS4)chr;
    374     index = (int) _getrecord_ex(c)->east_asian_width;
    375     if (self && UCD_Check(self)) {
    376         const change_record *old = get_old_record(self, c);
    377         if (old->category_changed == 0)
    378             index = 0; /* unassigned */
    379         else if (old->east_asian_width_changed != 0xFF)
    380             index = old->east_asian_width_changed;
    381     }
    382     return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
    383 }
    384 
    385 /*[clinic input]
    386 unicodedata.UCD.decomposition
    387 
    388     self: self
    389     chr: int(accept={str})
    390     /
    391 
    392 Returns the character decomposition mapping assigned to the character chr as string.
    393 
    394 An empty string is returned in case no such mapping is defined.
    395 [clinic start generated code]*/
    396 
    397 static PyObject *
    398 unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
    399 /*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
    400 {
    401     char decomp[256];
    402     int code, index, count;
    403     size_t i;
    404     unsigned int prefix_index;
    405     Py_UCS4 c = (Py_UCS4)chr;
    406 
    407     code = (int)c;
    408 
    409     if (self && UCD_Check(self)) {
    410         const change_record *old = get_old_record(self, c);
    411         if (old->category_changed == 0)
    412             return PyUnicode_FromString(""); /* unassigned */
    413     }
    414 
    415     if (code < 0 || code >= 0x110000)
    416         index = 0;
    417     else {
    418         index = decomp_index1[(code>>DECOMP_SHIFT)];
    419         index = decomp_index2[(index<<DECOMP_SHIFT)+
    420                              (code&((1<<DECOMP_SHIFT)-1))];
    421     }
    422 
    423     /* high byte is number of hex bytes (usually one or two), low byte
    424        is prefix code (from*/
    425     count = decomp_data[index] >> 8;
    426 
    427     /* XXX: could allocate the PyString up front instead
    428        (strlen(prefix) + 5 * count + 1 bytes) */
    429 
    430     /* Based on how index is calculated above and decomp_data is generated
    431        from Tools/unicode/makeunicodedata.py, it should not be possible
    432        to overflow decomp_prefix. */
    433     prefix_index = decomp_data[index] & 255;
    434     assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
    435 
    436     /* copy prefix */
    437     i = strlen(decomp_prefix[prefix_index]);
    438     memcpy(decomp, decomp_prefix[prefix_index], i);
    439 
    440     while (count-- > 0) {
    441         if (i)
    442             decomp[i++] = ' ';
    443         assert(i < sizeof(decomp));
    444         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
    445                       decomp_data[++index]);
    446         i += strlen(decomp + i);
    447     }
    448     return PyUnicode_FromStringAndSize(decomp, i);
    449 }
    450 
    451 static void
    452 get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
    453 {
    454     if (code >= 0x110000) {
    455         *index = 0;
    456     } else if (self && UCD_Check(self) &&
    457                get_old_record(self, code)->category_changed==0) {
    458         /* unassigned in old version */
    459         *index = 0;
    460     }
    461     else {
    462         *index = decomp_index1[(code>>DECOMP_SHIFT)];
    463         *index = decomp_index2[(*index<<DECOMP_SHIFT)+
    464                                (code&((1<<DECOMP_SHIFT)-1))];
    465     }
    466 
    467     /* high byte is number of hex bytes (usually one or two), low byte
    468        is prefix code (from*/
    469     *count = decomp_data[*index] >> 8;
    470     *prefix = decomp_data[*index] & 255;
    471 
    472     (*index)++;
    473 }
    474 
/* Constants for arithmetic Hangul syllable (de)composition, used by
   nfd_nfkd() and nfc_nfkc(). */
#define SBase   0xAC00  /* first precomposed syllable */
#define LBase   0x1100  /* first leading consonant */
#define VBase   0x1161  /* first vowel */
#define TBase   0x11A7  /* one below the first trailing consonant; a T index
                           of 0 means "no trailing consonant" */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)   /* 588 syllables per leading consonant */
#define SCount  (LCount*NCount)   /* 11172 precomposed syllables in total */
    484 
/* Decompose `input` and put combining characters into canonical order.

   self  - if it is a UCD object for an old database version, that
           version's normalization corrections and unassigned characters
           are honoured
   input - string to decompose; must not be empty, because the sorting step
           below reads index 0 unconditionally (unicodedata_UCD_normalize_impl
           returns early for empty input)
   k     - non-zero: also expand decompositions that carry a prefix
           (compatibility mappings); zero: copy such characters unchanged

   Returns a new string object, or NULL with an exception set. */
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UCS4 *output;
    Py_ssize_t i, o, osize;
    int kind;
    void *data;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UCS4 stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_LENGTH(input);
    /* `space` counts the free slots left in `output`; `osize` is the
       allocated size of `output` in characters. */
    space = isize;
    /* Overallocate at most 10 characters. */
    if (space > 10) {
        if (space <= PY_SSIZE_T_MAX - 10)
            space += 10;
    }
    else {
        space *= 2;
    }
    osize = space;
    output = PyMem_NEW(Py_UCS4, space);
    if (!output) {
        PyErr_NoMemory();
        return NULL;
    }
    i = o = 0;
    kind = PyUnicode_KIND(input);
    data = PyUnicode_DATA(input);

    while (i < isize) {
        /* Seed the work stack with the next input character, then expand
           until the stack is empty again. */
        stack[stackptr++] = PyUnicode_READ(kind, data, i++);
        while(stackptr) {
            Py_UCS4 code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                Py_UCS4 *new_output;
                osize += 10;
                space += 10;
                new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
                if (new_output == NULL) {
                    /* the old buffer is still valid after a failed
                       realloc and must be released here */
                    PyMem_Free(output);
                    PyErr_NoMemory();
                    return NULL;
                }
                output = new_output;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                output[o++] = L;
                output[o++] = V;
                space -= 2;
                /* T == TBase means the syllable has no trailing consonant */
                if (T != TBase) {
                    output[o++] = T;
                    space --;
                }
                continue;
            }
            /* normalization changes */
            if (self && UCD_Check(self)) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    /* process the replacement instead of `code` */
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                output[o++] = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order.  */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    if (!result)
        return NULL;
    /* result is guaranteed to be ready, as it is compact. */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);

    /* Sort canonically. */
    /* The result string is reordered in place: a character whose combining
       class is lower than its predecessor's is swapped backwards until it
       meets a starter (class 0) or a class that is not greater. */
    i = 0;
    prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
        cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
            PyUnicode_WRITE(kind, data, o+1,
                            PyUnicode_READ(kind, data, o));
            PyUnicode_WRITE(kind, data, o, tmp);
            o--;
            if (o < 0)
                break;
            prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        /* position i now holds a different character; reload its class */
        prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    }
    return result;
}
    617 
    618 static int
    619 find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
    620 {
    621     unsigned int index;
    622     for (index = 0; nfc[index].start; index++) {
    623         unsigned int start = nfc[index].start;
    624         if (code < start)
    625             return -1;
    626         if (code <= start + nfc[index].count) {
    627             unsigned int delta = code - start;
    628             return nfc[index].index + delta;
    629         }
    630     }
    631     return -1;
    632 }
    633 
/* Compose `input`: decompose it with nfd_nfkd(self, input, k), then combine
   characters again using the Hangul arithmetic and the
   nfc_first/nfc_last/comp_index/comp_data tables.

   Returns a new string object (the decomposed string itself when
   composition changed nothing), or NULL with an exception set. */
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    int kind;
    void *data;
    Py_UCS4 *output;
    Py_ssize_t i, i1, o, len;
    int f,l,index,index1,comb;
    Py_UCS4 code;
    /* Positions in the decomposed string that have already been merged
       into an earlier character and must be skipped when reached.
       NOTE(review): the capacity of 20 is enforced only by the assert
       below; builds without assertions have no runtime bound check. */
    Py_ssize_t skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;
    /* result will be "ready". */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);
    len = PyUnicode_GET_LENGTH(result);

    /* We allocate a buffer for the output.
       If we find that we made no changes, we still return
       the NFD result. */
    output = PyMem_NEW(Py_UCS4, len);
    if (!output) {
        PyErr_NoMemory();
        Py_DECREF(result);
        return 0;   /* null pointer: error return */
    }
    i = o = 0;

  again:
    while (i < len) {
      for (index = 0; index < cskipped; index++) {
          if (skipped[index] == i) {
              /* *i character is skipped.
                 Remove from list. */
              skipped[index] = skipped[cskipped-1];
              cskipped--;
              i++;
              goto again; /* continue while */
          }
      }
      /* Hangul Composition. We don't need to check for <LV,T>
         pairs, since we always have decomposed data. */
      code = PyUnicode_READ(kind, data, i);
      if (LBase <= code && code < (LBase+LCount) &&
          i + 1 < len &&
          VBase <= PyUnicode_READ(kind, data, i+1) &&
          PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
          /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
             and V character is a modern vowel (0x1161 ~ 0x1175). */
          int LIndex, VIndex;
          LIndex = code - LBase;
          VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
          code = SBase + (LIndex*VCount+VIndex)*TCount;
          i+=2;
          /* strictly greater than TBase: TBase itself stands for
             "no trailing consonant" */
          if (i < len &&
              TBase < PyUnicode_READ(kind, data, i) &&
              PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
              /* check T character is a modern trailing consonant
                 (0x11A8 ~ 0x11C2). */
              code += PyUnicode_READ(kind, data, i)-TBase;
              i++;
          }
          output[o++] = code;
          continue;
      }

      /* code is still input[i] here */
      f = find_nfc_index(self, nfc_first, code);
      if (f == -1) {
          /* cannot start a composition: copy unchanged */
          output[o++] = code;
          i++;
          continue;
      }
      /* Find next unblocked character. */
      i1 = i+1;
      /* comb: combining class of the last character that was passed over
         without being combined (0 if none yet) */
      comb = 0;
      /* output base character for now; might be updated later. */
      output[o] = PyUnicode_READ(kind, data, i);
      while (i1 < len) {
          Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
          int comb1 = _getrecord_ex(code1)->combining;
          if (comb) {
              if (comb1 == 0)
                  break;
              if (comb >= comb1) {
                  /* Character is blocked. */
                  i1++;
                  continue;
              }
          }
          l = find_nfc_index(self, nfc_last, code1);
          /* i1 cannot be combined with i. If i1
             is a starter, we don't need to look further.
             Otherwise, record the combining class. */
          if (l == -1) {
            /* also entered by goto when the table holds no composite
               for the (f, l) pair */
            not_combinable:
              if (comb1 == 0)
                  break;
              comb = comb1;
              i1++;
              continue;
          }
          /* two-level lookup of the composite for the pair (f, l) */
          index = f*TOTAL_LAST + l;
          index1 = comp_index[index >> COMP_SHIFT];
          code = comp_data[(index1<<COMP_SHIFT)+
                           (index&((1<<COMP_SHIFT)-1))];
          if (code == 0)
              goto not_combinable;

          /* Replace the original character. */
          output[o] = code;
          /* Mark the second character unused. */
          assert(cskipped < 20);
          skipped[cskipped++] = i1;
          i1++;
          /* the composite may itself combine with a following character */
          f = find_nfc_index(self, nfc_first, output[o]);
          if (f == -1)
              break;
      }
      /* Output character was already written.
         Just advance the indices. */
      o++; i++;
    }
    if (o == len) {
        /* No changes. Return original string. */
        PyMem_Free(output);
        return result;
    }
    Py_DECREF(result);
    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    return result;
}
    772 
    773 /* Return 1 if the input is certainly normalized, 0 if it might not be. */
    774 static int
    775 is_normalized(PyObject *self, PyObject *input, int nfc, int k)
    776 {
    777     Py_ssize_t i, len;
    778     int kind;
    779     void *data;
    780     unsigned char prev_combining = 0, quickcheck_mask;
    781 
    782     /* An older version of the database is requested, quickchecks must be
    783        disabled. */
    784     if (self && UCD_Check(self))
    785         return 0;
    786 
    787     /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
    788        as described in http://unicode.org/reports/tr15/#Annex8. */
    789     quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
    790 
    791     i = 0;
    792     kind = PyUnicode_KIND(input);
    793     data = PyUnicode_DATA(input);
    794     len = PyUnicode_GET_LENGTH(input);
    795     while (i < len) {
    796         Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
    797         const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
    798         unsigned char combining = record->combining;
    799         unsigned char quickcheck = record->normalization_quick_check;
    800 
    801         if (quickcheck & quickcheck_mask)
    802             return 0; /* this string might need normalization */
    803         if (combining && prev_combining > combining)
    804             return 0; /* non-canonical sort order, not normalized */
    805         prev_combining = combining;
    806     }
    807     return 1; /* certainly normalized */
    808 }
    809 
    810 /*[clinic input]
    811 unicodedata.UCD.normalize
    812 
    813     self: self
    814     form: str
    815     unistr as input: unicode
    816     /
    817 
    818 Return the normal form 'form' for the Unicode string unistr.
    819 
    820 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
    821 [clinic start generated code]*/
    822 
    823 static PyObject *
    824 unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
    825                                PyObject *input)
    826 /*[clinic end generated code: output=62d1f8870027efdc input=1744c55f4ab79bf0]*/
    827 {
    828     if (PyUnicode_GET_LENGTH(input) == 0) {
    829         /* Special case empty input strings, since resizing
    830            them  later would cause internal errors. */
    831         Py_INCREF(input);
    832         return input;
    833     }
    834 
    835     if (strcmp(form, "NFC") == 0) {
    836         if (is_normalized(self, input, 1, 0)) {
    837             Py_INCREF(input);
    838             return input;
    839         }
    840         return nfc_nfkc(self, input, 0);
    841     }
    842     if (strcmp(form, "NFKC") == 0) {
    843         if (is_normalized(self, input, 1, 1)) {
    844             Py_INCREF(input);
    845             return input;
    846         }
    847         return nfc_nfkc(self, input, 1);
    848     }
    849     if (strcmp(form, "NFD") == 0) {
    850         if (is_normalized(self, input, 0, 0)) {
    851             Py_INCREF(input);
    852             return input;
    853         }
    854         return nfd_nfkd(self, input, 0);
    855     }
    856     if (strcmp(form, "NFKD") == 0) {
    857         if (is_normalized(self, input, 0, 1)) {
    858             Py_INCREF(input);
    859             return input;
    860         }
    861         return nfd_nfkd(self, input, 1);
    862     }
    863     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    864     return NULL;
    865 }
    866 
    867 /* -------------------------------------------------------------------- */
    868 /* unicode character name tables */
    869 
    870 /* data file generated by Tools/unicode/makeunicodedata.py */
    871 #include "unicodename_db.h"
    872 
    873 /* -------------------------------------------------------------------- */
    874 /* database code (cut and pasted from the unidb package) */
    875 
    876 static unsigned long
    877 _gethash(const char *s, int len, int scale)
    878 {
    879     int i;
    880     unsigned long h = 0;
    881     unsigned long ix;
    882     for (i = 0; i < len; i++) {
    883         h = (h * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[i]));
    884         ix = h & 0xff000000;
    885         if (ix)
    886             h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    887     }
    888     return h;
    889 }
    890 
    891 static const char * const hangul_syllables[][3] = {
    892     { "G",  "A",   ""   },
    893     { "GG", "AE",  "G"  },
    894     { "N",  "YA",  "GG" },
    895     { "D",  "YAE", "GS" },
    896     { "DD", "EO",  "N", },
    897     { "R",  "E",   "NJ" },
    898     { "M",  "YEO", "NH" },
    899     { "B",  "YE",  "D"  },
    900     { "BB", "O",   "L"  },
    901     { "S",  "WA",  "LG" },
    902     { "SS", "WAE", "LM" },
    903     { "",   "OE",  "LB" },
    904     { "J",  "YO",  "LS" },
    905     { "JJ", "U",   "LT" },
    906     { "C",  "WEO", "LP" },
    907     { "K",  "WE",  "LH" },
    908     { "T",  "WI",  "M"  },
    909     { "P",  "YU",  "B"  },
    910     { "H",  "EU",  "BS" },
    911     { 0,    "YI",  "S"  },
    912     { 0,    "I",   "SS" },
    913     { 0,    0,     "NG" },
    914     { 0,    0,     "J"  },
    915     { 0,    0,     "C"  },
    916     { 0,    0,     "K"  },
    917     { 0,    0,     "T"  },
    918     { 0,    0,     "P"  },
    919     { 0,    0,     "H"  }
    920 };
    921 
    922 /* These ranges need to match makeunicodedata.py:cjk_ranges. */
    923 static int
    924 is_unified_ideograph(Py_UCS4 code)
    925 {
    926     return
    927         (0x3400 <= code && code <= 0x4DB5)   || /* CJK Ideograph Extension A */
    928         (0x4E00 <= code && code <= 0x9FEF)   || /* CJK Ideograph */
    929         (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
    930         (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
    931         (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
    932         (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
    933         (0x2CEB0 <= code && code <= 0x2EBEF);   /* CJK Ideograph Extension F */
    934 }
    935 
    936 /* macros used to determine if the given code point is in the PUA range that
    937  * we are using to store aliases and named sequences */
    938 #define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
    939 #define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
    940                           (cp < named_sequences_end))
    941 
    942 static int
    943 _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
    944            int with_alias_and_seq)
    945 {
    946     /* Find the name associated with the given code point.
    947      * If with_alias_and_seq is 1, check for names in the Private Use Area 15
    948      * that we are using for aliases and named sequences. */
    949     int offset;
    950     int i;
    951     int word;
    952     unsigned char* w;
    953 
    954     if (code >= 0x110000)
    955         return 0;
    956 
    957     /* XXX should we just skip all the code points in the PUAs here? */
    958     if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
    959         return 0;
    960 
    961     if (self && UCD_Check(self)) {
    962         /* in 3.2.0 there are no aliases and named sequences */
    963         const change_record *old;
    964         if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
    965             return 0;
    966         old = get_old_record(self, code);
    967         if (old->category_changed == 0) {
    968             /* unassigned */
    969             return 0;
    970         }
    971     }
    972 
    973     if (SBase <= code && code < SBase+SCount) {
    974         /* Hangul syllable. */
    975         int SIndex = code - SBase;
    976         int L = SIndex / NCount;
    977         int V = (SIndex % NCount) / TCount;
    978         int T = SIndex % TCount;
    979 
    980         if (buflen < 27)
    981             /* Worst case: HANGUL SYLLABLE <10chars>. */
    982             return 0;
    983         strcpy(buffer, "HANGUL SYLLABLE ");
    984         buffer += 16;
    985         strcpy(buffer, hangul_syllables[L][0]);
    986         buffer += strlen(hangul_syllables[L][0]);
    987         strcpy(buffer, hangul_syllables[V][1]);
    988         buffer += strlen(hangul_syllables[V][1]);
    989         strcpy(buffer, hangul_syllables[T][2]);
    990         buffer += strlen(hangul_syllables[T][2]);
    991         *buffer = '\0';
    992         return 1;
    993     }
    994 
    995     if (is_unified_ideograph(code)) {
    996         if (buflen < 28)
    997             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
    998             return 0;
    999         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
   1000         return 1;
   1001     }
   1002 
   1003     /* get offset into phrasebook */
   1004     offset = phrasebook_offset1[(code>>phrasebook_shift)];
   1005     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
   1006                                (code&((1<<phrasebook_shift)-1))];
   1007     if (!offset)
   1008         return 0;
   1009 
   1010     i = 0;
   1011 
   1012     for (;;) {
   1013         /* get word index */
   1014         word = phrasebook[offset] - phrasebook_short;
   1015         if (word >= 0) {
   1016             word = (word << 8) + phrasebook[offset+1];
   1017             offset += 2;
   1018         } else
   1019             word = phrasebook[offset++];
   1020         if (i) {
   1021             if (i > buflen)
   1022                 return 0; /* buffer overflow */
   1023             buffer[i++] = ' ';
   1024         }
   1025         /* copy word string from lexicon.  the last character in the
   1026            word has bit 7 set.  the last word in a string ends with
   1027            0x80 */
   1028         w = lexicon + lexicon_offset[word];
   1029         while (*w < 128) {
   1030             if (i >= buflen)
   1031                 return 0; /* buffer overflow */
   1032             buffer[i++] = *w++;
   1033         }
   1034         if (i >= buflen)
   1035             return 0; /* buffer overflow */
   1036         buffer[i++] = *w & 127;
   1037         if (*w == 128)
   1038             break; /* end of word */
   1039     }
   1040 
   1041     return 1;
   1042 }
   1043 
   1044 static int
   1045 _cmpname(PyObject *self, int code, const char* name, int namelen)
   1046 {
   1047     /* check if code corresponds to the given name */
   1048     int i;
   1049     char buffer[NAME_MAXLEN+1];
   1050     if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
   1051         return 0;
   1052     for (i = 0; i < namelen; i++) {
   1053         if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
   1054             return 0;
   1055     }
   1056     return buffer[namelen] == '\0';
   1057 }
   1058 
   1059 static void
   1060 find_syllable(const char *str, int *len, int *pos, int count, int column)
   1061 {
   1062     int i, len1;
   1063     *len = -1;
   1064     for (i = 0; i < count; i++) {
   1065         const char *s = hangul_syllables[i][column];
   1066         len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
   1067         if (len1 <= *len)
   1068             continue;
   1069         if (strncmp(str, s, len1) == 0) {
   1070             *len = len1;
   1071             *pos = i;
   1072         }
   1073     }
   1074     if (*len == -1) {
   1075         *len = 0;
   1076     }
   1077 }
   1078 
   1079 static int
   1080 _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
   1081 {
   1082     /* check if named sequences are allowed */
   1083     if (!with_named_seq && IS_NAMED_SEQ(cp))
   1084         return 0;
   1085     /* if the code point is in the PUA range that we use for aliases,
   1086      * convert it to obtain the right code point */
   1087     if (IS_ALIAS(cp))
   1088         *code = name_aliases[cp-aliases_start];
   1089     else
   1090         *code = cp;
   1091     return 1;
   1092 }
   1093 
   1094 static int
   1095 _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
   1096          int with_named_seq)
   1097 {
   1098     /* Return the code point associated with the given name.
   1099      * Named aliases are resolved too (unless self != NULL (i.e. we are using
   1100      * 3.2.0)).  If with_named_seq is 1, returns the PUA code point that we are
   1101      * using for the named sequence, and the caller must then convert it. */
   1102     unsigned int h, v;
   1103     unsigned int mask = code_size-1;
   1104     unsigned int i, incr;
   1105 
   1106     /* Check for hangul syllables. */
   1107     if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
   1108         int len, L = -1, V = -1, T = -1;
   1109         const char *pos = name + 16;
   1110         find_syllable(pos, &len, &L, LCount, 0);
   1111         pos += len;
   1112         find_syllable(pos, &len, &V, VCount, 1);
   1113         pos += len;
   1114         find_syllable(pos, &len, &T, TCount, 2);
   1115         pos += len;
   1116         if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
   1117             *code = SBase + (L*VCount+V)*TCount + T;
   1118             return 1;
   1119         }
   1120         /* Otherwise, it's an illegal syllable name. */
   1121         return 0;
   1122     }
   1123 
   1124     /* Check for unified ideographs. */
   1125     if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
   1126         /* Four or five hexdigits must follow. */
   1127         v = 0;
   1128         name += 22;
   1129         namelen -= 22;
   1130         if (namelen != 4 && namelen != 5)
   1131             return 0;
   1132         while (namelen--) {
   1133             v *= 16;
   1134             if (*name >= '0' && *name <= '9')
   1135                 v += *name - '0';
   1136             else if (*name >= 'A' && *name <= 'F')
   1137                 v += *name - 'A' + 10;
   1138             else
   1139                 return 0;
   1140             name++;
   1141         }
   1142         if (!is_unified_ideograph(v))
   1143             return 0;
   1144         *code = v;
   1145         return 1;
   1146     }
   1147 
   1148     /* the following is the same as python's dictionary lookup, with
   1149        only minor changes.  see the makeunicodedata script for more
   1150        details */
   1151 
   1152     h = (unsigned int) _gethash(name, namelen, code_magic);
   1153     i = (~h) & mask;
   1154     v = code_hash[i];
   1155     if (!v)
   1156         return 0;
   1157     if (_cmpname(self, v, name, namelen))
   1158         return _check_alias_and_seq(v, code, with_named_seq);
   1159     incr = (h ^ (h >> 3)) & mask;
   1160     if (!incr)
   1161         incr = mask;
   1162     for (;;) {
   1163         i = (i + incr) & mask;
   1164         v = code_hash[i];
   1165         if (!v)
   1166             return 0;
   1167         if (_cmpname(self, v, name, namelen))
   1168             return _check_alias_and_seq(v, code, with_named_seq);
   1169         incr = incr << 1;
   1170         if (incr > mask)
   1171             incr = incr ^ code_poly;
   1172     }
   1173 }
   1174 
   1175 static const _PyUnicode_Name_CAPI hashAPI =
   1176 {
   1177     sizeof(_PyUnicode_Name_CAPI),
   1178     _getucname,
   1179     _getcode
   1180 };
   1181 
   1182 /* -------------------------------------------------------------------- */
   1183 /* Python bindings */
   1184 
   1185 /*[clinic input]
   1186 unicodedata.UCD.name
   1187 
   1188     self: self
   1189     chr: int(accept={str})
   1190     default: object=NULL
   1191     /
   1192 
   1193 Returns the name assigned to the character chr as a string.
   1194 
   1195 If no name is defined, default is returned, or, if not given,
   1196 ValueError is raised.
   1197 [clinic start generated code]*/
   1198 
   1199 static PyObject *
   1200 unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
   1201 /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
   1202 {
   1203     char name[NAME_MAXLEN+1];
   1204     Py_UCS4 c = (Py_UCS4)chr;
   1205 
   1206     if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
   1207         if (default_value == NULL) {
   1208             PyErr_SetString(PyExc_ValueError, "no such name");
   1209             return NULL;
   1210         }
   1211         else {
   1212             Py_INCREF(default_value);
   1213             return default_value;
   1214         }
   1215     }
   1216 
   1217     return PyUnicode_FromString(name);
   1218 }
   1219 
   1220 /*[clinic input]
   1221 unicodedata.UCD.lookup
   1222 
   1223     self: self
   1224     name: str(accept={str, robuffer}, zeroes=True)
   1225     /
   1226 
   1227 Look up character by name.
   1228 
   1229 If a character with the given name is found, return the
   1230 corresponding character.  If not found, KeyError is raised.
   1231 [clinic start generated code]*/
   1232 
   1233 static PyObject *
   1234 unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
   1235                             Py_ssize_clean_t name_length)
   1236 /*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
   1237 {
   1238     Py_UCS4 code;
   1239     unsigned int index;
   1240     if (name_length > NAME_MAXLEN) {
   1241         PyErr_SetString(PyExc_KeyError, "name too long");
   1242         return NULL;
   1243     }
   1244 
   1245     if (!_getcode(self, name, (int)name_length, &code, 1)) {
   1246         PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
   1247         return NULL;
   1248     }
   1249     /* check if code is in the PUA range that we use for named sequences
   1250        and convert it */
   1251     if (IS_NAMED_SEQ(code)) {
   1252         index = code-named_sequences_start;
   1253         return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
   1254                                          named_sequences[index].seq,
   1255                                          named_sequences[index].seqlen);
   1256     }
   1257     return PyUnicode_FromOrdinal(code);
   1258 }
   1259 
   1260 /* XXX Add doc strings. */
   1261 
   1262 static PyMethodDef unicodedata_functions[] = {
   1263     UNICODEDATA_UCD_DECIMAL_METHODDEF
   1264     UNICODEDATA_UCD_DIGIT_METHODDEF
   1265     UNICODEDATA_UCD_NUMERIC_METHODDEF
   1266     UNICODEDATA_UCD_CATEGORY_METHODDEF
   1267     UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
   1268     UNICODEDATA_UCD_COMBINING_METHODDEF
   1269     UNICODEDATA_UCD_MIRRORED_METHODDEF
   1270     UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
   1271     UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
   1272     UNICODEDATA_UCD_NAME_METHODDEF
   1273     UNICODEDATA_UCD_LOOKUP_METHODDEF
   1274     UNICODEDATA_UCD_NORMALIZE_METHODDEF
   1275     {NULL, NULL}                /* sentinel */
   1276 };
   1277 
   1278 static PyTypeObject UCD_Type = {
   1279         /* The ob_type field must be initialized in the module init function
   1280          * to be portable to Windows without using C++. */
   1281         PyVarObject_HEAD_INIT(NULL, 0)
   1282         "unicodedata.UCD",              /*tp_name*/
   1283         sizeof(PreviousDBVersion),      /*tp_basicsize*/
   1284         0,                      /*tp_itemsize*/
   1285         /* methods */
   1286         (destructor)PyObject_Del, /*tp_dealloc*/
   1287         0,                      /*tp_print*/
   1288         0,                      /*tp_getattr*/
   1289         0,                      /*tp_setattr*/
   1290         0,                      /*tp_reserved*/
   1291         0,                      /*tp_repr*/
   1292         0,                      /*tp_as_number*/
   1293         0,                      /*tp_as_sequence*/
   1294         0,                      /*tp_as_mapping*/
   1295         0,                      /*tp_hash*/
   1296         0,                      /*tp_call*/
   1297         0,                      /*tp_str*/
   1298         PyObject_GenericGetAttr,/*tp_getattro*/
   1299         0,                      /*tp_setattro*/
   1300         0,                      /*tp_as_buffer*/
   1301         Py_TPFLAGS_DEFAULT,     /*tp_flags*/
   1302         0,                      /*tp_doc*/
   1303         0,                      /*tp_traverse*/
   1304         0,                      /*tp_clear*/
   1305         0,                      /*tp_richcompare*/
   1306         0,                      /*tp_weaklistoffset*/
   1307         0,                      /*tp_iter*/
   1308         0,                      /*tp_iternext*/
   1309         unicodedata_functions,  /*tp_methods*/
   1310         DB_members,             /*tp_members*/
   1311         0,                      /*tp_getset*/
   1312         0,                      /*tp_base*/
   1313         0,                      /*tp_dict*/
   1314         0,                      /*tp_descr_get*/
   1315         0,                      /*tp_descr_set*/
   1316         0,                      /*tp_dictoffset*/
   1317         0,                      /*tp_init*/
   1318         0,                      /*tp_alloc*/
   1319         0,                      /*tp_new*/
   1320         0,                      /*tp_free*/
   1321         0,                      /*tp_is_gc*/
   1322 };
   1323 
   1324 PyDoc_STRVAR(unicodedata_docstring,
   1325 "This module provides access to the Unicode Character Database which\n\
   1326 defines character properties for all Unicode characters. The data in\n\
   1327 this database is based on the UnicodeData.txt file version\n\
   1328 " UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
   1329 \n\
   1330 The module uses the same names and symbols as defined by the\n\
   1331 UnicodeData File Format " UNIDATA_VERSION ".");
   1332 
   1333 static struct PyModuleDef unicodedatamodule = {
   1334         PyModuleDef_HEAD_INIT,
   1335         "unicodedata",
   1336         unicodedata_docstring,
   1337         -1,
   1338         unicodedata_functions,
   1339         NULL,
   1340         NULL,
   1341         NULL,
   1342         NULL
   1343 };
   1344 
   1345 PyMODINIT_FUNC
   1346 PyInit_unicodedata(void)
   1347 {
   1348     PyObject *m, *v;
   1349 
   1350     Py_TYPE(&UCD_Type) = &PyType_Type;
   1351 
   1352     m = PyModule_Create(&unicodedatamodule);
   1353     if (!m)
   1354         return NULL;
   1355 
   1356     PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
   1357     Py_INCREF(&UCD_Type);
   1358     PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
   1359 
   1360     /* Previous versions */
   1361     v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
   1362     if (v != NULL)
   1363         PyModule_AddObject(m, "ucd_3_2_0", v);
   1364 
   1365     /* Export C API */
   1366     v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
   1367     if (v != NULL)
   1368         PyModule_AddObject(m, "ucnhash_CAPI", v);
   1369     return m;
   1370 }
   1371 
   1372 /*
   1373 Local variables:
   1374 c-basic-offset: 4
   1375 indent-tabs-mode: nil
   1376 End:
   1377 */
   1378