Home | History | Annotate | Download | only in Modules
      1 /* ------------------------------------------------------------------------
      2 
      3    unicodedata -- Provides access to the Unicode database.
      4 
      5    Data was extracted from the UnicodeData.txt file.
      6    The current version number is reported in the unidata_version constant.
      7 
      8    Written by Marc-Andre Lemburg (mal (at) lemburg.com).
      9    Modified for Python 2.0 by Fredrik Lundh (fredrik (at) pythonware.com)
     10    Modified by Martin v. Löwis (martin (at) v.loewis.de)
     11 
     12    Copyright (c) Corporation for National Research Initiatives.
     13 
     14    ------------------------------------------------------------------------ */
     15 
     16 #define PY_SSIZE_T_CLEAN
     17 
     18 #include "Python.h"
     19 #include "ucnhash.h"
     20 #include "structmember.h"
     21 
     22 /*[clinic input]
     23 module unicodedata
     24 class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
     25 [clinic start generated code]*/
     26 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
     27 
     28 /* character properties */
     29 
/* One row of per-character properties.  Rows are stored in
   _PyUnicode_Database_Records and located through _getrecord_ex(). */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;       /* index into
                                                   _PyUnicode_EastAsianWidthNames */
    const unsigned char normalization_quick_check; /* packed quick-check bits,
                                                      two per normalization
                                                      form; see is_normalized()
                                                      for the bit layout */
} _PyUnicode_DatabaseRecord;
     41 
/* Per-character differences between the current database and an older
   Unicode version.  The accessors in this file treat 0xFF in an unsigned
   char field as "no override, use the current value", and treat
   category_changed == 0 as "character unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    /* NOTE(review): not read by any accessor in this part of the file;
       presumably the old numeric value -- confirm against the generator. */
    const double numeric_changed;
} change_record;
     51 
     52 /* data file generated by Tools/unicode/makeunicodedata.py */
     53 #include "unicodedata_db.h"
     54 
     55 static const _PyUnicode_DatabaseRecord*
     56 _getrecord_ex(Py_UCS4 code)
     57 {
     58     int index;
     59     if (code >= 0x110000)
     60         index = 0;
     61     else {
     62         index = index1[(code>>SHIFT)];
     63         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
     64     }
     65 
     66     return &_PyUnicode_Database_Records[index];
     67 }
     68 
     69 /* ------------- Previous-version API ------------------------------------- */
/* Python object that represents an older version of the Unicode database.
   Instances are created by new_previous_version(). */
typedef struct previous_version {
    PyObject_HEAD
    /* Exposed to Python as the read-only "unidata_version" member. */
    const char *name;
    /* Returns the change record for a code point (see get_old_record). */
    const change_record* (*getrecord)(Py_UCS4);
    /* Returns a replacement code point for normalization, or 0 when the
       code point needs no special handling (see nfd_nfkd). */
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;
     76 
     77 #include "clinic/unicodedata.c.h"
     78 
     79 #define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))
     80 
     81 static PyMemberDef DB_members[] = {
     82         {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
     83         {NULL}
     84 };
     85 
     86 /* forward declaration */
     87 static PyTypeObject UCD_Type;
     88 #define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
     89 
     90 static PyObject*
     91 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
     92                      Py_UCS4 (*normalization)(Py_UCS4))
     93 {
     94         PreviousDBVersion *self;
     95         self = PyObject_New(PreviousDBVersion, &UCD_Type);
     96         if (self == NULL)
     97                 return NULL;
     98         self->name = name;
     99         self->getrecord = getrecord;
    100         self->normalization = normalization;
    101         return (PyObject*)self;
    102 }
    103 
    104 
    105 /* --- Module API --------------------------------------------------------- */
    106 
    107 /*[clinic input]
    108 unicodedata.UCD.decimal
    109 
    110     self: self
    111     chr: int(accept={str})
    112     default: object=NULL
    113     /
    114 
    115 Converts a Unicode character into its equivalent decimal value.
    116 
    117 Returns the decimal value assigned to the character chr as integer.
    118 If no such value is defined, default is returned, or, if not given,
    119 ValueError is raised.
    120 [clinic start generated code]*/
    121 
    122 static PyObject *
    123 unicodedata_UCD_decimal_impl(PyObject *self, int chr,
    124                              PyObject *default_value)
    125 /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
    126 {
    127     int have_old = 0;
    128     long rc;
    129     Py_UCS4 c = (Py_UCS4)chr;
    130 
    131     if (self && UCD_Check(self)) {
    132         const change_record *old = get_old_record(self, c);
    133         if (old->category_changed == 0) {
    134             /* unassigned */
    135             have_old = 1;
    136             rc = -1;
    137         }
    138         else if (old->decimal_changed != 0xFF) {
    139             have_old = 1;
    140             rc = old->decimal_changed;
    141         }
    142     }
    143 
    144     if (!have_old)
    145         rc = Py_UNICODE_TODECIMAL(c);
    146     if (rc < 0) {
    147         if (default_value == NULL) {
    148             PyErr_SetString(PyExc_ValueError,
    149                             "not a decimal");
    150             return NULL;
    151         }
    152         else {
    153             Py_INCREF(default_value);
    154             return default_value;
    155         }
    156     }
    157     return PyLong_FromLong(rc);
    158 }
    159 
    160 /*[clinic input]
    161 unicodedata.UCD.digit
    162 
    163     self: self
    164     chr: int(accept={str})
    165     default: object=NULL
    166     /
    167 
    168 Converts a Unicode character into its equivalent digit value.
    169 
    170 Returns the digit value assigned to the character chr as integer.
    171 If no such value is defined, default is returned, or, if not given,
    172 ValueError is raised.
    173 [clinic start generated code]*/
    174 
    175 static PyObject *
    176 unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
    177 /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
    178 {
    179     long rc;
    180     Py_UCS4 c = (Py_UCS4)chr;
    181     rc = Py_UNICODE_TODIGIT(c);
    182     if (rc < 0) {
    183         if (default_value == NULL) {
    184             PyErr_SetString(PyExc_ValueError, "not a digit");
    185             return NULL;
    186         }
    187         else {
    188             Py_INCREF(default_value);
    189             return default_value;
    190         }
    191     }
    192     return PyLong_FromLong(rc);
    193 }
    194 
    195 /*[clinic input]
    196 unicodedata.UCD.numeric
    197 
    198     self: self
    199     chr: int(accept={str})
    200     default: object=NULL
    201     /
    202 
    203 Converts a Unicode character into its equivalent numeric value.
    204 
    205 Returns the numeric value assigned to the character chr as float.
    206 If no such value is defined, default is returned, or, if not given,
    207 ValueError is raised.
    208 [clinic start generated code]*/
    209 
    210 static PyObject *
    211 unicodedata_UCD_numeric_impl(PyObject *self, int chr,
    212                              PyObject *default_value)
    213 /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
    214 {
    215     int have_old = 0;
    216     double rc;
    217     Py_UCS4 c = (Py_UCS4)chr;
    218 
    219     if (self && UCD_Check(self)) {
    220         const change_record *old = get_old_record(self, c);
    221         if (old->category_changed == 0) {
    222             /* unassigned */
    223             have_old = 1;
    224             rc = -1.0;
    225         }
    226         else if (old->decimal_changed != 0xFF) {
    227             have_old = 1;
    228             rc = old->decimal_changed;
    229         }
    230     }
    231 
    232     if (!have_old)
    233         rc = Py_UNICODE_TONUMERIC(c);
    234     if (rc == -1.0) {
    235         if (default_value == NULL) {
    236             PyErr_SetString(PyExc_ValueError, "not a numeric character");
    237             return NULL;
    238         }
    239         else {
    240             Py_INCREF(default_value);
    241             return default_value;
    242         }
    243     }
    244     return PyFloat_FromDouble(rc);
    245 }
    246 
    247 /*[clinic input]
    248 unicodedata.UCD.category
    249 
    250     self: self
    251     chr: int(accept={str})
    252     /
    253 
    254 Returns the general category assigned to the character chr as string.
    255 [clinic start generated code]*/
    256 
    257 static PyObject *
    258 unicodedata_UCD_category_impl(PyObject *self, int chr)
    259 /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
    260 {
    261     int index;
    262     Py_UCS4 c = (Py_UCS4)chr;
    263     index = (int) _getrecord_ex(c)->category;
    264     if (self && UCD_Check(self)) {
    265         const change_record *old = get_old_record(self, c);
    266         if (old->category_changed != 0xFF)
    267             index = old->category_changed;
    268     }
    269     return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
    270 }
    271 
    272 /*[clinic input]
    273 unicodedata.UCD.bidirectional
    274 
    275     self: self
    276     chr: int(accept={str})
    277     /
    278 
    279 Returns the bidirectional class assigned to the character chr as string.
    280 
    281 If no such value is defined, an empty string is returned.
    282 [clinic start generated code]*/
    283 
    284 static PyObject *
    285 unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
    286 /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
    287 {
    288     int index;
    289     Py_UCS4 c = (Py_UCS4)chr;
    290     index = (int) _getrecord_ex(c)->bidirectional;
    291     if (self && UCD_Check(self)) {
    292         const change_record *old = get_old_record(self, c);
    293         if (old->category_changed == 0)
    294             index = 0; /* unassigned */
    295         else if (old->bidir_changed != 0xFF)
    296             index = old->bidir_changed;
    297     }
    298     return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
    299 }
    300 
    301 /*[clinic input]
    302 unicodedata.UCD.combining -> int
    303 
    304     self: self
    305     chr: int(accept={str})
    306     /
    307 
    308 Returns the canonical combining class assigned to the character chr as integer.
    309 
    310 Returns 0 if no combining class is defined.
    311 [clinic start generated code]*/
    312 
    313 static int
    314 unicodedata_UCD_combining_impl(PyObject *self, int chr)
    315 /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
    316 {
    317     int index;
    318     Py_UCS4 c = (Py_UCS4)chr;
    319     index = (int) _getrecord_ex(c)->combining;
    320     if (self && UCD_Check(self)) {
    321         const change_record *old = get_old_record(self, c);
    322         if (old->category_changed == 0)
    323             index = 0; /* unassigned */
    324     }
    325     return index;
    326 }
    327 
    328 /*[clinic input]
    329 unicodedata.UCD.mirrored -> int
    330 
    331     self: self
    332     chr: int(accept={str})
    333     /
    334 
    335 Returns the mirrored property assigned to the character chr as integer.
    336 
    337 Returns 1 if the character has been identified as a "mirrored"
    338 character in bidirectional text, 0 otherwise.
    339 [clinic start generated code]*/
    340 
    341 static int
    342 unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
    343 /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
    344 {
    345     int index;
    346     Py_UCS4 c = (Py_UCS4)chr;
    347     index = (int) _getrecord_ex(c)->mirrored;
    348     if (self && UCD_Check(self)) {
    349         const change_record *old = get_old_record(self, c);
    350         if (old->category_changed == 0)
    351             index = 0; /* unassigned */
    352         else if (old->mirrored_changed != 0xFF)
    353             index = old->mirrored_changed;
    354     }
    355     return index;
    356 }
    357 
    358 /*[clinic input]
    359 unicodedata.UCD.east_asian_width
    360 
    361     self: self
    362     chr: int(accept={str})
    363     /
    364 
    365 Returns the east asian width assigned to the character chr as string.
    366 [clinic start generated code]*/
    367 
    368 static PyObject *
    369 unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
    370 /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
    371 {
    372     int index;
    373     Py_UCS4 c = (Py_UCS4)chr;
    374     index = (int) _getrecord_ex(c)->east_asian_width;
    375     if (self && UCD_Check(self)) {
    376         const change_record *old = get_old_record(self, c);
    377         if (old->category_changed == 0)
    378             index = 0; /* unassigned */
    379         else if (old->east_asian_width_changed != 0xFF)
    380             index = old->east_asian_width_changed;
    381     }
    382     return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
    383 }
    384 
    385 /*[clinic input]
    386 unicodedata.UCD.decomposition
    387 
    388     self: self
    389     chr: int(accept={str})
    390     /
    391 
    392 Returns the character decomposition mapping assigned to the character chr as string.
    393 
    394 An empty string is returned in case no such mapping is defined.
    395 [clinic start generated code]*/
    396 
    397 static PyObject *
    398 unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
    399 /*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
    400 {
    401     char decomp[256];
    402     int code, index, count;
    403     size_t i;
    404     unsigned int prefix_index;
    405     Py_UCS4 c = (Py_UCS4)chr;
    406 
    407     code = (int)c;
    408 
    409     if (self && UCD_Check(self)) {
    410         const change_record *old = get_old_record(self, c);
    411         if (old->category_changed == 0)
    412             return PyUnicode_FromString(""); /* unassigned */
    413     }
    414 
    415     if (code < 0 || code >= 0x110000)
    416         index = 0;
    417     else {
    418         index = decomp_index1[(code>>DECOMP_SHIFT)];
    419         index = decomp_index2[(index<<DECOMP_SHIFT)+
    420                              (code&((1<<DECOMP_SHIFT)-1))];
    421     }
    422 
    423     /* high byte is number of hex bytes (usually one or two), low byte
    424        is prefix code (from*/
    425     count = decomp_data[index] >> 8;
    426 
    427     /* XXX: could allocate the PyString up front instead
    428        (strlen(prefix) + 5 * count + 1 bytes) */
    429 
    430     /* Based on how index is calculated above and decomp_data is generated
    431        from Tools/unicode/makeunicodedata.py, it should not be possible
    432        to overflow decomp_prefix. */
    433     prefix_index = decomp_data[index] & 255;
    434     assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
    435 
    436     /* copy prefix */
    437     i = strlen(decomp_prefix[prefix_index]);
    438     memcpy(decomp, decomp_prefix[prefix_index], i);
    439 
    440     while (count-- > 0) {
    441         if (i)
    442             decomp[i++] = ' ';
    443         assert(i < sizeof(decomp));
    444         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
    445                       decomp_data[++index]);
    446         i += strlen(decomp + i);
    447     }
    448     return PyUnicode_FromStringAndSize(decomp, i);
    449 }
    450 
    451 static void
    452 get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
    453 {
    454     if (code >= 0x110000) {
    455         *index = 0;
    456     } else if (self && UCD_Check(self) &&
    457                get_old_record(self, code)->category_changed==0) {
    458         /* unassigned in old version */
    459         *index = 0;
    460     }
    461     else {
    462         *index = decomp_index1[(code>>DECOMP_SHIFT)];
    463         *index = decomp_index2[(*index<<DECOMP_SHIFT)+
    464                                (code&((1<<DECOMP_SHIFT)-1))];
    465     }
    466 
    467     /* high byte is number of hex bytes (usually one or two), low byte
    468        is prefix code (from*/
    469     *count = decomp_data[*index] >> 8;
    470     *prefix = decomp_data[*index] & 255;
    471 
    472     (*index)++;
    473 }
    474 
/* Constants for the arithmetic Hangul syllable decomposition and
   composition performed in nfd_nfkd() and nfc_nfkc().  A syllable index
   (code - SBase) is split into L, V and T parts. */
#define SBase   0xAC00  /* first code point of the syllable range */
#define LBase   0x1100  /* base added to the L part index */
#define VBase   0x1161  /* base added to the V part index */
#define TBase   0x11A7  /* base added to the T part index; T == TBase
                           means the syllable has no T part */
#define LCount  19      /* number of L values */
#define VCount  21      /* number of V values */
#define TCount  28      /* number of T values, including "none" */
#define NCount  (VCount*TCount)  /* syllables per L value */
#define SCount  (LCount*NCount)  /* size of the syllable range */
    484 
/* Build the decomposed form of `input` as a new string object.

   k == 0 requests canonical decomposition only; with a non-zero k,
   decompositions that carry a prefix code are expanded as well.  When
   `self` is a UCD object for an older database version, its
   normalization() hook and change records are consulted too.

   The canonical-ordering pass reads index 0 of the result without checking
   its length; unicodedata_UCD_normalize_impl() filters out empty input
   before calling.

   Returns a new reference, or NULL with an exception set. */
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UCS4 *output;
    Py_ssize_t i, o, osize;
    int kind;
    void *data;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UCS4 stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_LENGTH(input);
    space = isize;
    /* Overallocate at most 10 characters. */
    if (space > 10) {
        if (space <= PY_SSIZE_T_MAX - 10)
            space += 10;
    }
    else {
        space *= 2;
    }
    /* osize is the allocated length of output; space is how many slots of
       it are still free. */
    osize = space;
    output = PyMem_NEW(Py_UCS4, space);
    if (!output) {
        PyErr_NoMemory();
        return NULL;
    }
    i = o = 0;
    kind = PyUnicode_KIND(input);
    data = PyUnicode_DATA(input);

    /* Each input character is pushed on a small stack and expanded until
       only non-decomposable characters have been written to output. */
    while (i < isize) {
        stack[stackptr++] = PyUnicode_READ(kind, data, i++);
        while(stackptr) {
            Py_UCS4 code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                Py_UCS4 *new_output;
                osize += 10;
                space += 10;
                new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
                if (new_output == NULL) {
                    /* The old buffer is still valid after a failed
                       realloc and must be released here. */
                    PyMem_Free(output);
                    PyErr_NoMemory();
                    return NULL;
                }
                output = new_output;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                output[o++] = L;
                output[o++] = V;
                space -= 2;
                /* T == TBase means there is no third part to emit. */
                if (T != TBase) {
                    output[o++] = T;
                    space --;
                }
                continue;
            }
            /* normalization changes */
            if (self && UCD_Check(self)) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    /* Re-process the replacement character instead. */
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                output[o++] = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order.  */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    if (!result)
        return NULL;
    /* result is guaranteed to be ready, as it is compact. */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);

    /* Sort canonically. */
    /* Characters are reordered in place inside the freshly created result:
       a character with a non-zero combining class is moved left past
       preceding characters whose combining class is larger, stopping at
       any character with combining class 0. */
    i = 0;
    prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
        cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
            PyUnicode_WRITE(kind, data, o+1,
                            PyUnicode_READ(kind, data, o));
            PyUnicode_WRITE(kind, data, o, tmp);
            o--;
            if (o < 0)
                break;
            prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        /* Position i now holds a different character; refresh prev. */
        prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    }
    return result;
}
    617 
    618 static int
    619 find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
    620 {
    621     unsigned int index;
    622     for (index = 0; nfc[index].start; index++) {
    623         unsigned int start = nfc[index].start;
    624         if (code < start)
    625             return -1;
    626         if (code <= start + nfc[index].count) {
    627             unsigned int delta = code - start;
    628             return nfc[index].index + delta;
    629         }
    630     }
    631     return -1;
    632 }
    633 
    634 static PyObject*
    635 nfc_nfkc(PyObject *self, PyObject *input, int k)
    636 {
    637     PyObject *result;
    638     int kind;
    639     void *data;
    640     Py_UCS4 *output;
    641     Py_ssize_t i, i1, o, len;
    642     int f,l,index,index1,comb;
    643     Py_UCS4 code;
    644     Py_ssize_t skipped[20];
    645     int cskipped = 0;
    646 
    647     result = nfd_nfkd(self, input, k);
    648     if (!result)
    649         return NULL;
    650     /* result will be "ready". */
    651     kind = PyUnicode_KIND(result);
    652     data = PyUnicode_DATA(result);
    653     len = PyUnicode_GET_LENGTH(result);
    654 
    655     /* We allocate a buffer for the output.
    656        If we find that we made no changes, we still return
    657        the NFD result. */
    658     output = PyMem_NEW(Py_UCS4, len);
    659     if (!output) {
    660         PyErr_NoMemory();
    661         Py_DECREF(result);
    662         return 0;
    663     }
    664     i = o = 0;
    665 
    666   again:
    667     while (i < len) {
    668       for (index = 0; index < cskipped; index++) {
    669           if (skipped[index] == i) {
    670               /* *i character is skipped.
    671                  Remove from list. */
    672               skipped[index] = skipped[cskipped-1];
    673               cskipped--;
    674               i++;
    675               goto again; /* continue while */
    676           }
    677       }
    678       /* Hangul Composition. We don't need to check for <LV,T>
    679          pairs, since we always have decomposed data. */
    680       code = PyUnicode_READ(kind, data, i);
    681       if (LBase <= code && code < (LBase+LCount) &&
    682           i + 1 < len &&
    683           VBase <= PyUnicode_READ(kind, data, i+1) &&
    684           PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
    685           int LIndex, VIndex;
    686           LIndex = code - LBase;
    687           VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
    688           code = SBase + (LIndex*VCount+VIndex)*TCount;
    689           i+=2;
    690           if (i < len &&
    691               TBase <= PyUnicode_READ(kind, data, i) &&
    692               PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
    693               code += PyUnicode_READ(kind, data, i)-TBase;
    694               i++;
    695           }
    696           output[o++] = code;
    697           continue;
    698       }
    699 
    700       /* code is still input[i] here */
    701       f = find_nfc_index(self, nfc_first, code);
    702       if (f == -1) {
    703           output[o++] = code;
    704           i++;
    705           continue;
    706       }
    707       /* Find next unblocked character. */
    708       i1 = i+1;
    709       comb = 0;
    710       /* output base character for now; might be updated later. */
    711       output[o] = PyUnicode_READ(kind, data, i);
    712       while (i1 < len) {
    713           Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
    714           int comb1 = _getrecord_ex(code1)->combining;
    715           if (comb) {
    716               if (comb1 == 0)
    717                   break;
    718               if (comb >= comb1) {
    719                   /* Character is blocked. */
    720                   i1++;
    721                   continue;
    722               }
    723           }
    724           l = find_nfc_index(self, nfc_last, code1);
    725           /* i1 cannot be combined with i. If i1
    726              is a starter, we don't need to look further.
    727              Otherwise, record the combining class. */
    728           if (l == -1) {
    729             not_combinable:
    730               if (comb1 == 0)
    731                   break;
    732               comb = comb1;
    733               i1++;
    734               continue;
    735           }
    736           index = f*TOTAL_LAST + l;
    737           index1 = comp_index[index >> COMP_SHIFT];
    738           code = comp_data[(index1<<COMP_SHIFT)+
    739                            (index&((1<<COMP_SHIFT)-1))];
    740           if (code == 0)
    741               goto not_combinable;
    742 
    743           /* Replace the original character. */
    744           output[o] = code;
    745           /* Mark the second character unused. */
    746           assert(cskipped < 20);
    747           skipped[cskipped++] = i1;
    748           i1++;
    749           f = find_nfc_index(self, nfc_first, output[o]);
    750           if (f == -1)
    751               break;
    752       }
    753       /* Output character was already written.
    754          Just advance the indices. */
    755       o++; i++;
    756     }
    757     if (o == len) {
    758         /* No changes. Return original string. */
    759         PyMem_Free(output);
    760         return result;
    761     }
    762     Py_DECREF(result);
    763     result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
    764                                        output, o);
    765     PyMem_Free(output);
    766     return result;
    767 }
    768 
    769 /* Return 1 if the input is certainly normalized, 0 if it might not be. */
    770 static int
    771 is_normalized(PyObject *self, PyObject *input, int nfc, int k)
    772 {
    773     Py_ssize_t i, len;
    774     int kind;
    775     void *data;
    776     unsigned char prev_combining = 0, quickcheck_mask;
    777 
    778     /* An older version of the database is requested, quickchecks must be
    779        disabled. */
    780     if (self && UCD_Check(self))
    781         return 0;
    782 
    783     /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
    784        as described in http://unicode.org/reports/tr15/#Annex8. */
    785     quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
    786 
    787     i = 0;
    788     kind = PyUnicode_KIND(input);
    789     data = PyUnicode_DATA(input);
    790     len = PyUnicode_GET_LENGTH(input);
    791     while (i < len) {
    792         Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
    793         const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
    794         unsigned char combining = record->combining;
    795         unsigned char quickcheck = record->normalization_quick_check;
    796 
    797         if (quickcheck & quickcheck_mask)
    798             return 0; /* this string might need normalization */
    799         if (combining && prev_combining > combining)
    800             return 0; /* non-canonical sort order, not normalized */
    801         prev_combining = combining;
    802     }
    803     return 1; /* certainly normalized */
    804 }
    805 
    806 /*[clinic input]
    807 unicodedata.UCD.normalize
    808 
    809     self: self
    810     form: str
    811     unistr as input: object(subclass_of='&PyUnicode_Type')
    812     /
    813 
    814 Return the normal form 'form' for the Unicode string unistr.
    815 
    816 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
    817 [clinic start generated code]*/
    818 
    819 static PyObject *
    820 unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
    821                                PyObject *input)
    822 /*[clinic end generated code: output=62d1f8870027efdc input=cd092e631cf11883]*/
    823 {
    824     if (PyUnicode_READY(input) == -1)
    825         return NULL;
    826 
    827     if (PyUnicode_GET_LENGTH(input) == 0) {
    828         /* Special case empty input strings, since resizing
    829            them  later would cause internal errors. */
    830         Py_INCREF(input);
    831         return input;
    832     }
    833 
    834     if (strcmp(form, "NFC") == 0) {
    835         if (is_normalized(self, input, 1, 0)) {
    836             Py_INCREF(input);
    837             return input;
    838         }
    839         return nfc_nfkc(self, input, 0);
    840     }
    841     if (strcmp(form, "NFKC") == 0) {
    842         if (is_normalized(self, input, 1, 1)) {
    843             Py_INCREF(input);
    844             return input;
    845         }
    846         return nfc_nfkc(self, input, 1);
    847     }
    848     if (strcmp(form, "NFD") == 0) {
    849         if (is_normalized(self, input, 0, 0)) {
    850             Py_INCREF(input);
    851             return input;
    852         }
    853         return nfd_nfkd(self, input, 0);
    854     }
    855     if (strcmp(form, "NFKD") == 0) {
    856         if (is_normalized(self, input, 0, 1)) {
    857             Py_INCREF(input);
    858             return input;
    859         }
    860         return nfd_nfkd(self, input, 1);
    861     }
    862     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    863     return NULL;
    864 }
    865 
    866 /* -------------------------------------------------------------------- */
    867 /* unicode character name tables */
    868 
    869 /* data file generated by Tools/unicode/makeunicodedata.py */
    870 #include "unicodename_db.h"
    871 
    872 /* -------------------------------------------------------------------- */
    873 /* database code (cut and pasted from the unidb package) */
    874 
    875 static unsigned long
    876 _gethash(const char *s, int len, int scale)
    877 {
    878     int i;
    879     unsigned long h = 0;
    880     unsigned long ix;
    881     for (i = 0; i < len; i++) {
    882         h = (h * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[i]));
    883         ix = h & 0xff000000;
    884         if (ix)
    885             h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    886     }
    887     return h;
    888 }
    889 
    890 static const char * const hangul_syllables[][3] = {
    891     { "G",  "A",   ""   },
    892     { "GG", "AE",  "G"  },
    893     { "N",  "YA",  "GG" },
    894     { "D",  "YAE", "GS" },
    895     { "DD", "EO",  "N", },
    896     { "R",  "E",   "NJ" },
    897     { "M",  "YEO", "NH" },
    898     { "B",  "YE",  "D"  },
    899     { "BB", "O",   "L"  },
    900     { "S",  "WA",  "LG" },
    901     { "SS", "WAE", "LM" },
    902     { "",   "OE",  "LB" },
    903     { "J",  "YO",  "LS" },
    904     { "JJ", "U",   "LT" },
    905     { "C",  "WEO", "LP" },
    906     { "K",  "WE",  "LH" },
    907     { "T",  "WI",  "M"  },
    908     { "P",  "YU",  "B"  },
    909     { "H",  "EU",  "BS" },
    910     { 0,    "YI",  "S"  },
    911     { 0,    "I",   "SS" },
    912     { 0,    0,     "NG" },
    913     { 0,    0,     "J"  },
    914     { 0,    0,     "C"  },
    915     { 0,    0,     "K"  },
    916     { 0,    0,     "T"  },
    917     { 0,    0,     "P"  },
    918     { 0,    0,     "H"  }
    919 };
    920 
    921 /* These ranges need to match makeunicodedata.py:cjk_ranges. */
    922 static int
    923 is_unified_ideograph(Py_UCS4 code)
    924 {
    925     return
    926         (0x3400 <= code && code <= 0x4DB5)   || /* CJK Ideograph Extension A */
    927         (0x4E00 <= code && code <= 0x9FD5)   || /* CJK Ideograph */
    928         (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
    929         (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
    930         (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
    931         (0x2B820 <= code && code <= 0x2CEA1);   /* CJK Ideograph Extension E */
    932 }
    933 
    934 /* macros used to determine if the given code point is in the PUA range that
    935  * we are using to store aliases and named sequences */
    936 #define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
    937 #define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
    938                           (cp < named_sequences_end))
    939 
    940 static int
    941 _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
    942            int with_alias_and_seq)
    943 {
    944     /* Find the name associated with the given code point.
    945      * If with_alias_and_seq is 1, check for names in the Private Use Area 15
    946      * that we are using for aliases and named sequences. */
    947     int offset;
    948     int i;
    949     int word;
    950     unsigned char* w;
    951 
    952     if (code >= 0x110000)
    953         return 0;
    954 
    955     /* XXX should we just skip all the code points in the PUAs here? */
    956     if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
    957         return 0;
    958 
    959     if (self && UCD_Check(self)) {
    960         /* in 3.2.0 there are no aliases and named sequences */
    961         const change_record *old;
    962         if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
    963             return 0;
    964         old = get_old_record(self, code);
    965         if (old->category_changed == 0) {
    966             /* unassigned */
    967             return 0;
    968         }
    969     }
    970 
    971     if (SBase <= code && code < SBase+SCount) {
    972         /* Hangul syllable. */
    973         int SIndex = code - SBase;
    974         int L = SIndex / NCount;
    975         int V = (SIndex % NCount) / TCount;
    976         int T = SIndex % TCount;
    977 
    978         if (buflen < 27)
    979             /* Worst case: HANGUL SYLLABLE <10chars>. */
    980             return 0;
    981         strcpy(buffer, "HANGUL SYLLABLE ");
    982         buffer += 16;
    983         strcpy(buffer, hangul_syllables[L][0]);
    984         buffer += strlen(hangul_syllables[L][0]);
    985         strcpy(buffer, hangul_syllables[V][1]);
    986         buffer += strlen(hangul_syllables[V][1]);
    987         strcpy(buffer, hangul_syllables[T][2]);
    988         buffer += strlen(hangul_syllables[T][2]);
    989         *buffer = '\0';
    990         return 1;
    991     }
    992 
    993     if (is_unified_ideograph(code)) {
    994         if (buflen < 28)
    995             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
    996             return 0;
    997         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
    998         return 1;
    999     }
   1000 
   1001     /* get offset into phrasebook */
   1002     offset = phrasebook_offset1[(code>>phrasebook_shift)];
   1003     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
   1004                                (code&((1<<phrasebook_shift)-1))];
   1005     if (!offset)
   1006         return 0;
   1007 
   1008     i = 0;
   1009 
   1010     for (;;) {
   1011         /* get word index */
   1012         word = phrasebook[offset] - phrasebook_short;
   1013         if (word >= 0) {
   1014             word = (word << 8) + phrasebook[offset+1];
   1015             offset += 2;
   1016         } else
   1017             word = phrasebook[offset++];
   1018         if (i) {
   1019             if (i > buflen)
   1020                 return 0; /* buffer overflow */
   1021             buffer[i++] = ' ';
   1022         }
   1023         /* copy word string from lexicon.  the last character in the
   1024            word has bit 7 set.  the last word in a string ends with
   1025            0x80 */
   1026         w = lexicon + lexicon_offset[word];
   1027         while (*w < 128) {
   1028             if (i >= buflen)
   1029                 return 0; /* buffer overflow */
   1030             buffer[i++] = *w++;
   1031         }
   1032         if (i >= buflen)
   1033             return 0; /* buffer overflow */
   1034         buffer[i++] = *w & 127;
   1035         if (*w == 128)
   1036             break; /* end of word */
   1037     }
   1038 
   1039     return 1;
   1040 }
   1041 
   1042 static int
   1043 _cmpname(PyObject *self, int code, const char* name, int namelen)
   1044 {
   1045     /* check if code corresponds to the given name */
   1046     int i;
   1047     char buffer[NAME_MAXLEN+1];
   1048     if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
   1049         return 0;
   1050     for (i = 0; i < namelen; i++) {
   1051         if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
   1052             return 0;
   1053     }
   1054     return buffer[namelen] == '\0';
   1055 }
   1056 
   1057 static void
   1058 find_syllable(const char *str, int *len, int *pos, int count, int column)
   1059 {
   1060     int i, len1;
   1061     *len = -1;
   1062     for (i = 0; i < count; i++) {
   1063         const char *s = hangul_syllables[i][column];
   1064         len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
   1065         if (len1 <= *len)
   1066             continue;
   1067         if (strncmp(str, s, len1) == 0) {
   1068             *len = len1;
   1069             *pos = i;
   1070         }
   1071     }
   1072     if (*len == -1) {
   1073         *len = 0;
   1074     }
   1075 }
   1076 
   1077 static int
   1078 _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
   1079 {
   1080     /* check if named sequences are allowed */
   1081     if (!with_named_seq && IS_NAMED_SEQ(cp))
   1082         return 0;
   1083     /* if the code point is in the PUA range that we use for aliases,
   1084      * convert it to obtain the right code point */
   1085     if (IS_ALIAS(cp))
   1086         *code = name_aliases[cp-aliases_start];
   1087     else
   1088         *code = cp;
   1089     return 1;
   1090 }
   1091 
   1092 static int
   1093 _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
   1094          int with_named_seq)
   1095 {
   1096     /* Return the code point associated with the given name.
   1097      * Named aliases are resolved too (unless self != NULL (i.e. we are using
   1098      * 3.2.0)).  If with_named_seq is 1, returns the PUA code point that we are
   1099      * using for the named sequence, and the caller must then convert it. */
   1100     unsigned int h, v;
   1101     unsigned int mask = code_size-1;
   1102     unsigned int i, incr;
   1103 
   1104     /* Check for hangul syllables. */
   1105     if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
   1106         int len, L = -1, V = -1, T = -1;
   1107         const char *pos = name + 16;
   1108         find_syllable(pos, &len, &L, LCount, 0);
   1109         pos += len;
   1110         find_syllable(pos, &len, &V, VCount, 1);
   1111         pos += len;
   1112         find_syllable(pos, &len, &T, TCount, 2);
   1113         pos += len;
   1114         if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
   1115             *code = SBase + (L*VCount+V)*TCount + T;
   1116             return 1;
   1117         }
   1118         /* Otherwise, it's an illegal syllable name. */
   1119         return 0;
   1120     }
   1121 
   1122     /* Check for unified ideographs. */
   1123     if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
   1124         /* Four or five hexdigits must follow. */
   1125         v = 0;
   1126         name += 22;
   1127         namelen -= 22;
   1128         if (namelen != 4 && namelen != 5)
   1129             return 0;
   1130         while (namelen--) {
   1131             v *= 16;
   1132             if (*name >= '0' && *name <= '9')
   1133                 v += *name - '0';
   1134             else if (*name >= 'A' && *name <= 'F')
   1135                 v += *name - 'A' + 10;
   1136             else
   1137                 return 0;
   1138             name++;
   1139         }
   1140         if (!is_unified_ideograph(v))
   1141             return 0;
   1142         *code = v;
   1143         return 1;
   1144     }
   1145 
   1146     /* the following is the same as python's dictionary lookup, with
   1147        only minor changes.  see the makeunicodedata script for more
   1148        details */
   1149 
   1150     h = (unsigned int) _gethash(name, namelen, code_magic);
   1151     i = (~h) & mask;
   1152     v = code_hash[i];
   1153     if (!v)
   1154         return 0;
   1155     if (_cmpname(self, v, name, namelen))
   1156         return _check_alias_and_seq(v, code, with_named_seq);
   1157     incr = (h ^ (h >> 3)) & mask;
   1158     if (!incr)
   1159         incr = mask;
   1160     for (;;) {
   1161         i = (i + incr) & mask;
   1162         v = code_hash[i];
   1163         if (!v)
   1164             return 0;
   1165         if (_cmpname(self, v, name, namelen))
   1166             return _check_alias_and_seq(v, code, with_named_seq);
   1167         incr = incr << 1;
   1168         if (incr > mask)
   1169             incr = incr ^ code_poly;
   1170     }
   1171 }
   1172 
/* Function table exposing the name lookup code to C callers: the size of
   the table followed by the code point -> name and name -> code point
   functions.  PyInit_unicodedata() wraps its address in a capsule named
   PyUnicodeData_CAPSULE_NAME and adds it to the module as ucnhash_CAPI. */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};
   1179 
   1180 /* -------------------------------------------------------------------- */
   1181 /* Python bindings */
   1182 
   1183 /*[clinic input]
   1184 unicodedata.UCD.name
   1185 
   1186     self: self
   1187     chr: int(accept={str})
   1188     default: object=NULL
   1189     /
   1190 
   1191 Returns the name assigned to the character chr as a string.
   1192 
   1193 If no name is defined, default is returned, or, if not given,
   1194 ValueError is raised.
   1195 [clinic start generated code]*/
   1196 
   1197 static PyObject *
   1198 unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
   1199 /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
   1200 {
   1201     char name[NAME_MAXLEN+1];
   1202     Py_UCS4 c = (Py_UCS4)chr;
   1203 
   1204     if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
   1205         if (default_value == NULL) {
   1206             PyErr_SetString(PyExc_ValueError, "no such name");
   1207             return NULL;
   1208         }
   1209         else {
   1210             Py_INCREF(default_value);
   1211             return default_value;
   1212         }
   1213     }
   1214 
   1215     return PyUnicode_FromString(name);
   1216 }
   1217 
   1218 /*[clinic input]
   1219 unicodedata.UCD.lookup
   1220 
   1221     self: self
   1222     name: str(accept={str, robuffer}, zeroes=True)
   1223     /
   1224 
   1225 Look up character by name.
   1226 
   1227 If a character with the given name is found, return the
   1228 corresponding character.  If not found, KeyError is raised.
   1229 [clinic start generated code]*/
   1230 
   1231 static PyObject *
   1232 unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
   1233                             Py_ssize_clean_t name_length)
   1234 /*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
   1235 {
   1236     Py_UCS4 code;
   1237     unsigned int index;
   1238     if (name_length > NAME_MAXLEN) {
   1239         PyErr_SetString(PyExc_KeyError, "name too long");
   1240         return NULL;
   1241     }
   1242 
   1243     if (!_getcode(self, name, (int)name_length, &code, 1)) {
   1244         PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
   1245         return NULL;
   1246     }
   1247     /* check if code is in the PUA range that we use for named sequences
   1248        and convert it */
   1249     if (IS_NAMED_SEQ(code)) {
   1250         index = code-named_sequences_start;
   1251         return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
   1252                                          named_sequences[index].seq,
   1253                                          named_sequences[index].seqlen);
   1254     }
   1255     return PyUnicode_FromOrdinal(code);
   1256 }
   1257 
   1258 /* XXX Add doc strings. */
   1259 
/* Method table built from the *_METHODDEF macros, one per function.
   The same table is used twice: as tp_methods of UCD_Type and as the
   method table of the module definition (unicodedatamodule). */
static PyMethodDef unicodedata_functions[] = {
    UNICODEDATA_UCD_DECIMAL_METHODDEF
    UNICODEDATA_UCD_DIGIT_METHODDEF
    UNICODEDATA_UCD_NUMERIC_METHODDEF
    UNICODEDATA_UCD_CATEGORY_METHODDEF
    UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
    UNICODEDATA_UCD_COMBINING_METHODDEF
    UNICODEDATA_UCD_MIRRORED_METHODDEF
    UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
    UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
    UNICODEDATA_UCD_NAME_METHODDEF
    UNICODEDATA_UCD_LOOKUP_METHODDEF
    UNICODEDATA_UCD_NORMALIZE_METHODDEF
    {NULL, NULL}                /* sentinel */
};
   1275 
/* Type object "unicodedata.UCD"; instances have the size of a
   PreviousDBVersion.  Only a handful of slots are filled in: deallocation
   via PyObject_Del, generic attribute lookup, the default type flags, the
   shared unicodedata_functions method table and the DB_members member
   table.  Every other slot is 0. */
static PyTypeObject UCD_Type = {
        /* The ob_type field must be initialized in the module init function
         * to be portable to Windows without using C++. */
        PyVarObject_HEAD_INIT(NULL, 0)
        "unicodedata.UCD",              /*tp_name*/
        sizeof(PreviousDBVersion),      /*tp_basicsize*/
        0,                      /*tp_itemsize*/
        /* methods */
        (destructor)PyObject_Del, /*tp_dealloc*/
        0,                      /*tp_print*/
        0,                      /*tp_getattr*/
        0,                      /*tp_setattr*/
        0,                      /*tp_reserved*/
        0,                      /*tp_repr*/
        0,                      /*tp_as_number*/
        0,                      /*tp_as_sequence*/
        0,                      /*tp_as_mapping*/
        0,                      /*tp_hash*/
        0,                      /*tp_call*/
        0,                      /*tp_str*/
        PyObject_GenericGetAttr,/*tp_getattro*/
        0,                      /*tp_setattro*/
        0,                      /*tp_as_buffer*/
        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
        0,                      /*tp_doc*/
        0,                      /*tp_traverse*/
        0,                      /*tp_clear*/
        0,                      /*tp_richcompare*/
        0,                      /*tp_weaklistoffset*/
        0,                      /*tp_iter*/
        0,                      /*tp_iternext*/
        unicodedata_functions,  /*tp_methods*/
        DB_members,             /*tp_members*/
        0,                      /*tp_getset*/
        0,                      /*tp_base*/
        0,                      /*tp_dict*/
        0,                      /*tp_descr_get*/
        0,                      /*tp_descr_set*/
        0,                      /*tp_dictoffset*/
        0,                      /*tp_init*/
        0,                      /*tp_alloc*/
        0,                      /*tp_new*/
        0,                      /*tp_free*/
        0,                      /*tp_is_gc*/
};
   1321 
/* Module docstring, referenced from the module definition
   (unicodedatamodule).  UNIDATA_VERSION is spliced into the text twice
   through string literal concatenation. */
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format " UNIDATA_VERSION ".");
   1330 
/* Module definition passed to PyModule_Create() in PyInit_unicodedata().
   The slot names in the comments below follow the documented PyModuleDef
   layout (NOTE(review): confirm against the targeted Python version);
   the four trailing optional slots are all left NULL. */
static struct PyModuleDef unicodedatamodule = {
        PyModuleDef_HEAD_INIT,
        "unicodedata",          /* m_name */
        unicodedata_docstring,  /* m_doc */
        -1,                     /* m_size */
        unicodedata_functions,  /* m_methods */
        NULL,
        NULL,
        NULL,
        NULL
};
   1342 
   1343 PyMODINIT_FUNC
   1344 PyInit_unicodedata(void)
   1345 {
   1346     PyObject *m, *v;
   1347 
   1348     Py_TYPE(&UCD_Type) = &PyType_Type;
   1349 
   1350     m = PyModule_Create(&unicodedatamodule);
   1351     if (!m)
   1352         return NULL;
   1353 
   1354     PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
   1355     Py_INCREF(&UCD_Type);
   1356     PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
   1357 
   1358     /* Previous versions */
   1359     v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
   1360     if (v != NULL)
   1361         PyModule_AddObject(m, "ucd_3_2_0", v);
   1362 
   1363     /* Export C API */
   1364     v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
   1365     if (v != NULL)
   1366         PyModule_AddObject(m, "ucnhash_CAPI", v);
   1367     return m;
   1368 }
   1369 
   1370 /*
   1371 Local variables:
   1372 c-basic-offset: 4
   1373 indent-tabs-mode: nil
   1374 End:
   1375 */
   1376