      1 /* ------------------------------------------------------------------------
      2 
      3    unicodedata -- Provides access to the Unicode 5.2 database.
      4 
      5    Data was extracted from the Unicode 5.2 UnicodeData.txt file.
      6 
      7    Written by Marc-Andre Lemburg (mal (at) lemburg.com).
      8    Modified for Python 2.0 by Fredrik Lundh (fredrik (at) pythonware.com)
      9    Modified by Martin v. Löwis (martin (at) v.loewis.de)
     10 
     11    Copyright (c) Corporation for National Research Initiatives.
     12 
     13    ------------------------------------------------------------------------ */
     14 
     15 #include "Python.h"
     16 #include "ucnhash.h"
     17 #include "structmember.h"
     18 
     19 /* character properties */
     20 
     21 typedef struct {
     22     const unsigned char category;       /* index into
     23                                            _PyUnicode_CategoryNames */
     24     const unsigned char combining;      /* combining class value 0 - 255 */
     25     const unsigned char bidirectional;  /* index into
     26                                            _PyUnicode_BidirectionalNames */
     27     const unsigned char mirrored;       /* true if mirrored in bidir mode */
     28     const unsigned char east_asian_width;       /* index into
     29                                                    _PyUnicode_EastAsianWidth */
     30     const unsigned char normalization_quick_check; /* see is_normalized() */
     31 } _PyUnicode_DatabaseRecord;
     32 
     33 typedef struct change_record {
     34     /* sequence of fields should be the same as in merge_old_version */
     35     const unsigned char bidir_changed;
     36     const unsigned char category_changed;
     37     const unsigned char decimal_changed;
     38     const unsigned char mirrored_changed;
     39     const double numeric_changed;
     40 } change_record;
     41 
     42 /* data file generated by Tools/unicode/makeunicodedata.py */
     43 #include "unicodedata_db.h"
     44 
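        /* The generated tables implement a two-level (trie-like) lookup:
           index1[] is indexed by the high bits of the code point and selects
           a block, index2[] resolves the low SHIFT bits within that block and
           yields the offset of the final _PyUnicode_DatabaseRecord.  Code
           points outside the Unicode range fall back to record 0, which (by
           construction of the tables in makeunicodedata.py) describes an
           unassigned character. */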
     45 static const _PyUnicode_DatabaseRecord*
     46 _getrecord_ex(Py_UCS4 code)
     47 {
     48     int index;
     49     if (code >= 0x110000)
     50         index = 0;
     51     else {
     52         index = index1[(code>>SHIFT)];
     53         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
     54     }
     55 
     56     return &_PyUnicode_Database_Records[index];
     57 }
     58 
     59 /* ------------- Previous-version API ------------------------------------- */
     60 typedef struct previous_version {
     61     PyObject_HEAD
     62     const char *name;
     63     const change_record* (*getrecord)(Py_UCS4);
     64     Py_UCS4 (*normalization)(Py_UCS4);
     65 } PreviousDBVersion;
     66 
     67 #define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))
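        /* Convention used by the change records (see merge_old_version in
           Tools/unicode/makeunicodedata.py): a field value of 0xFF means
           "property unchanged between the old and the current database",
           while category_changed == 0 means the code point was unassigned in
           the old version.  The lookup functions below test these sentinels
           whenever self points to a PreviousDBVersion object. */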
     68 
     69 static PyMemberDef DB_members[] = {
     70         {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
     71         {NULL}
     72 };
     73 
     74 /* forward declaration */
     75 static PyTypeObject UCD_Type;
     76 
     77 static PyObject*
     78 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
     79                      Py_UCS4 (*normalization)(Py_UCS4))
     80 {
     81         PreviousDBVersion *self;
     82         self = PyObject_New(PreviousDBVersion, &UCD_Type);
     83         if (self == NULL)
     84                 return NULL;
     85         self->name = name;
     86         self->getrecord = getrecord;
     87         self->normalization = normalization;
     88         return (PyObject*)self;
     89 }
     90 
     91 
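        /* On narrow (UCS-2) builds a non-BMP character is stored as a
           surrogate pair; getuchar() folds such a pair back into a single
           Py_UCS4 value.  Worked example (plain arithmetic, not from the
           source): U+1F600 is stored as D83D DE00, and
           ((0xD83D & 0x3FF) << 10 | (0xDE00 & 0x3FF)) + 0x10000
             = (0x3D << 10 | 0x200) + 0x10000 = 0x1F600. */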
     92 static Py_UCS4 getuchar(PyUnicodeObject *obj)
     93 {
     94     Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
     95 
     96     if (PyUnicode_GET_SIZE(obj) == 1)
     97         return *v;
     98 #ifndef Py_UNICODE_WIDE
     99     else if ((PyUnicode_GET_SIZE(obj) == 2) &&
    100              (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
    101              (0xDC00 <= v[1] && v[1] <= 0xDFFF))
    102         return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
    103 #endif
    104     PyErr_SetString(PyExc_TypeError,
    105                     "need a single Unicode character as parameter");
    106     return (Py_UCS4)-1;
    107 }
    108 
    109 /* --- Module API --------------------------------------------------------- */
    110 
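        /* decimal(), digit() and numeric() below expose the three numeric
           properties of a character; per the UnicodeData.txt file format
           these correspond to the decimal-digit, digit and numeric value
           fields (fields 6, 7 and 8) of each record. */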
    111 PyDoc_STRVAR(unicodedata_decimal__doc__,
    112 "decimal(unichr[, default])\n\
    113 \n\
    114 Returns the decimal value assigned to the Unicode character unichr\n\
    115 as integer. If no such value is defined, default is returned, or, if\n\
    116 not given, ValueError is raised.");
    117 
    118 static PyObject *
    119 unicodedata_decimal(PyObject *self, PyObject *args)
    120 {
    121     PyUnicodeObject *v;
    122     PyObject *defobj = NULL;
    123     int have_old = 0;
    124     long rc;
    125     Py_UCS4 c;
    126 
    127     if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
    128         return NULL;
    129     c = getuchar(v);
    130     if (c == (Py_UCS4)-1)
    131         return NULL;
    132 
    133     if (self) {
    134         const change_record *old = get_old_record(self, c);
    135         if (old->category_changed == 0) {
    136             /* unassigned */
    137             have_old = 1;
    138             rc = -1;
    139         }
    140         else if (old->decimal_changed != 0xFF) {
    141             have_old = 1;
    142             rc = old->decimal_changed;
    143         }
    144     }
    145 
    146     if (!have_old)
    147         rc = Py_UNICODE_TODECIMAL(c);
    148     if (rc < 0) {
    149         if (defobj == NULL) {
    150             PyErr_SetString(PyExc_ValueError,
    151                             "not a decimal");
    152             return NULL;
    153         }
    154         else {
    155             Py_INCREF(defobj);
    156             return defobj;
    157         }
    158     }
    159     return PyInt_FromLong(rc);
    160 }
    161 
    162 PyDoc_STRVAR(unicodedata_digit__doc__,
    163 "digit(unichr[, default])\n\
    164 \n\
    165 Returns the digit value assigned to the Unicode character unichr as\n\
    166 integer. If no such value is defined, default is returned, or, if\n\
    167 not given, ValueError is raised.");
    168 
    169 static PyObject *
    170 unicodedata_digit(PyObject *self, PyObject *args)
    171 {
    172     PyUnicodeObject *v;
    173     PyObject *defobj = NULL;
    174     long rc;
    175     Py_UCS4 c;
    176 
    177     if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
    178         return NULL;
    179     c = getuchar(v);
    180     if (c == (Py_UCS4)-1)
    181         return NULL;
    182     rc = Py_UNICODE_TODIGIT(c);
    183     if (rc < 0) {
    184         if (defobj == NULL) {
    185             PyErr_SetString(PyExc_ValueError, "not a digit");
    186             return NULL;
    187         }
    188         else {
    189             Py_INCREF(defobj);
    190             return defobj;
    191         }
    192     }
    193     return PyInt_FromLong(rc);
    194 }
    195 
    196 PyDoc_STRVAR(unicodedata_numeric__doc__,
    197 "numeric(unichr[, default])\n\
    198 \n\
    199 Returns the numeric value assigned to the Unicode character unichr\n\
    200 as float. If no such value is defined, default is returned, or, if\n\
    201 not given, ValueError is raised.");
    202 
    203 static PyObject *
    204 unicodedata_numeric(PyObject *self, PyObject *args)
    205 {
    206     PyUnicodeObject *v;
    207     PyObject *defobj = NULL;
    208     int have_old = 0;
    209     double rc;
    210     Py_UCS4 c;
    211 
    212     if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
    213         return NULL;
    214     c = getuchar(v);
    215     if (c == (Py_UCS4)-1)
    216         return NULL;
    217 
    218     if (self) {
    219         const change_record *old = get_old_record(self, c);
    220         if (old->category_changed == 0) {
    221             /* unassigned */
    222             have_old = 1;
    223             rc = -1.0;
    224         }
    225         else if (old->decimal_changed != 0xFF) {
    226             have_old = 1;
    227             rc = old->decimal_changed;
    228         }
    229     }
    230 
    231     if (!have_old)
    232         rc = Py_UNICODE_TONUMERIC(c);
    233     if (rc == -1.0) {
    234         if (defobj == NULL) {
    235             PyErr_SetString(PyExc_ValueError, "not a numeric character");
    236             return NULL;
    237         }
    238         else {
    239             Py_INCREF(defobj);
    240             return defobj;
    241         }
    242     }
    243     return PyFloat_FromDouble(rc);
    244 }
    245 
    246 PyDoc_STRVAR(unicodedata_category__doc__,
    247 "category(unichr)\n\
    248 \n\
    249 Returns the general category assigned to the Unicode character\n\
    250 unichr as string.");
    251 
    252 static PyObject *
    253 unicodedata_category(PyObject *self, PyObject *args)
    254 {
    255     PyUnicodeObject *v;
    256     int index;
    257     Py_UCS4 c;
    258 
    259     if (!PyArg_ParseTuple(args, "O!:category",
    260                           &PyUnicode_Type, &v))
    261         return NULL;
    262     c = getuchar(v);
    263     if (c == (Py_UCS4)-1)
    264         return NULL;
    265     index = (int) _getrecord_ex(c)->category;
    266     if (self) {
    267         const change_record *old = get_old_record(self, c);
    268         if (old->category_changed != 0xFF)
    269             index = old->category_changed;
    270     }
    271     return PyString_FromString(_PyUnicode_CategoryNames[index]);
    272 }
    273 
    274 PyDoc_STRVAR(unicodedata_bidirectional__doc__,
    275 "bidirectional(unichr)\n\
    276 \n\
    277 Returns the bidirectional class assigned to the Unicode character\n\
    278 unichr as string. If no such value is defined, an empty string is\n\
    279 returned.");
    280 
    281 static PyObject *
    282 unicodedata_bidirectional(PyObject *self, PyObject *args)
    283 {
    284     PyUnicodeObject *v;
    285     int index;
    286     Py_UCS4 c;
    287 
    288     if (!PyArg_ParseTuple(args, "O!:bidirectional",
    289                           &PyUnicode_Type, &v))
    290         return NULL;
    291     c = getuchar(v);
    292     if (c == (Py_UCS4)-1)
    293         return NULL;
    294     index = (int) _getrecord_ex(c)->bidirectional;
    295     if (self) {
    296         const change_record *old = get_old_record(self, c);
    297         if (old->category_changed == 0)
    298             index = 0; /* unassigned */
    299         else if (old->bidir_changed != 0xFF)
    300             index = old->bidir_changed;
    301     }
    302     return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
    303 }
    304 
    305 PyDoc_STRVAR(unicodedata_combining__doc__,
    306 "combining(unichr)\n\
    307 \n\
    308 Returns the canonical combining class assigned to the Unicode\n\
    309 character unichr as integer. Returns 0 if no combining class is\n\
    310 defined.");
    311 
    312 static PyObject *
    313 unicodedata_combining(PyObject *self, PyObject *args)
    314 {
    315     PyUnicodeObject *v;
    316     int index;
    317     Py_UCS4 c;
    318 
    319     if (!PyArg_ParseTuple(args, "O!:combining",
    320                           &PyUnicode_Type, &v))
    321         return NULL;
    322     c = getuchar(v);
    323     if (c == (Py_UCS4)-1)
    324         return NULL;
    325     index = (int) _getrecord_ex(c)->combining;
    326     if (self) {
    327         const change_record *old = get_old_record(self, c);
    328         if (old->category_changed == 0)
    329             index = 0; /* unassigned */
    330     }
    331     return PyInt_FromLong(index);
    332 }
    333 
    334 PyDoc_STRVAR(unicodedata_mirrored__doc__,
    335 "mirrored(unichr)\n\
    336 \n\
    337 Returns the mirrored property assigned to the Unicode character\n\
    338 unichr as integer. Returns 1 if the character has been identified as\n\
    339 a \"mirrored\" character in bidirectional text, 0 otherwise.");
    340 
    341 static PyObject *
    342 unicodedata_mirrored(PyObject *self, PyObject *args)
    343 {
    344     PyUnicodeObject *v;
    345     int index;
    346     Py_UCS4 c;
    347 
    348     if (!PyArg_ParseTuple(args, "O!:mirrored",
    349                           &PyUnicode_Type, &v))
    350         return NULL;
    351     c = getuchar(v);
    352     if (c == (Py_UCS4)-1)
    353         return NULL;
    354     index = (int) _getrecord_ex(c)->mirrored;
    355     if (self) {
    356         const change_record *old = get_old_record(self, c);
    357         if (old->category_changed == 0)
    358             index = 0; /* unassigned */
    359         else if (old->mirrored_changed != 0xFF)
    360             index = old->mirrored_changed;
    361     }
    362     return PyInt_FromLong(index);
    363 }
    364 
    365 PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
    366 "east_asian_width(unichr)\n\
    367 \n\
    368 Returns the east asian width assigned to the Unicode character\n\
    369 unichr as string.");
    370 
    371 static PyObject *
    372 unicodedata_east_asian_width(PyObject *self, PyObject *args)
    373 {
    374     PyUnicodeObject *v;
    375     int index;
    376     Py_UCS4 c;
    377 
    378     if (!PyArg_ParseTuple(args, "O!:east_asian_width",
    379                           &PyUnicode_Type, &v))
    380         return NULL;
    381     c = getuchar(v);
    382     if (c == (Py_UCS4)-1)
    383         return NULL;
    384     index = (int) _getrecord_ex(c)->east_asian_width;
    385     if (self) {
    386         const change_record *old = get_old_record(self, c);
    387         if (old->category_changed == 0)
    388             index = 0; /* unassigned */
    389     }
    390     return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
    391 }
    392 
    393 PyDoc_STRVAR(unicodedata_decomposition__doc__,
    394 "decomposition(unichr)\n\
    395 \n\
    396 Returns the character decomposition mapping assigned to the Unicode\n\
    397 character unichr as string. An empty string is returned in case no\n\
    398 such mapping is defined.");
    399 
    400 static PyObject *
    401 unicodedata_decomposition(PyObject *self, PyObject *args)
    402 {
    403     PyUnicodeObject *v;
    404     char decomp[256];
    405     int code, index, count, i;
    406     unsigned int prefix_index;
    407     Py_UCS4 c;
    408 
    409     if (!PyArg_ParseTuple(args, "O!:decomposition",
    410                           &PyUnicode_Type, &v))
    411         return NULL;
    412     c = getuchar(v);
    413     if (c == (Py_UCS4)-1)
    414         return NULL;
    415 
    416     code = (int)c;
    417 
    418     if (self) {
    419         const change_record *old = get_old_record(self, c);
    420         if (old->category_changed == 0)
    421             return PyString_FromString(""); /* unassigned */
    422     }
    423 
    424     if (code < 0 || code >= 0x110000)
    425         index = 0;
    426     else {
    427         index = decomp_index1[(code>>DECOMP_SHIFT)];
    428         index = decomp_index2[(index<<DECOMP_SHIFT)+
    429                              (code&((1<<DECOMP_SHIFT)-1))];
    430     }
    431 
    432     /* high byte is number of hex bytes (usually one or two), low byte
    433        is prefix code (an index into the decomp_prefix table) */
    434     count = decomp_data[index] >> 8;
    435 
    436     /* XXX: could allocate the PyString up front instead
    437        (strlen(prefix) + 5 * count + 1 bytes) */
    438 
    439     /* Based on how index is calculated above and decomp_data is generated
    440        from Tools/unicode/makeunicodedata.py, it should not be possible
    441        to overflow decomp_prefix. */
    442     prefix_index = decomp_data[index] & 255;
    443     assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
    444 
    445     /* copy prefix */
    446     i = strlen(decomp_prefix[prefix_index]);
    447     memcpy(decomp, decomp_prefix[prefix_index], i);
    448 
    449     while (count-- > 0) {
    450         if (i)
    451             decomp[i++] = ' ';
    452         assert((size_t)i < sizeof(decomp));
    453         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
    454                       decomp_data[++index]);
    455         i += strlen(decomp + i);
    456     }
    457 
    458     decomp[i] = '\0';
    459 
    460     return PyString_FromString(decomp);
    461 }
    462 
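        /* Layout of decomp_data, as used above and in get_decomp_record()
           below: entry[index] packs the length of the decomposition in its
           high byte and an index into decomp_prefix (the compatibility tag,
           0 for canonical decompositions) in its low byte; the decomposed
           code points follow at index+1.  Example values from the Unicode
           5.2 data: U+00C0 yields "0041 0300" (canonical, empty prefix) and
           U+00BC yields "<fraction> 0031 2044 0034". */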
    463 static void
    464 get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
    465 {
    466     if (code >= 0x110000) {
    467         *index = 0;
    468     } else if (self && get_old_record(self, code)->category_changed==0) {
    469         /* unassigned in old version */
    470         *index = 0;
    471     }
    472     else {
    473         *index = decomp_index1[(code>>DECOMP_SHIFT)];
    474         *index = decomp_index2[(*index<<DECOMP_SHIFT)+
    475                                (code&((1<<DECOMP_SHIFT)-1))];
    476     }
    477 
    478     /* high byte is number of hex bytes (usually one or two), low byte
    479        is prefix code (an index into the decomp_prefix table) */
    480     *count = decomp_data[*index] >> 8;
    481     *prefix = decomp_data[*index] & 255;
    482 
    483     (*index)++;
    484 }
    485 
    486 #define SBase   0xAC00
    487 #define LBase   0x1100
    488 #define VBase   0x1161
    489 #define TBase   0x11A7
    490 #define LCount  19
    491 #define VCount  21
    492 #define TCount  28
    493 #define NCount  (VCount*TCount)
    494 #define SCount  (LCount*NCount)
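        /* Hangul syllables are not stored in the decomposition tables; they
           are decomposed arithmetically, following the Unicode Standard's
           conjoining-jamo algorithm.  A syllable S in [SBase, SBase+SCount)
           with SIndex = S - SBase splits into L = LBase + SIndex/NCount,
           V = VBase + (SIndex%NCount)/TCount and, when SIndex%TCount != 0,
           T = TBase + SIndex%TCount.  Worked example: U+D55C gives
           SIndex = 0x295C = 10588, hence L = 0x1100+18 = U+1112,
           V = 0x1161+0 = U+1161 and T = 0x11A7+4 = U+11AB. */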
    495 
    496 static PyObject*
    497 nfd_nfkd(PyObject *self, PyObject *input, int k)
    498 {
    499     PyObject *result;
    500     Py_UNICODE *i, *end, *o;
    501     /* Longest decomposition in Unicode 3.2: U+FDFA */
    502     Py_UNICODE stack[20];
    503     Py_ssize_t space, isize;
    504     int index, prefix, count, stackptr;
    505     unsigned char prev, cur;
    506 
    507     stackptr = 0;
    508     isize = PyUnicode_GET_SIZE(input);
    509     space = isize;
    510     /* Overallocate at most 10 characters. */
    511     if (space > 10) {
    512         if (space <= PY_SSIZE_T_MAX - 10)
    513             space += 10;
    514     }
    515     else {
    516         space *= 2;
    517     }
    518     result = PyUnicode_FromUnicode(NULL, space);
    519     if (!result)
    520         return NULL;
    521     i = PyUnicode_AS_UNICODE(input);
    522     end = i + isize;
    523     o = PyUnicode_AS_UNICODE(result);
    524 
    525     while (i < end) {
    526         stack[stackptr++] = *i++;
    527         while(stackptr) {
    528             Py_UNICODE code = stack[--stackptr];
    529             /* Hangul Decomposition adds three characters in
    530                a single step, so we need at least that much room. */
    531             if (space < 3) {
    532                 Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10;
    533                 space += 10;
    534                 if (PyUnicode_Resize(&result, newsize) == -1)
    535                     return NULL;
    536                 o = PyUnicode_AS_UNICODE(result) + newsize - space;
    537             }
    538             /* Hangul Decomposition. */
    539             if (SBase <= code && code < (SBase+SCount)) {
    540                 int SIndex = code - SBase;
    541                 int L = LBase + SIndex / NCount;
    542                 int V = VBase + (SIndex % NCount) / TCount;
    543                 int T = TBase + SIndex % TCount;
    544                 *o++ = L;
    545                 *o++ = V;
    546                 space -= 2;
    547                 if (T != TBase) {
    548                     *o++ = T;
    549                     space --;
    550                 }
    551                 continue;
    552             }
    553             /* normalization changes */
    554             if (self) {
    555                 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
    556                 if (value != 0) {
    557                     stack[stackptr++] = value;
    558                     continue;
    559                 }
    560             }
    561 
    562             /* Other decompositions. */
    563             get_decomp_record(self, code, &index, &prefix, &count);
    564 
    565             /* Copy character if it is not decomposable, or has a
    566                compatibility decomposition, but we do NFD. */
    567             if (!count || (prefix && !k)) {
    568                 *o++ = code;
    569                 space--;
    570                 continue;
    571             }
    572             /* Copy decomposition onto the stack, in reverse
    573                order.  */
    574             while(count) {
    575                 code = decomp_data[index + (--count)];
    576                 stack[stackptr++] = code;
    577             }
    578         }
    579     }
    580 
    581     /* Drop overallocation. Cannot fail. */
    582     PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
    583 
    584     /* Sort canonically. */
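            /* This is a simple insertion sort on the combining class: a
               character with a lower nonzero class is swapped backwards
               until it meets a starter (class 0) or a character whose class
               is <= its own.  Equal classes are never reordered, which keeps
               the pass stable as canonical ordering requires. */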
    585     i = PyUnicode_AS_UNICODE(result);
    586     prev = _getrecord_ex(*i)->combining;
    587     end = i + PyUnicode_GET_SIZE(result);
    588     for (i++; i < end; i++) {
    589         cur = _getrecord_ex(*i)->combining;
    590         if (prev == 0 || cur == 0 || prev <= cur) {
    591             prev = cur;
    592             continue;
    593         }
    594         /* Non-canonical order. Need to switch *i with previous. */
    595         o = i - 1;
    596         while (1) {
    597             Py_UNICODE tmp = o[1];
    598             o[1] = o[0];
    599             o[0] = tmp;
    600             o--;
    601             if (o < PyUnicode_AS_UNICODE(result))
    602                 break;
    603             prev = _getrecord_ex(*o)->combining;
    604             if (prev == 0 || prev <= cur)
    605                 break;
    606         }
    607         prev = _getrecord_ex(*i)->combining;
    608     }
    609     return result;
    610 }
    611 
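        /* Composition uses two generated reindex tables: nfc_first maps
           characters that can occur as the first element of a canonical
           composition to a dense index f, and nfc_last does the same for the
           second element (index l).  comp_index/comp_data then encode the
           sparse f x l -> composite matrix, looked up in nfc_nfkc() via
           f*TOTAL_LAST + l.  For example, f(U+0041) and l(U+0300) select the
           composite U+00C0. */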
    612 static int
    613 find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
    614 {
    615     int index;
    616     for (index = 0; nfc[index].start; index++) {
    617         int start = nfc[index].start;
    618         if (code < start)
    619             return -1;
    620         if (code <= start + nfc[index].count) {
    621             int delta = code - start;
    622             return nfc[index].index + delta;
    623         }
    624     }
    625     return -1;
    626 }
    627 
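        /* NFC/NFKC: first decompose via nfd_nfkd(), then walk the result,
           recombining each starter that has an entry in nfc_first with the
           next unblocked character that has an entry in nfc_last.  Combining
           characters absorbed into a composite are remembered in skipped[]
           and dropped when the main loop reaches them. */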
    628 static PyObject*
    629 nfc_nfkc(PyObject *self, PyObject *input, int k)
    630 {
    631     PyObject *result;
    632     Py_UNICODE *i, *i1, *o, *end;
    633     int f,l,index,index1,comb;
    634     Py_UNICODE code;
    635     Py_UNICODE *skipped[20];
    636     int cskipped = 0;
    637 
    638     result = nfd_nfkd(self, input, k);
    639     if (!result)
    640         return NULL;
    641 
    642     /* We are going to modify result in-place.
    643        If nfd_nfkd is changed to sometimes return the input,
    644        this code needs to be reviewed. */
    645     assert(result != input);
    646 
    647     i = PyUnicode_AS_UNICODE(result);
    648     end = i + PyUnicode_GET_SIZE(result);
    649     o = PyUnicode_AS_UNICODE(result);
    650 
    651   again:
    652     while (i < end) {
    653       for (index = 0; index < cskipped; index++) {
    654           if (skipped[index] == i) {
    655               /* *i character is skipped.
    656                  Remove from list. */
    657               skipped[index] = skipped[cskipped-1];
    658               cskipped--;
    659               i++;
    660               goto again; /* continue while */
    661           }
    662       }
    663       /* Hangul Composition. We don't need to check for <LV,T>
    664          pairs, since we always have decomposed data. */
    665       if (LBase <= *i && *i < (LBase+LCount) &&
    666           i + 1 < end &&
    667           VBase <= i[1] && i[1] < (VBase+VCount)) {
    668           int LIndex, VIndex;
    669           LIndex = i[0] - LBase;
    670           VIndex = i[1] - VBase;
    671           code = SBase + (LIndex*VCount+VIndex)*TCount;
    672           i+=2;
    673           if (i < end &&
    674               TBase < *i && *i < (TBase+TCount)) {
    675               code += *i-TBase;
    676               i++;
    677           }
    678           *o++ = code;
    679           continue;
    680       }
    681 
    682       f = find_nfc_index(self, nfc_first, *i);
    683       if (f == -1) {
    684           *o++ = *i++;
    685           continue;
    686       }
    687       /* Find next unblocked character. */
    688       i1 = i+1;
    689       comb = 0;
    690       while (i1 < end) {
    691           int comb1 = _getrecord_ex(*i1)->combining;
    692           if (comb) {
    693               if (comb1 == 0)
    694                   break;
    695               if (comb >= comb1) {
    696                   /* Character is blocked. */
    697                   i1++;
    698                   continue;
    699               }
    700           }
    701           l = find_nfc_index(self, nfc_last, *i1);
    702           /* *i1 cannot be combined with *i. If *i1
    703              is a starter, we don't need to look further.
    704              Otherwise, record the combining class. */
    705           if (l == -1) {
    706             not_combinable:
    707               if (comb1 == 0)
    708                   break;
    709               comb = comb1;
    710               i1++;
    711               continue;
    712           }
    713           index = f*TOTAL_LAST + l;
    714           index1 = comp_index[index >> COMP_SHIFT];
    715           code = comp_data[(index1<<COMP_SHIFT)+
    716                            (index&((1<<COMP_SHIFT)-1))];
    717           if (code == 0)
    718               goto not_combinable;
    719 
    720           /* Replace the original character. */
    721           *i = code;
    722           /* Mark the second character unused. */
    723           assert(cskipped < 20);
    724           skipped[cskipped++] = i1;
    725           i1++;
    726           f = find_nfc_index(self, nfc_first, *i);
    727           if (f == -1)
    728               break;
    729       }
    730       *o++ = *i++;
    731     }
    732     if (o != end)
    733         PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
    734     return result;
    735 }
    736 
    737 /* Return 1 if the input is certainly normalized, 0 if it might not be. */
    738 static int
    739 is_normalized(PyObject *self, PyObject *input, int nfc, int k)
    740 {
    741     Py_UNICODE *i, *end;
    742     unsigned char prev_combining = 0, quickcheck_mask;
    743 
    744     /* An older version of the database is requested, quickchecks must be
    745        disabled. */
    746     if (self != NULL)
    747         return 0;
    748 
    749     /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
    750        as described in http://unicode.org/reports/tr15/#Annex8. */
    751     quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
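            /* The expression above selects one 2-bit field per normalization
               form: 0x03 for NFD, 0x0C for NFKD, 0x30 for NFC and 0xC0 for
               NFKC. */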
    752 
    753     i = PyUnicode_AS_UNICODE(input);
    754     end = i + PyUnicode_GET_SIZE(input);
    755     while (i < end) {
    756         const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
    757         unsigned char combining = record->combining;
    758         unsigned char quickcheck = record->normalization_quick_check;
    759 
    760         if (quickcheck & quickcheck_mask)
    761             return 0; /* this string might need normalization */
    762         if (combining && prev_combining > combining)
    763             return 0; /* non-canonical sort order, not normalized */
    764         prev_combining = combining;
    765     }
    766     return 1; /* certainly normalized */
    767 }
    768 
    769 PyDoc_STRVAR(unicodedata_normalize__doc__,
    770 "normalize(form, unistr)\n\
    771 \n\
    772 Return the normal form 'form' for the Unicode string unistr.  Valid\n\
    773 values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
    774 
    775 static PyObject*
    776 unicodedata_normalize(PyObject *self, PyObject *args)
    777 {
    778     char *form;
    779     PyObject *input;
    780 
    781     if(!PyArg_ParseTuple(args, "sO!:normalize",
    782                          &form, &PyUnicode_Type, &input))
    783         return NULL;
    784 
    785     if (PyUnicode_GetSize(input) == 0) {
    786         /* Special case empty input strings, since resizing
    787            them  later would cause internal errors. */
    788         Py_INCREF(input);
    789         return input;
    790     }
    791 
    792     if (strcmp(form, "NFC") == 0) {
    793         if (is_normalized(self, input, 1, 0)) {
    794             Py_INCREF(input);
    795             return input;
    796         }
    797         return nfc_nfkc(self, input, 0);
    798     }
    799     if (strcmp(form, "NFKC") == 0) {
    800         if (is_normalized(self, input, 1, 1)) {
    801             Py_INCREF(input);
    802             return input;
    803         }
    804         return nfc_nfkc(self, input, 1);
    805     }
    806     if (strcmp(form, "NFD") == 0) {
    807         if (is_normalized(self, input, 0, 0)) {
    808             Py_INCREF(input);
    809             return input;
    810         }
    811         return nfd_nfkd(self, input, 0);
    812     }
    813     if (strcmp(form, "NFKD") == 0) {
    814         if (is_normalized(self, input, 0, 1)) {
    815             Py_INCREF(input);
    816             return input;
    817         }
    818         return nfd_nfkd(self, input, 1);
    819     }
    820     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    821     return NULL;
    822 }
    823 
    824 /* -------------------------------------------------------------------- */
    825 /* unicode character name tables */
    826 
    827 /* data file generated by Tools/unicode/makeunicodedata.py */
    828 #include "unicodename_db.h"
    829 
    830 /* -------------------------------------------------------------------- */
    831 /* database code (cut and pasted from the unidb package) */
    832 
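        /* Case-insensitive multiplicative string hash: each byte is
           uppercased and mixed in with the given scale factor, and any bits
           that overflow into the top byte are folded back, keeping the value
           within 24 bits.  The generator (Tools/unicode/makeunicodedata.py)
           builds code_hash with the same hash, so the two must stay in
           sync. */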
    833 static unsigned long
    834 _gethash(const char *s, int len, int scale)
    835 {
    836     int i;
    837     unsigned long h = 0;
    838     unsigned long ix;
    839     for (i = 0; i < len; i++) {
    840         h = (h * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[i]));
    841         ix = h & 0xff000000;
    842         if (ix)
    843             h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    844     }
    845     return h;
    846 }
    847 
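        /* Romanized jamo used to build and parse "HANGUL SYLLABLE ..." names:
           column 0 holds the leading consonants (indexed by L), column 1 the
           vowels (indexed by V) and column 2 the trailing consonants (indexed
           by T, row 0 meaning "no trailing consonant").  E.g. indices
           (18, 0, 4) spell "H" + "A" + "N", giving HANGUL SYLLABLE HAN
           (U+D55C). */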
    848 static char *hangul_syllables[][3] = {
    849     { "G",  "A",   ""   },
    850     { "GG", "AE",  "G"  },
    851     { "N",  "YA",  "GG" },
    852     { "D",  "YAE", "GS" },
    853     { "DD", "EO",  "N", },
    854     { "R",  "E",   "NJ" },
    855     { "M",  "YEO", "NH" },
    856     { "B",  "YE",  "D"  },
    857     { "BB", "O",   "L"  },
    858     { "S",  "WA",  "LG" },
    859     { "SS", "WAE", "LM" },
    860     { "",   "OE",  "LB" },
    861     { "J",  "YO",  "LS" },
    862     { "JJ", "U",   "LT" },
    863     { "C",  "WEO", "LP" },
    864     { "K",  "WE",  "LH" },
    865     { "T",  "WI",  "M"  },
    866     { "P",  "YU",  "B"  },
    867     { "H",  "EU",  "BS" },
    868     { 0,    "YI",  "S"  },
    869     { 0,    "I",   "SS" },
    870     { 0,    0,     "NG" },
    871     { 0,    0,     "J"  },
    872     { 0,    0,     "C"  },
    873     { 0,    0,     "K"  },
    874     { 0,    0,     "T"  },
    875     { 0,    0,     "P"  },
    876     { 0,    0,     "H"  }
    877 };
    878 
    879 static int
    880 is_unified_ideograph(Py_UCS4 code)
    881 {
    882     return (
    883         (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
    884         (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph, Unicode 5.2 */
    885         (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
    886         (0x2A700 <= code && code <= 0x2B734));  /* CJK Ideograph Extension C */
    887 }
    888 
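        /* Character names come from three sources: Hangul syllable names and
           CJK unified ideograph names are computed algorithmically below;
           everything else is decoded from the compressed phrasebook/lexicon
           tables in unicodename_db.h, where a name is a sequence of word
           indices (indices below phrasebook_short fit in one byte, larger
           ones take two). */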
    889 static int
    890 _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
    891 {
    892     int offset;
    893     int i;
    894     int word;
    895     unsigned char* w;
    896 
    897     if (code >= 0x110000)
    898         return 0;
    899 
    900     if (self) {
    901         const change_record *old = get_old_record(self, code);
    902         if (old->category_changed == 0) {
    903             /* unassigned */
    904             return 0;
    905         }
    906     }
    907 
    908     if (SBase <= code && code < SBase+SCount) {
    909         /* Hangul syllable. */
    910         int SIndex = code - SBase;
    911         int L = SIndex / NCount;
    912         int V = (SIndex % NCount) / TCount;
    913         int T = SIndex % TCount;
    914 
    915         if (buflen < 27)
    916             /* Worst case: HANGUL SYLLABLE <10chars>. */
    917             return 0;
    918         strcpy(buffer, "HANGUL SYLLABLE ");
    919         buffer += 16;
    920         strcpy(buffer, hangul_syllables[L][0]);
    921         buffer += strlen(hangul_syllables[L][0]);
    922         strcpy(buffer, hangul_syllables[V][1]);
    923         buffer += strlen(hangul_syllables[V][1]);
    924         strcpy(buffer, hangul_syllables[T][2]);
    925         buffer += strlen(hangul_syllables[T][2]);
    926         *buffer = '\0';
    927         return 1;
    928     }
    929 
    930     if (is_unified_ideograph(code)) {
    931         if (buflen < 28)
    932             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
    933             return 0;
    934         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
    935         return 1;
    936     }
    937 
    938     /* get offset into phrasebook */
    939     offset = phrasebook_offset1[(code>>phrasebook_shift)];
    940     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
    941                                (code&((1<<phrasebook_shift)-1))];
    942     if (!offset)
    943         return 0;
    944 
    945     i = 0;
    946 
    947     for (;;) {
    948         /* get word index */
    949         word = phrasebook[offset] - phrasebook_short;
    950         if (word >= 0) {
    951             word = (word << 8) + phrasebook[offset+1];
    952             offset += 2;
    953         } else
    954             word = phrasebook[offset++];
    955         if (i) {
    956             if (i > buflen)
    957                 return 0; /* buffer overflow */
    958             buffer[i++] = ' ';
    959         }
    960         /* copy word string from lexicon.  the last character in the
    961            word has bit 7 set.  the last word in a string ends with
    962            0x80 */
    963         w = lexicon + lexicon_offset[word];
    964         while (*w < 128) {
    965             if (i >= buflen)
    966                 return 0; /* buffer overflow */
    967             buffer[i++] = *w++;
    968         }
    969         if (i >= buflen)
    970             return 0; /* buffer overflow */
    971         buffer[i++] = *w & 127;
    972         if (*w == 128)
    973             break; /* end of word */
    974     }
    975 
    976     return 1;
    977 }
    978 
    979 static int
    980 _cmpname(PyObject *self, int code, const char* name, int namelen)
    981 {
    982     /* check if code corresponds to the given name */
    983     int i;
    984     char buffer[NAME_MAXLEN];
    985     if (!_getucname(self, code, buffer, sizeof(buffer)))
    986         return 0;
    987     for (i = 0; i < namelen; i++) {
    988         if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
    989             return 0;
    990     }
    991     return buffer[namelen] == '\0';
    992 }
    993 
    994 static void
    995 find_syllable(const char *str, int *len, int *pos, int count, int column)
    996 {
    997     int i, len1;
    998     *len = -1;
    999     for (i = 0; i < count; i++) {
   1000         char *s = hangul_syllables[i][column];
   1001         len1 = strlen(s);
   1002         if (len1 <= *len)
   1003             continue;
   1004         if (strncmp(str, s, len1) == 0) {
   1005             *len = len1;
   1006             *pos = i;
   1007         }
   1008     }
   1009     if (*len == -1) {
   1010         *len = 0;
   1011     }
   1012 }
   1013 
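        /* Reverse lookup (name -> code point).  Hangul syllable and CJK
           unified ideograph names are parsed arithmetically; all other names
           go through an open-addressing probe over code_hash.  The table
           stores only code points, so every candidate is re-verified with
           _cmpname() against the name it would generate. */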
   1014 static int
   1015 _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
   1016 {
   1017     unsigned int h, v;
   1018     unsigned int mask = code_size-1;
   1019     unsigned int i, incr;
   1020 
   1021     /* Check for hangul syllables. */
   1022     if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
   1023         int len, L = -1, V = -1, T = -1;
   1024         const char *pos = name + 16;
   1025         find_syllable(pos, &len, &L, LCount, 0);
   1026         pos += len;
   1027         find_syllable(pos, &len, &V, VCount, 1);
   1028         pos += len;
   1029         find_syllable(pos, &len, &T, TCount, 2);
   1030         pos += len;
   1031         if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
   1032             *code = SBase + (L*VCount+V)*TCount + T;
   1033             return 1;
   1034         }
   1035         /* Otherwise, it's an illegal syllable name. */
   1036         return 0;
   1037     }
   1038 
   1039     /* Check for unified ideographs. */
   1040     if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
   1041         /* Four or five hexdigits must follow. */
   1042         v = 0;
   1043         name += 22;
   1044         namelen -= 22;
   1045         if (namelen != 4 && namelen != 5)
   1046             return 0;
   1047         while (namelen--) {
   1048             v *= 16;
   1049             if (*name >= '0' && *name <= '9')
   1050                 v += *name - '0';
   1051             else if (*name >= 'A' && *name <= 'F')
   1052                 v += *name - 'A' + 10;
   1053             else
   1054                 return 0;
   1055             name++;
   1056         }
   1057         if (!is_unified_ideograph(v))
   1058             return 0;
   1059         *code = v;
   1060         return 1;
   1061     }
   1062 
   1063     /* the following is the same as python's dictionary lookup, with
   1064        only minor changes.  see the makeunicodedata script for more
   1065        details */
   1066 
   1067     h = (unsigned int) _gethash(name, namelen, code_magic);
   1068     i = (~h) & mask;
   1069     v = code_hash[i];
   1070     if (!v)
   1071         return 0;
   1072     if (_cmpname(self, v, name, namelen)) {
   1073         *code = v;
   1074         return 1;
   1075     }
   1076     incr = (h ^ (h >> 3)) & mask;
   1077     if (!incr)
   1078         incr = mask;
   1079     for (;;) {
   1080         i = (i + incr) & mask;
   1081         v = code_hash[i];
   1082         if (!v)
   1083             return 0;
   1084         if (_cmpname(self, v, name, namelen)) {
   1085             *code = v;
   1086             return 1;
   1087         }
   1088         incr = incr << 1;
   1089         if (incr > mask)
   1090             incr = incr ^ code_poly;
   1091     }
   1092 }
   1093 
   1094 static const _PyUnicode_Name_CAPI hashAPI =
   1095 {
   1096     sizeof(_PyUnicode_Name_CAPI),
   1097     _getucname,
   1098     _getcode
   1099 };
   1100 
   1101 /* -------------------------------------------------------------------- */
   1102 /* Python bindings */
   1103 
   1104 PyDoc_STRVAR(unicodedata_name__doc__,
   1105 "name(unichr[, default])\n\
   1106 Returns the name assigned to the Unicode character unichr as a\n\
   1107 string. If no name is defined, default is returned, or, if not\n\
   1108 given, ValueError is raised.");
   1109 
   1110 static PyObject *
   1111 unicodedata_name(PyObject* self, PyObject* args)
   1112 {
   1113     char name[NAME_MAXLEN];
   1114     Py_UCS4 c;
   1115 
   1116     PyUnicodeObject* v;
   1117     PyObject* defobj = NULL;
   1118     if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
   1119         return NULL;
   1120 
   1121     c = getuchar(v);
   1122     if (c == (Py_UCS4)-1)
   1123         return NULL;
   1124 
   1125     if (!_getucname(self, c, name, sizeof(name))) {
   1126         if (defobj == NULL) {
   1127             PyErr_SetString(PyExc_ValueError, "no such name");
   1128             return NULL;
   1129         }
   1130         else {
   1131             Py_INCREF(defobj);
   1132             return defobj;
   1133         }
   1134     }
   1135 
   1136     return Py_BuildValue("s", name);
   1137 }
   1138 
   1139 PyDoc_STRVAR(unicodedata_lookup__doc__,
   1140 "lookup(name)\n\
   1141 \n\
   1142 Look up character by name.  If a character with the\n\
   1143 given name is found, return the corresponding Unicode\n\
   1144 character.  If not found, KeyError is raised.");
   1145 
   1146 static PyObject *
   1147 unicodedata_lookup(PyObject* self, PyObject* args)
   1148 {
   1149     Py_UCS4 code;
   1150     Py_UNICODE str[2];
   1151 
   1152     char* name;
   1153     int namelen;
   1154     if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
   1155         return NULL;
   1156 
   1157     if (!_getcode(self, name, namelen, &code)) {
   1158         PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
   1159                      name);
   1160         return NULL;
   1161     }
   1162 
   1163 #ifndef Py_UNICODE_WIDE
   1164     if (code >= 0x10000) {
   1165         str[0] = 0xd800 + ((code - 0x10000) >> 10);
   1166         str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
   1167         return PyUnicode_FromUnicode(str, 2);
   1168     }
   1169 #endif
   1170     str[0] = (Py_UNICODE) code;
   1171     return PyUnicode_FromUnicode(str, 1);
   1172 }
   1173 
   1174 /* XXX Add doc strings. */
   1175 
   1176 static PyMethodDef unicodedata_functions[] = {
   1177     {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
   1178     {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
   1179     {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
   1180     {"category", unicodedata_category, METH_VARARGS,
   1181                  unicodedata_category__doc__},
   1182     {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
   1183                       unicodedata_bidirectional__doc__},
   1184     {"combining", unicodedata_combining, METH_VARARGS,
   1185                   unicodedata_combining__doc__},
   1186     {"mirrored", unicodedata_mirrored, METH_VARARGS,
   1187                  unicodedata_mirrored__doc__},
   1188     {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
   1189                          unicodedata_east_asian_width__doc__},
   1190     {"decomposition", unicodedata_decomposition, METH_VARARGS,
   1191                       unicodedata_decomposition__doc__},
   1192     {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
   1193     {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
   1194     {"normalize", unicodedata_normalize, METH_VARARGS,
   1195                   unicodedata_normalize__doc__},
   1196     {NULL, NULL}                /* sentinel */
   1197 };
   1198 
   1199 static PyTypeObject UCD_Type = {
   1200         /* The ob_type field must be initialized in the module init function
   1201          * to be portable to Windows without using C++. */
   1202         PyVarObject_HEAD_INIT(NULL, 0)
   1203         "unicodedata.UCD",              /*tp_name*/
   1204         sizeof(PreviousDBVersion),      /*tp_basicsize*/
   1205         0,                      /*tp_itemsize*/
   1206         /* methods */
   1207         (destructor)PyObject_Del, /*tp_dealloc*/
   1208         0,                      /*tp_print*/
   1209         0,                      /*tp_getattr*/
   1210         0,                      /*tp_setattr*/
   1211         0,                      /*tp_compare*/
   1212         0,                      /*tp_repr*/
   1213         0,                      /*tp_as_number*/
   1214         0,                      /*tp_as_sequence*/
   1215         0,                      /*tp_as_mapping*/
   1216         0,                      /*tp_hash*/
   1217         0,                      /*tp_call*/
   1218         0,                      /*tp_str*/
   1219         PyObject_GenericGetAttr,/*tp_getattro*/
   1220         0,                      /*tp_setattro*/
   1221         0,                      /*tp_as_buffer*/
   1222         Py_TPFLAGS_DEFAULT,     /*tp_flags*/
   1223         0,                      /*tp_doc*/
   1224         0,                      /*tp_traverse*/
   1225         0,                      /*tp_clear*/
   1226         0,                      /*tp_richcompare*/
   1227         0,                      /*tp_weaklistoffset*/
   1228         0,                      /*tp_iter*/
   1229         0,                      /*tp_iternext*/
   1230         unicodedata_functions,  /*tp_methods*/
   1231         DB_members,             /*tp_members*/
   1232         0,                      /*tp_getset*/
   1233         0,                      /*tp_base*/
   1234         0,                      /*tp_dict*/
   1235         0,                      /*tp_descr_get*/
   1236         0,                      /*tp_descr_set*/
   1237         0,                      /*tp_dictoffset*/
   1238         0,                      /*tp_init*/
   1239         0,                      /*tp_alloc*/
   1240         0,                      /*tp_new*/
   1241         0,                      /*tp_free*/
   1242         0,                      /*tp_is_gc*/
   1243 };
   1244 
   1245 PyDoc_STRVAR(unicodedata_docstring,
   1246 "This module provides access to the Unicode Character Database which\n\
   1247 defines character properties for all Unicode characters. The data in\n\
   1248 this database is based on the UnicodeData.txt file version\n\
   1249 5.2.0 which is publicly available from ftp://ftp.unicode.org/.\n\
   1250 \n\
   1251 The module uses the same names and symbols as defined by the\n\
   1252 UnicodeData File Format 5.2.0 (see\n\
   1253 http://www.unicode.org/reports/tr44/tr44-4.html).");
   1254 
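        /* The module-level functions are registered with self == NULL and
           therefore use the current (5.2.0) database, while ucd_3_2_0 is a
           UCD instance sharing the same method table; its methods receive
           self != NULL and apply the 3.2.0 change records and normalization
           deltas before answering. */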
   1255 PyMODINIT_FUNC
   1256 initunicodedata(void)
   1257 {
   1258     PyObject *m, *v;
   1259 
   1260     Py_TYPE(&UCD_Type) = &PyType_Type;
   1261 
   1262     m = Py_InitModule3(
   1263         "unicodedata", unicodedata_functions, unicodedata_docstring);
   1264     if (!m)
   1265         return;
   1266 
   1267     PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
   1268     Py_INCREF(&UCD_Type);
   1269     PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
   1270 
   1271     /* Previous versions */
   1272     v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
   1273     if (v != NULL)
   1274         PyModule_AddObject(m, "ucd_3_2_0", v);
   1275 
   1276     /* Export C API */
   1277     v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
   1278     if (v != NULL)
   1279         PyModule_AddObject(m, "ucnhash_CAPI", v);
   1280 }
   1281 
   1282 /*
   1283 Local variables:
   1284 c-basic-offset: 4
   1285 indent-tabs-mode: nil
   1286 End:
   1287 */
   1288