1 /* 2 Unicode character type helpers. 3 4 Written by Marc-Andre Lemburg (mal (at) lemburg.com). 5 Modified for Python 2.0 by Fredrik Lundh (fredrik (at) pythonware.com) 6 7 Copyright (c) Corporation for National Research Initiatives. 8 9 */ 10 11 #include "Python.h" 12 13 #define ALPHA_MASK 0x01 14 #define DECIMAL_MASK 0x02 15 #define DIGIT_MASK 0x04 16 #define LOWER_MASK 0x08 17 #define LINEBREAK_MASK 0x10 18 #define SPACE_MASK 0x20 19 #define TITLE_MASK 0x40 20 #define UPPER_MASK 0x80 21 #define XID_START_MASK 0x100 22 #define XID_CONTINUE_MASK 0x200 23 #define PRINTABLE_MASK 0x400 24 #define NUMERIC_MASK 0x800 25 #define CASE_IGNORABLE_MASK 0x1000 26 #define CASED_MASK 0x2000 27 #define EXTENDED_CASE_MASK 0x4000 28 29 typedef struct { 30 /* 31 These are either deltas to the character or offsets in 32 _PyUnicode_ExtendedCase. 33 */ 34 const int upper; 35 const int lower; 36 const int title; 37 /* Note if more flag space is needed, decimal and digit could be unified. */ 38 const unsigned char decimal; 39 const unsigned char digit; 40 const unsigned short flags; 41 } _PyUnicode_TypeRecord; 42 43 #include "unicodetype_db.h" 44 45 static const _PyUnicode_TypeRecord * 46 gettyperecord(Py_UCS4 code) 47 { 48 int index; 49 50 if (code >= 0x110000) 51 index = 0; 52 else 53 { 54 index = index1[(code>>SHIFT)]; 55 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; 56 } 57 58 return &_PyUnicode_TypeRecords[index]; 59 } 60 61 /* Returns the titlecase Unicode characters corresponding to ch or just 62 ch if no titlecase mapping is known. */ 63 64 Py_UCS4 _PyUnicode_ToTitlecase(Py_UCS4 ch) 65 { 66 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 67 68 if (ctype->flags & EXTENDED_CASE_MASK) 69 return _PyUnicode_ExtendedCase[ctype->title & 0xFFFF]; 70 return ch + ctype->title; 71 } 72 73 /* Returns 1 for Unicode characters having the category 'Lt', 0 74 otherwise. */ 75 76 int _PyUnicode_IsTitlecase(Py_UCS4 ch) 77 { 78 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 79 80 return (ctype->flags & TITLE_MASK) != 0; 81 } 82 83 /* Returns 1 for Unicode characters having the XID_Start property, 0 84 otherwise. */ 85 86 int _PyUnicode_IsXidStart(Py_UCS4 ch) 87 { 88 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 89 90 return (ctype->flags & XID_START_MASK) != 0; 91 } 92 93 /* Returns 1 for Unicode characters having the XID_Continue property, 94 0 otherwise. */ 95 96 int _PyUnicode_IsXidContinue(Py_UCS4 ch) 97 { 98 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 99 100 return (ctype->flags & XID_CONTINUE_MASK) != 0; 101 } 102 103 /* Returns the integer decimal (0-9) for Unicode characters having 104 this property, -1 otherwise. */ 105 106 int _PyUnicode_ToDecimalDigit(Py_UCS4 ch) 107 { 108 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 109 110 return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1; 111 } 112 113 int _PyUnicode_IsDecimalDigit(Py_UCS4 ch) 114 { 115 if (_PyUnicode_ToDecimalDigit(ch) < 0) 116 return 0; 117 return 1; 118 } 119 120 /* Returns the integer digit (0-9) for Unicode characters having 121 this property, -1 otherwise. */ 122 123 int _PyUnicode_ToDigit(Py_UCS4 ch) 124 { 125 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 126 127 return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1; 128 } 129 130 int _PyUnicode_IsDigit(Py_UCS4 ch) 131 { 132 if (_PyUnicode_ToDigit(ch) < 0) 133 return 0; 134 return 1; 135 } 136 137 /* Returns the numeric value as double for Unicode characters having 138 this property, -1.0 otherwise. */ 139 140 int _PyUnicode_IsNumeric(Py_UCS4 ch) 141 { 142 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 143 144 return (ctype->flags & NUMERIC_MASK) != 0; 145 } 146 147 /* Returns 1 for Unicode characters to be hex-escaped when repr()ed, 148 0 otherwise. 149 All characters except those characters defined in the Unicode character 150 database as following categories are considered printable. 151 * Cc (Other, Control) 152 * Cf (Other, Format) 153 * Cs (Other, Surrogate) 154 * Co (Other, Private Use) 155 * Cn (Other, Not Assigned) 156 * Zl Separator, Line ('\u2028', LINE SEPARATOR) 157 * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR) 158 * Zs (Separator, Space) other than ASCII space('\x20'). 159 */ 160 int _PyUnicode_IsPrintable(Py_UCS4 ch) 161 { 162 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 163 164 return (ctype->flags & PRINTABLE_MASK) != 0; 165 } 166 167 /* Returns 1 for Unicode characters having the category 'Ll', 0 168 otherwise. */ 169 170 int _PyUnicode_IsLowercase(Py_UCS4 ch) 171 { 172 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 173 174 return (ctype->flags & LOWER_MASK) != 0; 175 } 176 177 /* Returns 1 for Unicode characters having the category 'Lu', 0 178 otherwise. */ 179 180 int _PyUnicode_IsUppercase(Py_UCS4 ch) 181 { 182 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 183 184 return (ctype->flags & UPPER_MASK) != 0; 185 } 186 187 /* Returns the uppercase Unicode characters corresponding to ch or just 188 ch if no uppercase mapping is known. */ 189 190 Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch) 191 { 192 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 193 194 if (ctype->flags & EXTENDED_CASE_MASK) 195 return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF]; 196 return ch + ctype->upper; 197 } 198 199 /* Returns the lowercase Unicode characters corresponding to ch or just 200 ch if no lowercase mapping is known. */ 201 202 Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) 203 { 204 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 205 206 if (ctype->flags & EXTENDED_CASE_MASK) 207 return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF]; 208 return ch + ctype->lower; 209 } 210 211 int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res) 212 { 213 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 214 215 if (ctype->flags & EXTENDED_CASE_MASK) { 216 int index = ctype->lower & 0xFFFF; 217 int n = ctype->lower >> 24; 218 int i; 219 for (i = 0; i < n; i++) 220 res[i] = _PyUnicode_ExtendedCase[index + i]; 221 return n; 222 } 223 res[0] = ch + ctype->lower; 224 return 1; 225 } 226 227 int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res) 228 { 229 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 230 231 if (ctype->flags & EXTENDED_CASE_MASK) { 232 int index = ctype->title & 0xFFFF; 233 int n = ctype->title >> 24; 234 int i; 235 for (i = 0; i < n; i++) 236 res[i] = _PyUnicode_ExtendedCase[index + i]; 237 return n; 238 } 239 res[0] = ch + ctype->title; 240 return 1; 241 } 242 243 int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res) 244 { 245 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 246 247 if (ctype->flags & EXTENDED_CASE_MASK) { 248 int index = ctype->upper & 0xFFFF; 249 int n = ctype->upper >> 24; 250 int i; 251 for (i = 0; i < n; i++) 252 res[i] = _PyUnicode_ExtendedCase[index + i]; 253 return n; 254 } 255 res[0] = ch + ctype->upper; 256 return 1; 257 } 258 259 int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res) 260 { 261 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 262 263 if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) { 264 int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24); 265 int n = (ctype->lower >> 20) & 7; 266 int i; 267 for (i = 0; i < n; i++) 268 res[i] = _PyUnicode_ExtendedCase[index + i]; 269 return n; 270 } 271 return _PyUnicode_ToLowerFull(ch, res); 272 } 273 274 int _PyUnicode_IsCased(Py_UCS4 ch) 275 { 276 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 277 278 return (ctype->flags & CASED_MASK) != 0; 279 } 280 281 int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch) 282 { 283 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 284 285 return (ctype->flags & CASE_IGNORABLE_MASK) != 0; 286 } 287 288 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt', 289 'Lo' or 'Lm', 0 otherwise. */ 290 291 int _PyUnicode_IsAlpha(Py_UCS4 ch) 292 { 293 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 294 295 return (ctype->flags & ALPHA_MASK) != 0; 296 } 297 298