1 /* guniprop.c - Unicode character properties. 2 * 3 * Copyright (C) 1999 Tom Tromey 4 * Copyright (C) 2000 Red Hat, Inc. 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, write to the 18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 19 * Boston, MA 02111-1307, USA. 20 */ 21 22 #include "config.h" 23 24 #include <stdlib.h> 25 #include <stddef.h> 26 #include <string.h> 27 #include <locale.h> 28 29 #include "glib.h" 30 #include "gunichartables.h" 31 #include "gmirroringtable.h" 32 #include "gscripttable.h" 33 #include "gunicodeprivate.h" 34 #include "galias.h" 35 36 #define ATTR_TABLE(Page) (((Page) <= G_UNICODE_LAST_PAGE_PART1) \ 37 ? attr_table_part1[Page] \ 38 : attr_table_part2[(Page) - 0xe00]) 39 40 #define ATTTABLE(Page, Char) \ 41 ((ATTR_TABLE(Page) == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[ATTR_TABLE(Page)][Char])) 42 43 #define TTYPE_PART1(Page, Char) \ 44 ((type_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ 45 ? (type_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \ 46 : (type_data[type_table_part1[Page]][Char])) 47 48 #define TTYPE_PART2(Page, Char) \ 49 ((type_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ 50 ? (type_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \ 51 : (type_data[type_table_part2[Page]][Char])) 52 53 #define TYPE(Char) \ 54 (((Char) <= G_UNICODE_LAST_CHAR_PART1) \ 55 ? TTYPE_PART1 ((Char) >> 8, (Char) & 0xff) \ 56 : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \ 57 ? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \ 58 : G_UNICODE_UNASSIGNED)) 59 60 61 #define IS(Type, Class) (((guint)1 << (Type)) & (Class)) 62 #define OR(Type, Rest) (((guint)1 << (Type)) | (Rest)) 63 64 65 66 #define ISALPHA(Type) IS ((Type), \ 67 OR (G_UNICODE_LOWERCASE_LETTER, \ 68 OR (G_UNICODE_UPPERCASE_LETTER, \ 69 OR (G_UNICODE_TITLECASE_LETTER, \ 70 OR (G_UNICODE_MODIFIER_LETTER, \ 71 OR (G_UNICODE_OTHER_LETTER, 0)))))) 72 73 #define ISALDIGIT(Type) IS ((Type), \ 74 OR (G_UNICODE_DECIMAL_NUMBER, \ 75 OR (G_UNICODE_LETTER_NUMBER, \ 76 OR (G_UNICODE_OTHER_NUMBER, \ 77 OR (G_UNICODE_LOWERCASE_LETTER, \ 78 OR (G_UNICODE_UPPERCASE_LETTER, \ 79 OR (G_UNICODE_TITLECASE_LETTER, \ 80 OR (G_UNICODE_MODIFIER_LETTER, \ 81 OR (G_UNICODE_OTHER_LETTER, 0))))))))) 82 83 #define ISMARK(Type) IS ((Type), \ 84 OR (G_UNICODE_NON_SPACING_MARK, \ 85 OR (G_UNICODE_COMBINING_MARK, \ 86 OR (G_UNICODE_ENCLOSING_MARK, 0)))) 87 88 #define ISZEROWIDTHTYPE(Type) IS ((Type), \ 89 OR (G_UNICODE_NON_SPACING_MARK, \ 90 OR (G_UNICODE_ENCLOSING_MARK, \ 91 OR (G_UNICODE_FORMAT, 0)))) 92 93 /** 94 * g_unichar_isalnum: 95 * @c: a Unicode character 96 * 97 * Determines whether a character is alphanumeric. 98 * Given some UTF-8 text, obtain a character value 99 * with g_utf8_get_char(). 100 * 101 * Return value: %TRUE if @c is an alphanumeric character 102 **/ 103 gboolean 104 g_unichar_isalnum (gunichar c) 105 { 106 return ISALDIGIT (TYPE (c)) ? TRUE : FALSE; 107 } 108 109 /** 110 * g_unichar_isalpha: 111 * @c: a Unicode character 112 * 113 * Determines whether a character is alphabetic (i.e. a letter). 114 * Given some UTF-8 text, obtain a character value with 115 * g_utf8_get_char(). 116 * 117 * Return value: %TRUE if @c is an alphabetic character 118 **/ 119 gboolean 120 g_unichar_isalpha (gunichar c) 121 { 122 return ISALPHA (TYPE (c)) ? TRUE : FALSE; 123 } 124 125 126 /** 127 * g_unichar_iscntrl: 128 * @c: a Unicode character 129 * 130 * Determines whether a character is a control character. 131 * Given some UTF-8 text, obtain a character value with 132 * g_utf8_get_char(). 133 * 134 * Return value: %TRUE if @c is a control character 135 **/ 136 gboolean 137 g_unichar_iscntrl (gunichar c) 138 { 139 return TYPE (c) == G_UNICODE_CONTROL; 140 } 141 142 /** 143 * g_unichar_isdigit: 144 * @c: a Unicode character 145 * 146 * Determines whether a character is numeric (i.e. a digit). This 147 * covers ASCII 0-9 and also digits in other languages/scripts. Given 148 * some UTF-8 text, obtain a character value with g_utf8_get_char(). 149 * 150 * Return value: %TRUE if @c is a digit 151 **/ 152 gboolean 153 g_unichar_isdigit (gunichar c) 154 { 155 return TYPE (c) == G_UNICODE_DECIMAL_NUMBER; 156 } 157 158 159 /** 160 * g_unichar_isgraph: 161 * @c: a Unicode character 162 * 163 * Determines whether a character is printable and not a space 164 * (returns %FALSE for control characters, format characters, and 165 * spaces). g_unichar_isprint() is similar, but returns %TRUE for 166 * spaces. Given some UTF-8 text, obtain a character value with 167 * g_utf8_get_char(). 168 * 169 * Return value: %TRUE if @c is printable unless it's a space 170 **/ 171 gboolean 172 g_unichar_isgraph (gunichar c) 173 { 174 return !IS (TYPE(c), 175 OR (G_UNICODE_CONTROL, 176 OR (G_UNICODE_FORMAT, 177 OR (G_UNICODE_UNASSIGNED, 178 OR (G_UNICODE_SURROGATE, 179 OR (G_UNICODE_SPACE_SEPARATOR, 180 0)))))); 181 } 182 183 /** 184 * g_unichar_islower: 185 * @c: a Unicode character 186 * 187 * Determines whether a character is a lowercase letter. 188 * Given some UTF-8 text, obtain a character value with 189 * g_utf8_get_char(). 190 * 191 * Return value: %TRUE if @c is a lowercase letter 192 **/ 193 gboolean 194 g_unichar_islower (gunichar c) 195 { 196 return TYPE (c) == G_UNICODE_LOWERCASE_LETTER; 197 } 198 199 200 /** 201 * g_unichar_isprint: 202 * @c: a Unicode character 203 * 204 * Determines whether a character is printable. 205 * Unlike g_unichar_isgraph(), returns %TRUE for spaces. 206 * Given some UTF-8 text, obtain a character value with 207 * g_utf8_get_char(). 208 * 209 * Return value: %TRUE if @c is printable 210 **/ 211 gboolean 212 g_unichar_isprint (gunichar c) 213 { 214 return !IS (TYPE(c), 215 OR (G_UNICODE_CONTROL, 216 OR (G_UNICODE_FORMAT, 217 OR (G_UNICODE_UNASSIGNED, 218 OR (G_UNICODE_SURROGATE, 219 0))))); 220 } 221 222 /** 223 * g_unichar_ispunct: 224 * @c: a Unicode character 225 * 226 * Determines whether a character is punctuation or a symbol. 227 * Given some UTF-8 text, obtain a character value with 228 * g_utf8_get_char(). 229 * 230 * Return value: %TRUE if @c is a punctuation or symbol character 231 **/ 232 gboolean 233 g_unichar_ispunct (gunichar c) 234 { 235 return IS (TYPE(c), 236 OR (G_UNICODE_CONNECT_PUNCTUATION, 237 OR (G_UNICODE_DASH_PUNCTUATION, 238 OR (G_UNICODE_CLOSE_PUNCTUATION, 239 OR (G_UNICODE_FINAL_PUNCTUATION, 240 OR (G_UNICODE_INITIAL_PUNCTUATION, 241 OR (G_UNICODE_OTHER_PUNCTUATION, 242 OR (G_UNICODE_OPEN_PUNCTUATION, 243 OR (G_UNICODE_CURRENCY_SYMBOL, 244 OR (G_UNICODE_MODIFIER_SYMBOL, 245 OR (G_UNICODE_MATH_SYMBOL, 246 OR (G_UNICODE_OTHER_SYMBOL, 247 0)))))))))))) ? TRUE : FALSE; 248 } 249 250 /** 251 * g_unichar_isspace: 252 * @c: a Unicode character 253 * 254 * Determines whether a character is a space, tab, or line separator 255 * (newline, carriage return, etc.). Given some UTF-8 text, obtain a 256 * character value with g_utf8_get_char(). 257 * 258 * (Note: don't use this to do word breaking; you have to use 259 * Pango or equivalent to get word breaking right, the algorithm 260 * is fairly complex.) 261 * 262 * Return value: %TRUE if @c is a space character 263 **/ 264 gboolean 265 g_unichar_isspace (gunichar c) 266 { 267 switch (c) 268 { 269 /* special-case these since Unicode thinks they are not spaces */ 270 case '\t': 271 case '\n': 272 case '\r': 273 case '\f': 274 return TRUE; 275 break; 276 277 default: 278 { 279 return IS (TYPE(c), 280 OR (G_UNICODE_SPACE_SEPARATOR, 281 OR (G_UNICODE_LINE_SEPARATOR, 282 OR (G_UNICODE_PARAGRAPH_SEPARATOR, 283 0)))) ? TRUE : FALSE; 284 } 285 break; 286 } 287 } 288 289 /** 290 * g_unichar_ismark: 291 * @c: a Unicode character 292 * 293 * Determines whether a character is a mark (non-spacing mark, 294 * combining mark, or enclosing mark in Unicode speak). 295 * Given some UTF-8 text, obtain a character value 296 * with g_utf8_get_char(). 297 * 298 * Note: in most cases where isalpha characters are allowed, 299 * ismark characters should be allowed to as they are essential 300 * for writing most European languages as well as many non-Latin 301 * scripts. 302 * 303 * Return value: %TRUE if @c is a mark character 304 * 305 * Since: 2.14 306 **/ 307 gboolean 308 g_unichar_ismark (gunichar c) 309 { 310 return ISMARK (TYPE (c)); 311 } 312 313 /** 314 * g_unichar_isupper: 315 * @c: a Unicode character 316 * 317 * Determines if a character is uppercase. 318 * 319 * Return value: %TRUE if @c is an uppercase character 320 **/ 321 gboolean 322 g_unichar_isupper (gunichar c) 323 { 324 return TYPE (c) == G_UNICODE_UPPERCASE_LETTER; 325 } 326 327 /** 328 * g_unichar_istitle: 329 * @c: a Unicode character 330 * 331 * Determines if a character is titlecase. Some characters in 332 * Unicode which are composites, such as the DZ digraph 333 * have three case variants instead of just two. The titlecase 334 * form is used at the beginning of a word where only the 335 * first letter is capitalized. The titlecase form of the DZ 336 * digraph is U+01F2 LATIN CAPITAL LETTTER D WITH SMALL LETTER Z. 337 * 338 * Return value: %TRUE if the character is titlecase 339 **/ 340 gboolean 341 g_unichar_istitle (gunichar c) 342 { 343 unsigned int i; 344 for (i = 0; i < G_N_ELEMENTS (title_table); ++i) 345 if (title_table[i][0] == c) 346 return TRUE; 347 return FALSE; 348 } 349 350 /** 351 * g_unichar_isxdigit: 352 * @c: a Unicode character. 353 * 354 * Determines if a character is a hexidecimal digit. 355 * 356 * Return value: %TRUE if the character is a hexadecimal digit 357 **/ 358 gboolean 359 g_unichar_isxdigit (gunichar c) 360 { 361 return ((c >= 'a' && c <= 'f') 362 || (c >= 'A' && c <= 'F') 363 || (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)); 364 } 365 366 /** 367 * g_unichar_isdefined: 368 * @c: a Unicode character 369 * 370 * Determines if a given character is assigned in the Unicode 371 * standard. 372 * 373 * Return value: %TRUE if the character has an assigned value 374 **/ 375 gboolean 376 g_unichar_isdefined (gunichar c) 377 { 378 return !IS (TYPE(c), 379 OR (G_UNICODE_UNASSIGNED, 380 OR (G_UNICODE_SURROGATE, 381 0))); 382 } 383 384 /** 385 * g_unichar_iszerowidth: 386 * @c: a Unicode character 387 * 388 * Determines if a given character typically takes zero width when rendered. 389 * The return value is %TRUE for all non-spacing and enclosing marks 390 * (e.g., combining accents), format characters, zero-width 391 * space, but not U+00AD SOFT HYPHEN. 392 * 393 * A typical use of this function is with one of g_unichar_iswide() or 394 * g_unichar_iswide_cjk() to determine the number of cells a string occupies 395 * when displayed on a grid display (terminals). However, note that not all 396 * terminals support zero-width rendering of zero-width marks. 397 * 398 * Return value: %TRUE if the character has zero width 399 * 400 * Since: 2.14 401 **/ 402 gboolean 403 g_unichar_iszerowidth (gunichar c) 404 { 405 if (G_UNLIKELY (c == 0x00AD)) 406 return FALSE; 407 408 if (G_UNLIKELY (ISZEROWIDTHTYPE (TYPE (c)))) 409 return TRUE; 410 411 if (G_UNLIKELY ((c >= 0x1160 && c < 0x1200) || 412 c == 0x200B)) 413 return TRUE; 414 415 return FALSE; 416 } 417 418 struct Interval 419 { 420 gunichar start, end; 421 }; 422 423 static int 424 interval_compare (const void *key, const void *elt) 425 { 426 gunichar c = GPOINTER_TO_UINT (key); 427 struct Interval *interval = (struct Interval *)elt; 428 429 if (c < interval->start) 430 return -1; 431 if (c > interval->end) 432 return +1; 433 434 return 0; 435 } 436 437 /** 438 * g_unichar_iswide: 439 * @c: a Unicode character 440 * 441 * Determines if a character is typically rendered in a double-width 442 * cell. 443 * 444 * Return value: %TRUE if the character is wide 445 **/ 446 gboolean 447 g_unichar_iswide (gunichar c) 448 { 449 /* sorted list of intervals of East_Asian_Width = W and F characters 450 * from Unicode 5.1.0. produced by mungling output of: 451 * grep ';[FW]\>' EastAsianWidth.txt */ 452 static const struct Interval wide[] = { 453 {0x1100, 0x1159}, {0x115F, 0x115F}, {0x2329, 0x232A}, {0x2E80, 0x2E99}, 454 {0x2E9B, 0x2EF3}, {0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3000, 0x303E}, 455 {0x3041, 0x3096}, {0x3099, 0x30FF}, {0x3105, 0x312D}, {0x3131, 0x318E}, 456 {0x3190, 0x31B7}, {0x31C0, 0x31E3}, {0x31F0, 0x321E}, {0x3220, 0x3243}, 457 {0x3250, 0x32FE}, {0x3300, 0x4DB5}, {0x4E00, 0x9FC3}, {0xA000, 0xA48C}, 458 {0xA490, 0xA4C6}, {0xAC00, 0xD7A3}, {0xF900, 0xFA2D}, {0xFA30, 0xFA6A}, 459 {0xFA70, 0xFAD9}, {0xFE10, 0xFE19}, {0xFE30, 0xFE52}, {0xFE54, 0xFE66}, 460 {0xFE68, 0xFE6B}, {0xFF01, 0xFF60}, {0xFFE0, 0xFFE6}, {0x20000, 0x2FFFD}, 461 {0x30000, 0x3FFFD} 462 }; 463 464 if (bsearch (GUINT_TO_POINTER (c), wide, G_N_ELEMENTS (wide), sizeof wide[0], 465 interval_compare)) 466 return TRUE; 467 468 return FALSE; 469 } 470 471 472 /** 473 * g_unichar_iswide_cjk: 474 * @c: a Unicode character 475 * 476 * Determines if a character is typically rendered in a double-width 477 * cell under legacy East Asian locales. If a character is wide according to 478 * g_unichar_iswide(), then it is also reported wide with this function, but 479 * the converse is not necessarily true. See the 480 * <ulink url="http://www.unicode.org/reports/tr11/">Unicode Standard 481 * Annex #11</ulink> for details. 482 * 483 * If a character passes the g_unichar_iswide() test then it will also pass 484 * this test, but not the other way around. Note that some characters may 485 * pas both this test and g_unichar_iszerowidth(). 486 * 487 * Return value: %TRUE if the character is wide in legacy East Asian locales 488 * 489 * Since: 2.12 490 */ 491 gboolean 492 g_unichar_iswide_cjk (gunichar c) 493 { 494 /* sorted list of intervals of East_Asian_Width = A and F characters 495 * from Unicode 5.1.0. produced by mungling output of: 496 * grep ';[A]\>' EastAsianWidth.txt */ 497 static const struct Interval ambiguous[] = { 498 {0x00A1, 0x00A1}, {0x00A4, 0x00A4}, {0x00A7, 0x00A8}, {0x00AA, 0x00AA}, 499 {0x00AD, 0x00AE}, {0x00B0, 0x00B4}, {0x00B6, 0x00BA}, {0x00BC, 0x00BF}, 500 {0x00C6, 0x00C6}, {0x00D0, 0x00D0}, {0x00D7, 0x00D8}, {0x00DE, 0x00E1}, 501 {0x00E6, 0x00E6}, {0x00E8, 0x00EA}, {0x00EC, 0x00ED}, {0x00F0, 0x00F0}, 502 {0x00F2, 0x00F3}, {0x00F7, 0x00FA}, {0x00FC, 0x00FC}, {0x00FE, 0x00FE}, 503 {0x0101, 0x0101}, {0x0111, 0x0111}, {0x0113, 0x0113}, {0x011B, 0x011B}, 504 {0x0126, 0x0127}, {0x012B, 0x012B}, {0x0131, 0x0133}, {0x0138, 0x0138}, 505 {0x013F, 0x0142}, {0x0144, 0x0144}, {0x0148, 0x014B}, {0x014D, 0x014D}, 506 {0x0152, 0x0153}, {0x0166, 0x0167}, {0x016B, 0x016B}, {0x01CE, 0x01CE}, 507 {0x01D0, 0x01D0}, {0x01D2, 0x01D2}, {0x01D4, 0x01D4}, {0x01D6, 0x01D6}, 508 {0x01D8, 0x01D8}, {0x01DA, 0x01DA}, {0x01DC, 0x01DC}, {0x0251, 0x0251}, 509 {0x0261, 0x0261}, {0x02C4, 0x02C4}, {0x02C7, 0x02C7}, {0x02C9, 0x02CB}, 510 {0x02CD, 0x02CD}, {0x02D0, 0x02D0}, {0x02D8, 0x02DB}, {0x02DD, 0x02DD}, 511 {0x02DF, 0x02DF}, {0x0300, 0x036F}, {0x0391, 0x03A1}, {0x03A3, 0x03A9}, 512 {0x03B1, 0x03C1}, {0x03C3, 0x03C9}, {0x0401, 0x0401}, {0x0410, 0x044F}, 513 {0x0451, 0x0451}, {0x2010, 0x2010}, {0x2013, 0x2016}, {0x2018, 0x2019}, 514 {0x201C, 0x201D}, {0x2020, 0x2022}, {0x2024, 0x2027}, {0x2030, 0x2030}, 515 {0x2032, 0x2033}, {0x2035, 0x2035}, {0x203B, 0x203B}, {0x203E, 0x203E}, 516 {0x2074, 0x2074}, {0x207F, 0x207F}, {0x2081, 0x2084}, {0x20AC, 0x20AC}, 517 {0x2103, 0x2103}, {0x2105, 0x2105}, {0x2109, 0x2109}, {0x2113, 0x2113}, 518 {0x2116, 0x2116}, {0x2121, 0x2122}, {0x2126, 0x2126}, {0x212B, 0x212B}, 519 {0x2153, 0x2154}, {0x215B, 0x215E}, {0x2160, 0x216B}, {0x2170, 0x2179}, 520 {0x2190, 0x2199}, {0x21B8, 0x21B9}, {0x21D2, 0x21D2}, {0x21D4, 0x21D4}, 521 {0x21E7, 0x21E7}, {0x2200, 0x2200}, {0x2202, 0x2203}, {0x2207, 0x2208}, 522 {0x220B, 0x220B}, {0x220F, 0x220F}, {0x2211, 0x2211}, {0x2215, 0x2215}, 523 {0x221A, 0x221A}, {0x221D, 0x2220}, {0x2223, 0x2223}, {0x2225, 0x2225}, 524 {0x2227, 0x222C}, {0x222E, 0x222E}, {0x2234, 0x2237}, {0x223C, 0x223D}, 525 {0x2248, 0x2248}, {0x224C, 0x224C}, {0x2252, 0x2252}, {0x2260, 0x2261}, 526 {0x2264, 0x2267}, {0x226A, 0x226B}, {0x226E, 0x226F}, {0x2282, 0x2283}, 527 {0x2286, 0x2287}, {0x2295, 0x2295}, {0x2299, 0x2299}, {0x22A5, 0x22A5}, 528 {0x22BF, 0x22BF}, {0x2312, 0x2312}, {0x2460, 0x24E9}, {0x24EB, 0x254B}, 529 {0x2550, 0x2573}, {0x2580, 0x258F}, {0x2592, 0x2595}, {0x25A0, 0x25A1}, 530 {0x25A3, 0x25A9}, {0x25B2, 0x25B3}, {0x25B6, 0x25B7}, {0x25BC, 0x25BD}, 531 {0x25C0, 0x25C1}, {0x25C6, 0x25C8}, {0x25CB, 0x25CB}, {0x25CE, 0x25D1}, 532 {0x25E2, 0x25E5}, {0x25EF, 0x25EF}, {0x2605, 0x2606}, {0x2609, 0x2609}, 533 {0x260E, 0x260F}, {0x2614, 0x2615}, {0x261C, 0x261C}, {0x261E, 0x261E}, 534 {0x2640, 0x2640}, {0x2642, 0x2642}, {0x2660, 0x2661}, {0x2663, 0x2665}, 535 {0x2667, 0x266A}, {0x266C, 0x266D}, {0x266F, 0x266F}, {0x273D, 0x273D}, 536 {0x2776, 0x277F}, {0xE000, 0xF8FF}, {0xFE00, 0xFE0F}, {0xFFFD, 0xFFFD}, 537 {0xE0100, 0xE01EF}, {0xF0000, 0xFFFFD}, {0x100000, 0x10FFFD} 538 }; 539 540 if (g_unichar_iswide (c)) 541 return TRUE; 542 543 if (bsearch (GUINT_TO_POINTER (c), ambiguous, G_N_ELEMENTS (ambiguous), sizeof ambiguous[0], 544 interval_compare)) 545 return TRUE; 546 547 return FALSE; 548 } 549 550 551 /** 552 * g_unichar_toupper: 553 * @c: a Unicode character 554 * 555 * Converts a character to uppercase. 556 * 557 * Return value: the result of converting @c to uppercase. 558 * If @c is not an lowercase or titlecase character, 559 * or has no upper case equivalent @c is returned unchanged. 560 **/ 561 gunichar 562 g_unichar_toupper (gunichar c) 563 { 564 int t = TYPE (c); 565 if (t == G_UNICODE_LOWERCASE_LETTER) 566 { 567 gunichar val = ATTTABLE (c >> 8, c & 0xff); 568 if (val >= 0x1000000) 569 { 570 const gchar *p = special_case_table + val - 0x1000000; 571 val = g_utf8_get_char (p); 572 } 573 /* Some lowercase letters, e.g., U+000AA, FEMININE ORDINAL INDICATOR, 574 * do not have an uppercase equivalent, in which case val will be 575 * zero. 576 */ 577 return val ? val : c; 578 } 579 else if (t == G_UNICODE_TITLECASE_LETTER) 580 { 581 unsigned int i; 582 for (i = 0; i < G_N_ELEMENTS (title_table); ++i) 583 { 584 if (title_table[i][0] == c) 585 return title_table[i][1]; 586 } 587 } 588 return c; 589 } 590 591 /** 592 * g_unichar_tolower: 593 * @c: a Unicode character. 594 * 595 * Converts a character to lower case. 596 * 597 * Return value: the result of converting @c to lower case. 598 * If @c is not an upperlower or titlecase character, 599 * or has no lowercase equivalent @c is returned unchanged. 600 **/ 601 gunichar 602 g_unichar_tolower (gunichar c) 603 { 604 int t = TYPE (c); 605 if (t == G_UNICODE_UPPERCASE_LETTER) 606 { 607 gunichar val = ATTTABLE (c >> 8, c & 0xff); 608 if (val >= 0x1000000) 609 { 610 const gchar *p = special_case_table + val - 0x1000000; 611 return g_utf8_get_char (p); 612 } 613 else 614 { 615 /* Not all uppercase letters are guaranteed to have a lowercase 616 * equivalent. If this is the case, val will be zero. */ 617 return val ? val : c; 618 } 619 } 620 else if (t == G_UNICODE_TITLECASE_LETTER) 621 { 622 unsigned int i; 623 for (i = 0; i < G_N_ELEMENTS (title_table); ++i) 624 { 625 if (title_table[i][0] == c) 626 return title_table[i][2]; 627 } 628 } 629 return c; 630 } 631 632 /** 633 * g_unichar_totitle: 634 * @c: a Unicode character 635 * 636 * Converts a character to the titlecase. 637 * 638 * Return value: the result of converting @c to titlecase. 639 * If @c is not an uppercase or lowercase character, 640 * @c is returned unchanged. 641 **/ 642 gunichar 643 g_unichar_totitle (gunichar c) 644 { 645 unsigned int i; 646 for (i = 0; i < G_N_ELEMENTS (title_table); ++i) 647 { 648 if (title_table[i][0] == c || title_table[i][1] == c 649 || title_table[i][2] == c) 650 return title_table[i][0]; 651 } 652 653 if (TYPE (c) == G_UNICODE_LOWERCASE_LETTER) 654 return g_unichar_toupper (c); 655 656 return c; 657 } 658 659 /** 660 * g_unichar_digit_value: 661 * @c: a Unicode character 662 * 663 * Determines the numeric value of a character as a decimal 664 * digit. 665 * 666 * Return value: If @c is a decimal digit (according to 667 * g_unichar_isdigit()), its numeric value. Otherwise, -1. 668 **/ 669 int 670 g_unichar_digit_value (gunichar c) 671 { 672 if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER) 673 return ATTTABLE (c >> 8, c & 0xff); 674 return -1; 675 } 676 677 /** 678 * g_unichar_xdigit_value: 679 * @c: a Unicode character 680 * 681 * Determines the numeric value of a character as a hexidecimal 682 * digit. 683 * 684 * Return value: If @c is a hex digit (according to 685 * g_unichar_isxdigit()), its numeric value. Otherwise, -1. 686 **/ 687 int 688 g_unichar_xdigit_value (gunichar c) 689 { 690 if (c >= 'A' && c <= 'F') 691 return c - 'A' + 10; 692 if (c >= 'a' && c <= 'f') 693 return c - 'a' + 10; 694 if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER) 695 return ATTTABLE (c >> 8, c & 0xff); 696 return -1; 697 } 698 699 /** 700 * g_unichar_type: 701 * @c: a Unicode character 702 * 703 * Classifies a Unicode character by type. 704 * 705 * Return value: the type of the character. 706 **/ 707 GUnicodeType 708 g_unichar_type (gunichar c) 709 { 710 return TYPE (c); 711 } 712 713 /* 714 * Case mapping functions 715 */ 716 717 typedef enum { 718 LOCALE_NORMAL, 719 LOCALE_TURKIC, 720 LOCALE_LITHUANIAN 721 } LocaleType; 722 723 static LocaleType 724 get_locale_type (void) 725 { 726 #ifdef G_OS_WIN32 727 char *tem = g_win32_getlocale (); 728 char locale[2]; 729 730 locale[0] = tem[0]; 731 locale[1] = tem[1]; 732 g_free (tem); 733 #else 734 const char *locale = setlocale (LC_CTYPE, NULL); 735 #endif 736 737 switch (locale[0]) 738 { 739 case 'a': 740 if (locale[1] == 'z') 741 return LOCALE_TURKIC; 742 break; 743 case 'l': 744 if (locale[1] == 't') 745 return LOCALE_LITHUANIAN; 746 break; 747 case 't': 748 if (locale[1] == 'r') 749 return LOCALE_TURKIC; 750 break; 751 } 752 753 return LOCALE_NORMAL; 754 } 755 756 static gint 757 output_marks (const char **p_inout, 758 char *out_buffer, 759 gboolean remove_dot) 760 { 761 const char *p = *p_inout; 762 gint len = 0; 763 764 while (*p) 765 { 766 gunichar c = g_utf8_get_char (p); 767 768 if (ISMARK (TYPE (c))) 769 { 770 if (!remove_dot || c != 0x307 /* COMBINING DOT ABOVE */) 771 len += g_unichar_to_utf8 (c, out_buffer ? out_buffer + len : NULL); 772 p = g_utf8_next_char (p); 773 } 774 else 775 break; 776 } 777 778 *p_inout = p; 779 return len; 780 } 781 782 static gint 783 output_special_case (gchar *out_buffer, 784 int offset, 785 int type, 786 int which) 787 { 788 const gchar *p = special_case_table + offset; 789 gint len; 790 791 if (type != G_UNICODE_TITLECASE_LETTER) 792 p = g_utf8_next_char (p); 793 794 if (which == 1) 795 p += strlen (p) + 1; 796 797 len = strlen (p); 798 if (out_buffer) 799 memcpy (out_buffer, p, len); 800 801 return len; 802 } 803 804 static gsize 805 real_toupper (const gchar *str, 806 gssize max_len, 807 gchar *out_buffer, 808 LocaleType locale_type) 809 { 810 const gchar *p = str; 811 const char *last = NULL; 812 gsize len = 0; 813 gboolean last_was_i = FALSE; 814 815 while ((max_len < 0 || p < str + max_len) && *p) 816 { 817 gunichar c = g_utf8_get_char (p); 818 int t = TYPE (c); 819 gunichar val; 820 821 last = p; 822 p = g_utf8_next_char (p); 823 824 if (locale_type == LOCALE_LITHUANIAN) 825 { 826 if (c == 'i') 827 last_was_i = TRUE; 828 else 829 { 830 if (last_was_i) 831 { 832 /* Nasty, need to remove any dot above. Though 833 * I think only E WITH DOT ABOVE occurs in practice 834 * which could simplify this considerably. 835 */ 836 gsize decomp_len, i; 837 gunichar *decomp; 838 839 decomp = g_unicode_canonical_decomposition (c, &decomp_len); 840 for (i=0; i < decomp_len; i++) 841 { 842 if (decomp[i] != 0x307 /* COMBINING DOT ABOVE */) 843 len += g_unichar_to_utf8 (g_unichar_toupper (decomp[i]), out_buffer ? out_buffer + len : NULL); 844 } 845 g_free (decomp); 846 847 len += output_marks (&p, out_buffer ? out_buffer + len : NULL, TRUE); 848 849 continue; 850 } 851 852 if (!ISMARK (t)) 853 last_was_i = FALSE; 854 } 855 } 856 857 if (locale_type == LOCALE_TURKIC && c == 'i') 858 { 859 /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE */ 860 len += g_unichar_to_utf8 (0x130, out_buffer ? out_buffer + len : NULL); 861 } 862 else if (c == 0x0345) /* COMBINING GREEK YPOGEGRAMMENI */ 863 { 864 /* Nasty, need to move it after other combining marks .. this would go away if 865 * we normalized first. 866 */ 867 len += output_marks (&p, out_buffer ? out_buffer + len : NULL, FALSE); 868 869 /* And output as GREEK CAPITAL LETTER IOTA */ 870 len += g_unichar_to_utf8 (0x399, out_buffer ? out_buffer + len : NULL); 871 } 872 else if (IS (t, 873 OR (G_UNICODE_LOWERCASE_LETTER, 874 OR (G_UNICODE_TITLECASE_LETTER, 875 0)))) 876 { 877 val = ATTTABLE (c >> 8, c & 0xff); 878 879 if (val >= 0x1000000) 880 { 881 len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t, 882 t == G_UNICODE_LOWERCASE_LETTER ? 0 : 1); 883 } 884 else 885 { 886 if (t == G_UNICODE_TITLECASE_LETTER) 887 { 888 unsigned int i; 889 for (i = 0; i < G_N_ELEMENTS (title_table); ++i) 890 { 891 if (title_table[i][0] == c) 892 { 893 val = title_table[i][1]; 894 break; 895 } 896 } 897 } 898 899 /* Some lowercase letters, e.g., U+000AA, FEMININE ORDINAL INDICATOR, 900 * do not have an uppercase equivalent, in which case val will be 901 * zero. */ 902 len += g_unichar_to_utf8 (val ? val : c, out_buffer ? out_buffer + len : NULL); 903 } 904 } 905 else 906 { 907 gsize char_len = g_utf8_skip[*(guchar *)last]; 908 909 if (out_buffer) 910 memcpy (out_buffer + len, last, char_len); 911 912 len += char_len; 913 } 914 915 } 916 917 return len; 918 } 919 920 /** 921 * g_utf8_strup: 922 * @str: a UTF-8 encoded string 923 * @len: length of @str, in bytes, or -1 if @str is nul-terminated. 924 * 925 * Converts all Unicode characters in the string that have a case 926 * to uppercase. The exact manner that this is done depends 927 * on the current locale, and may result in the number of 928 * characters in the string increasing. (For instance, the 929 * German ess-zet will be changed to SS.) 930 * 931 * Return value: a newly allocated string, with all characters 932 * converted to uppercase. 933 **/ 934 gchar * 935 g_utf8_strup (const gchar *str, 936 gssize len) 937 { 938 gsize result_len; 939 LocaleType locale_type; 940 gchar *result; 941 942 g_return_val_if_fail (str != NULL, NULL); 943 944 locale_type = get_locale_type (); 945 946 /* 947 * We use a two pass approach to keep memory management simple 948 */ 949 result_len = real_toupper (str, len, NULL, locale_type); 950 result = g_malloc (result_len + 1); 951 real_toupper (str, len, result, locale_type); 952 result[result_len] = '\0'; 953 954 return result; 955 } 956 957 /* traverses the string checking for characters with combining class == 230 958 * until a base character is found */ 959 static gboolean 960 has_more_above (const gchar *str) 961 { 962 const gchar *p = str; 963 gint combining_class; 964 965 while (*p) 966 { 967 combining_class = g_unichar_combining_class (g_utf8_get_char (p)); 968 if (combining_class == 230) 969 return TRUE; 970 else if (combining_class == 0) 971 break; 972 973 p = g_utf8_next_char (p); 974 } 975 976 return FALSE; 977 } 978 979 static gsize 980 real_tolower (const gchar *str, 981 gssize max_len, 982 gchar *out_buffer, 983 LocaleType locale_type) 984 { 985 const gchar *p = str; 986 const char *last = NULL; 987 gsize len = 0; 988 989 while ((max_len < 0 || p < str + max_len) && *p) 990 { 991 gunichar c = g_utf8_get_char (p); 992 int t = TYPE (c); 993 gunichar val; 994 995 last = p; 996 p = g_utf8_next_char (p); 997 998 if (locale_type == LOCALE_TURKIC && c == 'I') 999 { 1000 if (g_utf8_get_char (p) == 0x0307) 1001 { 1002 /* I + COMBINING DOT ABOVE => i (U+0069) */ 1003 len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL); 1004 p = g_utf8_next_char (p); 1005 } 1006 else 1007 { 1008 /* I => LATIN SMALL LETTER DOTLESS I */ 1009 len += g_unichar_to_utf8 (0x131, out_buffer ? out_buffer + len : NULL); 1010 } 1011 } 1012 /* Introduce an explicit dot above when lowercasing capital I's and J's 1013 * whenever there are more accents above. [SpecialCasing.txt] */ 1014 else if (locale_type == LOCALE_LITHUANIAN && 1015 (c == 0x00cc || c == 0x00cd || c == 0x0128)) 1016 { 1017 len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL); 1018 len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL); 1019 1020 switch (c) 1021 { 1022 case 0x00cc: 1023 len += g_unichar_to_utf8 (0x0300, out_buffer ? out_buffer + len : NULL); 1024 break; 1025 case 0x00cd: 1026 len += g_unichar_to_utf8 (0x0301, out_buffer ? out_buffer + len : NULL); 1027 break; 1028 case 0x0128: 1029 len += g_unichar_to_utf8 (0x0303, out_buffer ? out_buffer + len : NULL); 1030 break; 1031 } 1032 } 1033 else if (locale_type == LOCALE_LITHUANIAN && 1034 (c == 'I' || c == 'J' || c == 0x012e) && 1035 has_more_above (p)) 1036 { 1037 len += g_unichar_to_utf8 (g_unichar_tolower (c), out_buffer ? out_buffer + len : NULL); 1038 len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL); 1039 } 1040 else if (c == 0x03A3) /* GREEK CAPITAL LETTER SIGMA */ 1041 { 1042 if ((max_len < 0 || p < str + max_len) && *p) 1043 { 1044 gunichar next_c = g_utf8_get_char (p); 1045 int next_type = TYPE(next_c); 1046 1047 /* SIGMA mapps differently depending on whether it is 1048 * final or not. The following simplified test would 1049 * fail in the case of combining marks following the 1050 * sigma, but I don't think that occurs in real text. 1051 * The test here matches that in ICU. 1052 */ 1053 if (ISALPHA (next_type)) /* Lu,Ll,Lt,Lm,Lo */ 1054 val = 0x3c3; /* GREEK SMALL SIGMA */ 1055 else 1056 val = 0x3c2; /* GREEK SMALL FINAL SIGMA */ 1057 } 1058 else 1059 val = 0x3c2; /* GREEK SMALL FINAL SIGMA */ 1060 1061 len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL); 1062 } 1063 else if (IS (t, 1064 OR (G_UNICODE_UPPERCASE_LETTER, 1065 OR (G_UNICODE_TITLECASE_LETTER, 1066 0)))) 1067 { 1068 val = ATTTABLE (c >> 8, c & 0xff); 1069 1070 if (val >= 0x1000000) 1071 { 1072 len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t, 0); 1073 } 1074 else 1075 { 1076 if (t == G_UNICODE_TITLECASE_LETTER) 1077 { 1078 unsigned int i; 1079 for (i = 0; i < G_N_ELEMENTS (title_table); ++i) 1080 { 1081 if (title_table[i][0] == c) 1082 { 1083 val = title_table[i][2]; 1084 break; 1085 } 1086 } 1087 } 1088 1089 /* Not all uppercase letters are guaranteed to have a lowercase 1090 * equivalent. If this is the case, val will be zero. */ 1091 len += g_unichar_to_utf8 (val ? val : c, out_buffer ? out_buffer + len : NULL); 1092 } 1093 } 1094 else 1095 { 1096 gsize char_len = g_utf8_skip[*(guchar *)last]; 1097 1098 if (out_buffer) 1099 memcpy (out_buffer + len, last, char_len); 1100 1101 len += char_len; 1102 } 1103 1104 } 1105 1106 return len; 1107 } 1108 1109 /** 1110 * g_utf8_strdown: 1111 * @str: a UTF-8 encoded string 1112 * @len: length of @str, in bytes, or -1 if @str is nul-terminated. 1113 * 1114 * Converts all Unicode characters in the string that have a case 1115 * to lowercase. The exact manner that this is done depends 1116 * on the current locale, and may result in the number of 1117 * characters in the string changing. 1118 * 1119 * Return value: a newly allocated string, with all characters 1120 * converted to lowercase. 1121 **/ 1122 gchar * 1123 g_utf8_strdown (const gchar *str, 1124 gssize len) 1125 { 1126 gsize result_len; 1127 LocaleType locale_type; 1128 gchar *result; 1129 1130 g_return_val_if_fail (str != NULL, NULL); 1131 1132 locale_type = get_locale_type (); 1133 1134 /* 1135 * We use a two pass approach to keep memory management simple 1136 */ 1137 result_len = real_tolower (str, len, NULL, locale_type); 1138 result = g_malloc (result_len + 1); 1139 real_tolower (str, len, result, locale_type); 1140 result[result_len] = '\0'; 1141 1142 return result; 1143 } 1144 1145 /** 1146 * g_utf8_casefold: 1147 * @str: a UTF-8 encoded string 1148 * @len: length of @str, in bytes, or -1 if @str is nul-terminated. 1149 * 1150 * Converts a string into a form that is independent of case. The 1151 * result will not correspond to any particular case, but can be 1152 * compared for equality or ordered with the results of calling 1153 * g_utf8_casefold() on other strings. 1154 * 1155 * Note that calling g_utf8_casefold() followed by g_utf8_collate() is 1156 * only an approximation to the correct linguistic case insensitive 1157 * ordering, though it is a fairly good one. Getting this exactly 1158 * right would require a more sophisticated collation function that 1159 * takes case sensitivity into account. GLib does not currently 1160 * provide such a function. 1161 * 1162 * Return value: a newly allocated string, that is a 1163 * case independent form of @str. 1164 **/ 1165 gchar * 1166 g_utf8_casefold (const gchar *str, 1167 gssize len) 1168 { 1169 GString *result; 1170 const char *p; 1171 1172 g_return_val_if_fail (str != NULL, NULL); 1173 1174 result = g_string_new (NULL); 1175 p = str; 1176 while ((len < 0 || p < str + len) && *p) 1177 { 1178 gunichar ch = g_utf8_get_char (p); 1179 1180 int start = 0; 1181 int end = G_N_ELEMENTS (casefold_table); 1182 1183 if (ch >= casefold_table[start].ch && 1184 ch <= casefold_table[end - 1].ch) 1185 { 1186 while (TRUE) 1187 { 1188 int half = (start + end) / 2; 1189 if (ch == casefold_table[half].ch) 1190 { 1191 g_string_append (result, casefold_table[half].data); 1192 goto next; 1193 } 1194 else if (half == start) 1195 break; 1196 else if (ch > casefold_table[half].ch) 1197 start = half; 1198 else 1199 end = half; 1200 } 1201 } 1202 1203 g_string_append_unichar (result, g_unichar_tolower (ch)); 1204 1205 next: 1206 p = g_utf8_next_char (p); 1207 } 1208 1209 return g_string_free (result, FALSE); 1210 } 1211 1212 /** 1213 * g_unichar_get_mirror_char: 1214 * @ch: a Unicode character 1215 * @mirrored_ch: location to store the mirrored character 1216 * 1217 * In Unicode, some characters are <firstterm>mirrored</firstterm>. This 1218 * means that their images are mirrored horizontally in text that is laid 1219 * out from right to left. For instance, "(" would become its mirror image, 1220 * ")", in right-to-left text. 1221 * 1222 * If @ch has the Unicode mirrored property and there is another unicode 1223 * character that typically has a glyph that is the mirror image of @ch's 1224 * glyph and @mirrored_ch is set, it puts that character in the address 1225 * pointed to by @mirrored_ch. Otherwise the original character is put. 1226 * 1227 * Return value: %TRUE if @ch has a mirrored character, %FALSE otherwise 1228 * 1229 * Since: 2.4 1230 **/ 1231 gboolean 1232 g_unichar_get_mirror_char (gunichar ch, 1233 gunichar *mirrored_ch) 1234 { 1235 gboolean found; 1236 gunichar mirrored; 1237 1238 mirrored = GLIB_GET_MIRRORING(ch); 1239 1240 found = ch != mirrored; 1241 if (mirrored_ch) 1242 *mirrored_ch = mirrored; 1243 1244 return found; 1245 1246 } 1247 1248 #define G_SCRIPT_TABLE_MIDPOINT (G_N_ELEMENTS (g_script_table) / 2) 1249 1250 static inline GUnicodeScript 1251 g_unichar_get_script_bsearch (gunichar ch) 1252 { 1253 int lower = 0; 1254 int upper = G_N_ELEMENTS (g_script_table) - 1; 1255 static int saved_mid = G_SCRIPT_TABLE_MIDPOINT; 1256 int mid = saved_mid; 1257 1258 1259 do 1260 { 1261 if (ch < g_script_table[mid].start) 1262 upper = mid - 1; 1263 else if (ch >= g_script_table[mid].start + g_script_table[mid].chars) 1264 lower = mid + 1; 1265 else 1266 return g_script_table[saved_mid = mid].script; 1267 1268 mid = (lower + upper) / 2; 1269 } 1270 while (lower <= upper); 1271 1272 return G_UNICODE_SCRIPT_UNKNOWN; 1273 } 1274 1275 /** 1276 * g_unichar_get_script: 1277 * @ch: a Unicode character 1278 * 1279 * Looks up the #GUnicodeScript for a particular character (as defined 1280 * by Unicode Standard Annex #24). No check is made for @ch being a 1281 * valid Unicode character; if you pass in invalid character, the 1282 * result is undefined. 1283 * 1284 * This function is equivalent to pango_script_for_unichar() and the 1285 * two are interchangeable. 1286 * 1287 * Return value: the #GUnicodeScript for the character. 1288 * 1289 * Since: 2.14 1290 */ 1291 GUnicodeScript 1292 g_unichar_get_script (gunichar ch) 1293 { 1294 if (ch < G_EASY_SCRIPTS_RANGE) 1295 return g_script_easy_table[ch]; 1296 else 1297 return g_unichar_get_script_bsearch (ch); 1298 } 1299 1300 1301 #define __G_UNIPROP_C__ 1302 #include "galiasdef.c" 1303