1 /* 2 * Copyright (C) 2007 Apple Computer, Inc. 3 * 4 * Portions are Copyright (C) 1998 Netscape Communications Corporation. 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 * 20 * Alternatively, the contents of this file may be used under the terms 21 * of either the Mozilla Public License Version 1.1, found at 22 * http://www.mozilla.org/MPL/ (the "MPL") or the GNU General Public 23 * License Version 2.0, found at http://www.fsf.org/copyleft/gpl.html 24 * (the "GPL"), in which case the provisions of the MPL or the GPL are 25 * applicable instead of those above. If you wish to allow use of your 26 * version of this file only under the terms of one of those two 27 * licenses (the MPL or the GPL) and not to allow others to use your 28 * version of this file under the LGPL, indicate your decision by 29 * deletingthe provisions above and replace them with the notice and 30 * other provisions required by the MPL or the GPL, as the case may be. 31 * If you do not delete the provisions above, a recipient may use your 32 * version of this file under any of the LGPL, the MPL or the GPL. 33 */ 34 35 #include "config.h" 36 #include "UnicodeRange.h" 37 38 namespace WebCore { 39 40 // This table depends on unicode range definitions. 
// Each item's index must correspond to a unicode range value
// e.g. x-cyrillic = LangGroupTable[cRangeCyrillic]
// langGroupFromUnicodeRange() indexes this table directly with a range value.
static const char* gUnicodeRangeToLangGroupTable[] =
{
    "x-cyrillic",
    "el",
    "tr",
    "he",
    "ar",
    "x-baltic",
    "th",
    "ko",
    "ja",
    "zh-CN",
    "zh-TW",
    "x-devanagari",
    "x-tamil",
    "x-armn",
    "x-beng",
    "x-cans",
    "x-ethi",
    "x-geor",
    "x-gujr",
    "x-guru",
    "x-khmr",
    "x-mlym"
};

/**********************************************************************
 * Unicode subranges as defined in unicode 3.0
 * x-western, x-central-euro, tr, x-baltic -> latin
 *  0000 - 036f
 *  1e00 - 1eff
 *  2000 - 206f  (general punctuation)
 *  20a0 - 20cf  (currency symbols)
 *  2100 - 214f  (letterlike symbols)
 *  2150 - 218f  (Number Forms)
 * el -> greek
 *  0370 - 03ff
 *  1f00 - 1fff
 * x-cyrillic -> cyrillic
 *  0400 - 04ff
 * he -> hebrew
 *  0590 - 05ff
 * ar -> arabic
 *  0600 - 06ff
 *  fb50 - fdff (arabic presentation forms)
 *  fe70 - feff (arabic presentation forms b)
 * th - thai
 *  0e00 - 0e7f
 * ko -> korean
 *  ac00 - d7af  (hangul Syllables)
 *  1100 - 11ff  (jamo)
 *  3130 - 318f  (hangul compatibility jamo)
 * ja
 *  3040 - 309f  (hiragana)
 *  30a0 - 30ff  (katakana)
 * zh-CN
 * zh-TW
 *
 * CJK
 *  3100 - 312f (bopomofo)
 *  31a0 - 31bf (bopomofo extended)
 *  3000 - 303f (CJK Symbols and Punctuation)
 *  2e80 - 2eff (CJK radicals supplement)
 *  2f00 - 2fdf (Kangxi Radicals)
 *  2ff0 - 2fff (Ideographic Description Characters)
 *  3190 - 319f (kanbun)
 *  3200 - 32ff (Enclosed CJK letters and Months)
 *  3300 - 33ff (CJK compatibility)
 *  3400 - 4dbf (CJK Unified Ideographs Extension A)
 *  4e00 - 9faf (CJK Unified Ideographs)
 *  f900 - fa5f (CJK Compatibility Ideographs)
 *  fe30 - fe4f (CJK compatibility Forms)
 *  ff00 - ffef (halfwidth and fullwidth forms)
 *
 * Armenian
 *  0530 - 058f
 * Syriac
 *  0700 - 074f
 * Thaana
 *  0780 - 07bf
 * Devanagari
 *  0900 - 097f
 * Bengali
 *  0980 - 09ff
 * Gurmukhi
 *  0a00 - 0a7f
 * Gujarati
 *  0a80 - 0aff
 * Oriya
 *  0b00 - 0b7f
 * Tamil
 *  0b80 - 0bff
 * Telugu
 *  0c00 - 0c7f
 * Kannada
 *  0c80 - 0cff
 * Malayalam
 *  0d00 - 0d7f
 * Sinhala
 *  0d80 - 0def
 * Lao
 *  0e80 - 0eff
 * Tibetan
 *  0f00 - 0fbf
 * Myanmar
 *  1000 - 109f
 * Georgian
 *  10a0 - 10ff
 * Ethiopic
 *  1200 - 137f
 * Cherokee
 *  13a0 - 13ff
 * Canadian Aboriginal Syllabics
 *  1400 - 167f
 * Ogham
 *  1680 - 169f
 * Runic
 *  16a0 - 16ff
 * Khmer
 *  1780 - 17ff
 * Mongolian
 *  1800 - 18af
 * Misc - superscripts and subscripts
 *  2070 - 209f
 * Misc - Combining Diacritical Marks for Symbols
 *  20d0 - 20ff
 * Misc - Arrows
 *  2190 - 21ff
 * Misc - Mathematical Operators
 *  2200 - 22ff
 * Misc - Miscellaneous Technical
 *  2300 - 23ff
 * Misc - Control picture
 *  2400 - 243f
 * Misc - Optical character recognition
 *  2440 - 2450
 * Misc - Enclosed Alphanumerics
 *  2460 - 24ff
 * Misc - Box Drawing
 *  2500 - 257f
 * Misc - Block Elements
 *  2580 - 259f
 * Misc - Geometric Shapes
 *  25a0 - 25ff
 * Misc - Miscellaneous Symbols
 *  2600 - 267f
 * Misc - Dingbats
 *  2700 - 27bf
 * Misc - Braille Patterns
 *  2800 - 28ff
 * Yi Syllables
 *  a000 - a48f
 * Yi radicals
 *  a490 - a4cf
 * Alphabetic Presentation Forms
 *  fb00 - fb4f
 * Misc - Combining half Marks
 *  fe20 - fe2f
 * Misc - small form variants
 *  fe50 - fe6f
 * Misc - Specials
 *  fff0 - ffff
 *********************************************************************/

// Dimensions of gUnicodeSubrangeTable: 9 sub-tables of 16 entries each,
// one entry per value of a 4-bit nibble of the code point.
static const unsigned cNumSubTables = 9;
static const unsigned cSubTableSize = 16;

// Multi-level lookup table used by findCharUnicodeRange().
// Sub-table 0 is indexed by the top nibble of a 16-bit code point. An entry
// that is a plain range value is the final answer; an entry of the form
// cRangeTableBase+N redirects to sub-table N, which is indexed by the next
// lower nibble. cRangeTertiaryTable redirects to gUnicodeTertiaryRangeTable.
static const unsigned char gUnicodeSubrangeTable[cNumSubTables][cSubTableSize] =
{
    { // table for X---
        cRangeTableBase+1,  // u0xxx
        cRangeTableBase+2,  // u1xxx
        cRangeTableBase+3,  // u2xxx
        cRangeSetCJK,       // u3xxx
        cRangeSetCJK,       // u4xxx
        cRangeSetCJK,       // u5xxx
        cRangeSetCJK,       // u6xxx
        cRangeSetCJK,       // u7xxx
        cRangeSetCJK,       // u8xxx
        cRangeSetCJK,       // u9xxx
        cRangeTableBase+4,  // uaxxx
        cRangeKorean,       // ubxxx
        cRangeKorean,       // ucxxx
        cRangeTableBase+5,  // udxxx
        cRangePrivate,      // uexxx
        cRangeTableBase+6   // ufxxx
    },
    { // table for 0X--
        cRangeSetLatin,          // u00xx
        cRangeSetLatin,          // u01xx
        cRangeSetLatin,          // u02xx
        cRangeGreek,             // u03xx   XXX 0300-036f is in fact cRangeCombiningDiacriticalMarks
        cRangeCyrillic,          // u04xx
        cRangeTableBase+7,       // u05xx, includes Cyrillic supplement, Hebrew, and Armenian
        cRangeArabic,            // u06xx
        cRangeTertiaryTable,     // u07xx
        cRangeUnassigned,        // u08xx
        cRangeTertiaryTable,     // u09xx
        cRangeTertiaryTable,     // u0axx
        cRangeTertiaryTable,     // u0bxx
        cRangeTertiaryTable,     // u0cxx
        cRangeTertiaryTable,     // u0dxx
        cRangeTertiaryTable,     // u0exx
        cRangeTibetan,           // u0fxx
    },
    { // table for 1x--
        cRangeTertiaryTable,     // u10xx
        cRangeKorean,            // u11xx
        cRangeEthiopic,          // u12xx
        cRangeTertiaryTable,     // u13xx
        cRangeCanadian,          // u14xx
        cRangeCanadian,          // u15xx
        cRangeTertiaryTable,     // u16xx
        cRangeKhmer,             // u17xx
        cRangeMongolian,         // u18xx
        cRangeUnassigned,        // u19xx
        cRangeUnassigned,        // u1axx
        cRangeUnassigned,        // u1bxx
        cRangeUnassigned,        // u1cxx
        cRangeUnassigned,        // u1dxx
        cRangeSetLatin,          // u1exx
        cRangeGreek,             // u1fxx
    },
    { // table for 2x--
        cRangeSetLatin,               // u20xx
        cRangeSetLatin,               // u21xx
        cRangeMathOperators,          // u22xx
        cRangeMiscTechnical,          // u23xx
        cRangeControlOpticalEnclose,  // u24xx
        cRangeBoxBlockGeometrics,     // u25xx
        cRangeMiscSymbols,            // u26xx
        cRangeDingbats,               // u27xx
        cRangeBraillePattern,         // u28xx
        cRangeUnassigned,             // u29xx
        cRangeUnassigned,             // u2axx
        cRangeUnassigned,             // u2bxx
        cRangeUnassigned,             // u2cxx
        cRangeUnassigned,             // u2dxx
        cRangeSetCJK,                 // u2exx
        cRangeSetCJK,                 // u2fxx
    },
    { // table for ax--
        cRangeYi,                // ua0xx
        cRangeYi,                // ua1xx
        cRangeYi,                // ua2xx
        cRangeYi,                // ua3xx
        cRangeYi,                // ua4xx
        cRangeUnassigned,        // ua5xx
        cRangeUnassigned,        // ua6xx
        cRangeUnassigned,        // ua7xx
        cRangeUnassigned,        // ua8xx
        cRangeUnassigned,        // ua9xx
        cRangeUnassigned,        // uaaxx
        cRangeUnassigned,        // uabxx
        cRangeKorean,            // uacxx
        cRangeKorean,            // uadxx
        cRangeKorean,            // uaexx
        cRangeKorean,            // uafxx
    },
    { // table for dx--
        cRangeKorean,            // ud0xx
        cRangeKorean,            // ud1xx
        cRangeKorean,            // ud2xx
        cRangeKorean,            // ud3xx
        cRangeKorean,            // ud4xx
        cRangeKorean,            // ud5xx
        cRangeKorean,            // ud6xx
        cRangeKorean,            // ud7xx
        cRangeSurrogate,         // ud8xx
        cRangeSurrogate,         // ud9xx
        cRangeSurrogate,         // udaxx
        cRangeSurrogate,         // udbxx
        cRangeSurrogate,         // udcxx
        cRangeSurrogate,         // uddxx
        cRangeSurrogate,         // udexx
        cRangeSurrogate,         // udfxx
    },
    { // table for fx--
        cRangePrivate,           // uf0xx
        cRangePrivate,           // uf1xx
        cRangePrivate,           // uf2xx
        cRangePrivate,           // uf3xx
        cRangePrivate,           // uf4xx
        cRangePrivate,           // uf5xx
        cRangePrivate,           // uf6xx
        cRangePrivate,           // uf7xx
        cRangePrivate,           // uf8xx
        cRangeSetCJK,            // uf9xx
        cRangeSetCJK,            // ufaxx
        cRangeArabic,            // ufbxx, includes alphabetic presentation forms
        cRangeArabic,            // ufcxx
        cRangeArabic,            // ufdxx
        cRangeArabic,            // ufexx, includes Combining half marks,
                                 //                CJK compatibility forms,
                                 //                small form variants
        cRangeTableBase+8,       // uffxx, halfwidth and fullwidth forms, includes Specials
    },
    { // table for 0x0500 - 0x05ff (indexed by the third nibble)
        cRangeCyrillic,          // u050x
        cRangeCyrillic,          // u051x
        cRangeCyrillic,          // u052x
        cRangeArmenian,          // u053x
        cRangeArmenian,          // u054x
        cRangeArmenian,          // u055x
        cRangeArmenian,          // u056x
        cRangeArmenian,          // u057x
        cRangeArmenian,          // u058x
        cRangeHebrew,            // u059x
        cRangeHebrew,            // u05ax
        cRangeHebrew,            // u05bx
        cRangeHebrew,            // u05cx
        cRangeHebrew,            // u05dx
        cRangeHebrew,            // u05ex
        cRangeHebrew,            // u05fx
    },
    { // table for 0xff00 - 0xffff (indexed by the third nibble)
        cRangeSetCJK,            // uff0x, fullwidth latin
        cRangeSetCJK,            // uff1x, fullwidth latin
        cRangeSetCJK,            // uff2x, fullwidth latin
        cRangeSetCJK,            // uff3x, fullwidth latin
        cRangeSetCJK,            // uff4x, fullwidth latin
        cRangeSetCJK,            // uff5x, fullwidth latin
        cRangeSetCJK,            // uff6x, halfwidth katakana
        cRangeSetCJK,            // uff7x, halfwidth katakana
        cRangeSetCJK,            // uff8x, halfwidth katakana
        cRangeSetCJK,            // uff9x, halfwidth katakana
        cRangeSetCJK,            // uffax, halfwidth hangul jamo
        cRangeSetCJK,            // uffbx, halfwidth hangul jamo
        cRangeSetCJK,            // uffcx, halfwidth hangul jamo
        cRangeSetCJK,            // uffdx, halfwidth hangul jamo
        cRangeSetCJK,            // uffex, fullwidth symbols
        cRangeSpecials,          // ufffx, Specials
    },
};

// Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80)
// code points so that the number of entries in the tertiary range
// table for that range is obtained by dividing (0x1700 - 0x0700) by 128.
// Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal
// syllabics take multiple chunks and Ogham and Runic share a single chunk.
static const unsigned cTertiaryTableSize = ((0x1700 - 0x0700) / 0x80);

// Indexed by (code point - 0x0700) >> 7, i.e. one entry per 128 code points.
// Each entry comment gives the first code point of its 128-code-point chunk.
// Entries marked "place holder" are never reached through the lookup because
// the corresponding uXXxx entry of gUnicodeSubrangeTable already holds a
// final range value; they only keep the index arithmetic uniform.
static const unsigned char gUnicodeTertiaryRangeTable[cTertiaryTableSize] =
{ // table for 0x0700 - 0x16ff
    cRangeSyriac,            // u070x
    cRangeThaana,            // u078x
    cRangeUnassigned,        // u080x  place holder (resolved in the secondary table)
    cRangeUnassigned,        // u088x  place holder (resolved in the secondary table)
    cRangeDevanagari,        // u090x
    cRangeBengali,           // u098x
    cRangeGurmukhi,          // u0a0x
    cRangeGujarati,          // u0a8x
    cRangeOriya,             // u0b0x
    cRangeTamil,             // u0b8x
    cRangeTelugu,            // u0c0x
    cRangeKannada,           // u0c8x
    cRangeMalayalam,         // u0d0x
    cRangeSinhala,           // u0d8x
    cRangeThai,              // u0e0x
    cRangeLao,               // u0e8x
    cRangeTibetan,           // u0f0x  place holder (resolved in the secondary table)
    cRangeTibetan,           // u0f8x  place holder (resolved in the secondary table)
    cRangeMyanmar,           // u100x
    cRangeGeorgian,          // u108x
    cRangeKorean,            // u110x  place holder (resolved in the secondary table)
    cRangeKorean,            // u118x  place holder (resolved in the secondary table)
    cRangeEthiopic,          // u120x  place holder (resolved in the secondary table)
    cRangeEthiopic,          // u128x  place holder (resolved in the secondary table)
    cRangeEthiopic,          // u130x
    cRangeCherokee,          // u138x
    cRangeCanadian,          // u140x  place holder (resolved in the secondary table)
    cRangeCanadian,          // u148x  place holder (resolved in the secondary table)
    cRangeCanadian,          // u150x  place holder (resolved in the secondary table)
    cRangeCanadian,          // u158x  place holder (resolved in the secondary table)
    cRangeCanadian,          // u160x
    cRangeOghamRunic,        // u168x  this contains two scripts, Ogham & Runic
};

// A two level index is almost enough for locating a range, with the
// exception of u03xx and u05xx. Since we don't really care about range for
// combining diacritical marks in our font application, they are
// not discriminated further. Future adoption of this method for other use
// should be aware of this limitation. The implementation can be extended if
// there is such a need.
// For Indic, Southeast Asian scripts and some other scripts between
// U+0700 and U+16FF, it's extended to the third level.
430 unsigned int findCharUnicodeRange(UChar32 ch) 431 { 432 if (ch >= 0xFFFF) 433 return 0; 434 435 unsigned int range; 436 437 //search the first table 438 range = gUnicodeSubrangeTable[0][ch >> 12]; 439 440 if (range < cRangeTableBase) 441 // we try to get a specific range 442 return range; 443 444 // otherwise, we have one more table to look at 445 range = gUnicodeSubrangeTable[range - cRangeTableBase][(ch & 0x0f00) >> 8]; 446 if (range < cRangeTableBase) 447 return range; 448 if (range < cRangeTertiaryTable) 449 return gUnicodeSubrangeTable[range - cRangeTableBase][(ch & 0x00f0) >> 4]; 450 451 // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks 452 return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7]; 453 } 454 455 const char* langGroupFromUnicodeRange(unsigned char unicodeRange) 456 { 457 if (cRangeSpecificItemNum > unicodeRange) 458 return gUnicodeRangeToLangGroupTable[unicodeRange]; 459 return 0; 460 } 461 462 } 463