Home | History | Annotate | Download | only in text
      1 /*
      2  * Copyright (C) 2007 Apple Computer, Inc.
      3  *
      4  * Portions are Copyright (C) 1998 Netscape Communications Corporation.
      5  *
      6  * This library is free software; you can redistribute it and/or
      7  * modify it under the terms of the GNU Lesser General Public
      8  * License as published by the Free Software Foundation; either
      9  * version 2.1 of the License, or (at your option) any later version.
     10  *
     11  * This library is distributed in the hope that it will be useful,
     12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14  * Lesser General Public License for more details.
     15  *
     16  * You should have received a copy of the GNU Lesser General Public
     17  * License along with this library; if not, write to the Free Software
     18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
     19  *
     20  * Alternatively, the contents of this file may be used under the terms
     21  * of either the Mozilla Public License Version 1.1, found at
     22  * http://www.mozilla.org/MPL/ (the "MPL") or the GNU General Public
     23  * License Version 2.0, found at http://www.fsf.org/copyleft/gpl.html
     24  * (the "GPL"), in which case the provisions of the MPL or the GPL are
     25  * applicable instead of those above.  If you wish to allow use of your
     26  * version of this file only under the terms of one of those two
     27  * licenses (the MPL or the GPL) and not to allow others to use your
     28  * version of this file under the LGPL, indicate your decision by
     29  * deletingthe provisions above and replace them with the notice and
     30  * other provisions required by the MPL or the GPL, as the case may be.
     31  * If you do not delete the provisions above, a recipient may use your
     32  * version of this file under any of the LGPL, the MPL or the GPL.
     33  */
     34 
     35 #include "config.h"
     36 #include "UnicodeRange.h"
     37 
     38 namespace WebCore {
     39 
     40 // This table depends on unicode range definitions.
     41 // Each item's index must correspond to a unicode range value
     42 // eg. x-cyrillic = LangGroupTable[cRangeCyrillic]
     43 static const char* gUnicodeRangeToLangGroupTable[] =
     44 {
     45   "x-cyrillic",
     46   "el",
     47   "tr",
     48   "he",
     49   "ar",
     50   "x-baltic",
     51   "th",
     52   "ko",
     53   "ja",
     54   "zh-CN",
     55   "zh-TW",
     56   "x-devanagari",
     57   "x-tamil",
     58   "x-armn",
     59   "x-beng",
     60   "x-cans",
     61   "x-ethi",
     62   "x-geor",
     63   "x-gujr",
     64   "x-guru",
     65   "x-khmr",
     66   "x-mlym"
     67 };
     68 
     69 /**********************************************************************
     70  * Unicode subranges as defined in unicode 3.0
     71  * x-western, x-central-euro, tr, x-baltic  -> latin
     72  *  0000 - 036f
     73  *  1e00 - 1eff
     74  *  2000 - 206f  (general punctuation)
     75  *  20a0 - 20cf  (currency symbols)
     76  *  2100 - 214f  (letterlike symbols)
     77  *  2150 - 218f  (Number Forms)
     78  * el         -> greek
     79  *  0370 - 03ff
     80  *  1f00 - 1fff
     81  * x-cyrillic -> cyrillic
     82  *  0400 - 04ff
     83  * he         -> hebrew
     84  *  0590 - 05ff
     85  * ar         -> arabic
     86  *  0600 - 06ff
     87  *  fb50 - fdff (arabic presentation forms)
     88  *  fe70 - feff (arabic presentation forms b)
     89  * th - thai
     90  *  0e00 - 0e7f
     91  * ko        -> korean
     92  *  ac00 - d7af  (hangul Syllables)
     93  *  1100 - 11ff    (jamo)
     94  *  3130 - 318f (hangul compatibility jamo)
     95  * ja
     96  *  3040 - 309f (hiragana)
     97  *  30a0 - 30ff (katakana)
     98  * zh-CN
     99  * zh-TW
    100  *
    101  * CJK
    102  *  3100 - 312f (bopomofo)
    103  *  31a0 - 31bf (bopomofo extended)
    104  *  3000 - 303f (CJK Symbols and Punctuation)
    105  *  2e80 - 2eff (CJK radicals supplement)
    106  *  2f00 - 2fdf (Kangxi Radicals)
    107  *  2ff0 - 2fff (Ideographic Description Characters)
    108  *  3190 - 319f (kanbun)
    109  *  3200 - 32ff (Enclosed CJK letters and Months)
    110  *  3300 - 33ff (CJK compatibility)
    111  *  3400 - 4dbf (CJK Unified Ideographs Extension A)
    112  *  4e00 - 9faf (CJK Unified Ideographs)
    113  *  f900 - fa5f (CJK Compatibility Ideographs)
    114  *  fe30 - fe4f (CJK compatibility Forms)
    115  *  ff00 - ffef (halfwidth and fullwidth forms)
    116  *
    117  * Armenian
    118  *  0530 - 058f
    119  * Syriac
    120  *  0700 - 074f
    121  * Thaana
    122  *  0780 - 07bf
    123  * Devanagari
    124  *  0900 - 097f
    125  * Bengali
    126  *  0980 - 09ff
    127  * Gurmukhi
    128  *  0a00 - 0a7f
    129  * Gujarati
    130  *  0a80 - 0aff
    131  * Oriya
    132  *  0b00 - 0b7f
    133  * Tamil
    134  *  0b80 - 0bff
    135  * Telugu
    136  *  0c00 - 0c7f
    137  * Kannada
    138  *  0c80 - 0cff
    139  * Malayalam
    140  *  0d00 - 0d7f
    141  * Sinhala
    142  *  0d80 - 0def
    143  * Lao
    144  *  0e80 - 0eff
    145  * Tibetan
    146  *  0f00 - 0fbf
    147  * Myanmar
    148  *  1000 - 109f
    149  * Georgian
    150  *  10a0 - 10ff
    151  * Ethiopic
    152  *  1200 - 137f
    153  * Cherokee
    154  *  13a0 - 13ff
    155  * Canadian Aboriginal Syllabics
    156  *  1400 - 167f
    157  * Ogham
    158  *  1680 - 169f
    159  * Runic
    160  *  16a0 - 16ff
    161  * Khmer
    162  *  1780 - 17ff
    163  * Mongolian
    164  *  1800 - 18af
    165  * Misc - superscripts and subscripts
    166  *  2070 - 209f
    167  * Misc - Combining Diacritical Marks for Symbols
    168  *  20d0 - 20ff
    169  * Misc - Arrows
    170  *  2190 - 21ff
    171  * Misc - Mathematical Operators
    172  *  2200 - 22ff
    173  * Misc - Miscellaneous Technical
    174  *  2300 - 23ff
    175  * Misc - Control picture
    176  *  2400 - 243f
    177  * Misc - Optical character recognition
    178  *  2440 - 2450
    179  * Misc - Enclosed Alphanumerics
    180  *  2460 - 24ff
    181  * Misc - Box Drawing
    182  *  2500 - 257f
    183  * Misc - Block Elements
    184  *  2580 - 259f
    185  * Misc - Geometric Shapes
    186  *  25a0 - 25ff
    187  * Misc - Miscellaneous Symbols
    188  *  2600 - 267f
    189  * Misc - Dingbats
    190  *  2700 - 27bf
    191  * Misc - Braille Patterns
    192  *  2800 - 28ff
    193  * Yi Syllables
    194  *  a000 - a48f
    195  * Yi radicals
    196  *  a490 - a4cf
    197  * Alphabetic Presentation Forms
    198  *  fb00 - fb4f
    199  * Misc - Combining half Marks
    200  *  fe20 - fe2f
    201  * Misc - small form variants
    202  *  fe50 - fe6f
    203  * Misc - Specials
    204  *  fff0 - ffff
    205  *********************************************************************/
    206 
    207 static const unsigned cNumSubTables = 9;
    208 static const unsigned cSubTableSize = 16;
    209 
    210 static const unsigned char gUnicodeSubrangeTable[cNumSubTables][cSubTableSize] =
    211 {
    212   { // table for X---
    213     cRangeTableBase+1,  //u0xxx
    214     cRangeTableBase+2,  //u1xxx
    215     cRangeTableBase+3,  //u2xxx
    216     cRangeSetCJK,       //u3xxx
    217     cRangeSetCJK,       //u4xxx
    218     cRangeSetCJK,       //u5xxx
    219     cRangeSetCJK,       //u6xxx
    220     cRangeSetCJK,       //u7xxx
    221     cRangeSetCJK,       //u8xxx
    222     cRangeSetCJK,       //u9xxx
    223     cRangeTableBase+4,  //uaxxx
    224     cRangeKorean,       //ubxxx
    225     cRangeKorean,       //ucxxx
    226     cRangeTableBase+5,  //udxxx
    227     cRangePrivate,      //uexxx
    228     cRangeTableBase+6   //ufxxx
    229   },
    230   { //table for 0X--
    231     cRangeSetLatin,          //u00xx
    232     cRangeSetLatin,          //u01xx
    233     cRangeSetLatin,          //u02xx
    234     cRangeGreek,             //u03xx     XXX 0300-036f is in fact cRangeCombiningDiacriticalMarks
    235     cRangeCyrillic,          //u04xx
    236     cRangeTableBase+7,       //u05xx, includes Cyrillic supplement, Hebrew, and Armenian
    237     cRangeArabic,            //u06xx
    238     cRangeTertiaryTable,     //u07xx
    239     cRangeUnassigned,        //u08xx
    240     cRangeTertiaryTable,     //u09xx
    241     cRangeTertiaryTable,     //u0axx
    242     cRangeTertiaryTable,     //u0bxx
    243     cRangeTertiaryTable,     //u0cxx
    244     cRangeTertiaryTable,     //u0dxx
    245     cRangeTertiaryTable,     //u0exx
    246     cRangeTibetan,           //u0fxx
    247   },
    248   { //table for 1x--
    249     cRangeTertiaryTable,     //u10xx
    250     cRangeKorean,            //u11xx
    251     cRangeEthiopic,          //u12xx
    252     cRangeTertiaryTable,     //u13xx
    253     cRangeCanadian,          //u14xx
    254     cRangeCanadian,          //u15xx
    255     cRangeTertiaryTable,     //u16xx
    256     cRangeKhmer,             //u17xx
    257     cRangeMongolian,         //u18xx
    258     cRangeUnassigned,        //u19xx
    259     cRangeUnassigned,        //u1axx
    260     cRangeUnassigned,        //u1bxx
    261     cRangeUnassigned,        //u1cxx
    262     cRangeUnassigned,        //u1dxx
    263     cRangeSetLatin,          //u1exx
    264     cRangeGreek,             //u1fxx
    265   },
    266   { //table for 2x--
    267     cRangeSetLatin,          //u20xx
    268     cRangeSetLatin,          //u21xx
    269     cRangeMathOperators,     //u22xx
    270     cRangeMiscTechnical,     //u23xx
    271     cRangeControlOpticalEnclose, //u24xx
    272     cRangeBoxBlockGeometrics, //u25xx
    273     cRangeMiscSymbols,       //u26xx
    274     cRangeDingbats,          //u27xx
    275     cRangeBraillePattern,    //u28xx
    276     cRangeUnassigned,        //u29xx
    277     cRangeUnassigned,        //u2axx
    278     cRangeUnassigned,        //u2bxx
    279     cRangeUnassigned,        //u2cxx
    280     cRangeUnassigned,        //u2dxx
    281     cRangeSetCJK,            //u2exx
    282     cRangeSetCJK,            //u2fxx
    283   },
    284   {  //table for ax--
    285     cRangeYi,                //ua0xx
    286     cRangeYi,                //ua1xx
    287     cRangeYi,                //ua2xx
    288     cRangeYi,                //ua3xx
    289     cRangeYi,                //ua4xx
    290     cRangeUnassigned,        //ua5xx
    291     cRangeUnassigned,        //ua6xx
    292     cRangeUnassigned,        //ua7xx
    293     cRangeUnassigned,        //ua8xx
    294     cRangeUnassigned,        //ua9xx
    295     cRangeUnassigned,        //uaaxx
    296     cRangeUnassigned,        //uabxx
    297     cRangeKorean,            //uacxx
    298     cRangeKorean,            //uadxx
    299     cRangeKorean,            //uaexx
    300     cRangeKorean,            //uafxx
    301   },
    302   {  //table for dx--
    303     cRangeKorean,            //ud0xx
    304     cRangeKorean,            //ud1xx
    305     cRangeKorean,            //ud2xx
    306     cRangeKorean,            //ud3xx
    307     cRangeKorean,            //ud4xx
    308     cRangeKorean,            //ud5xx
    309     cRangeKorean,            //ud6xx
    310     cRangeKorean,            //ud7xx
    311     cRangeSurrogate,         //ud8xx
    312     cRangeSurrogate,         //ud9xx
    313     cRangeSurrogate,         //udaxx
    314     cRangeSurrogate,         //udbxx
    315     cRangeSurrogate,         //udcxx
    316     cRangeSurrogate,         //uddxx
    317     cRangeSurrogate,         //udexx
    318     cRangeSurrogate,         //udfxx
    319   },
    320   { // table for fx--
    321     cRangePrivate,           //uf0xx
    322     cRangePrivate,           //uf1xx
    323     cRangePrivate,           //uf2xx
    324     cRangePrivate,           //uf3xx
    325     cRangePrivate,           //uf4xx
    326     cRangePrivate,           //uf5xx
    327     cRangePrivate,           //uf6xx
    328     cRangePrivate,           //uf7xx
    329     cRangePrivate,           //uf8xx
    330     cRangeSetCJK,            //uf9xx
    331     cRangeSetCJK,            //ufaxx
    332     cRangeArabic,            //ufbxx, includes alphabic presentation form
    333     cRangeArabic,            //ufcxx
    334     cRangeArabic,            //ufdxx
    335     cRangeArabic,            //ufexx, includes Combining half marks,
    336                              //                CJK compatibility forms,
    337                              //                CJK compatibility forms,
    338                              //                small form variants
    339     cRangeTableBase+8,       //uffxx, halfwidth and fullwidth forms, includes Specials
    340   },
    341   { //table for 0x0500 - 0x05ff
    342     cRangeCyrillic,          //u050x
    343     cRangeCyrillic,          //u051x
    344     cRangeCyrillic,          //u052x
    345     cRangeArmenian,          //u053x
    346     cRangeArmenian,          //u054x
    347     cRangeArmenian,          //u055x
    348     cRangeArmenian,          //u056x
    349     cRangeArmenian,          //u057x
    350     cRangeArmenian,          //u058x
    351     cRangeHebrew,            //u059x
    352     cRangeHebrew,            //u05ax
    353     cRangeHebrew,            //u05bx
    354     cRangeHebrew,            //u05cx
    355     cRangeHebrew,            //u05dx
    356     cRangeHebrew,            //u05ex
    357     cRangeHebrew,            //u05fx
    358   },
    359   { //table for 0xff00 - 0xffff
    360     cRangeSetCJK,            //uff0x, fullwidth latin
    361     cRangeSetCJK,            //uff1x, fullwidth latin
    362     cRangeSetCJK,            //uff2x, fullwidth latin
    363     cRangeSetCJK,            //uff3x, fullwidth latin
    364     cRangeSetCJK,            //uff4x, fullwidth latin
    365     cRangeSetCJK,            //uff5x, fullwidth latin
    366     cRangeSetCJK,            //uff6x, halfwidth katakana
    367     cRangeSetCJK,            //uff7x, halfwidth katakana
    368     cRangeSetCJK,            //uff8x, halfwidth katakana
    369     cRangeSetCJK,            //uff9x, halfwidth katakana
    370     cRangeSetCJK,            //uffax, halfwidth hangul jamo
    371     cRangeSetCJK,            //uffbx, halfwidth hangul jamo
    372     cRangeSetCJK,            //uffcx, halfwidth hangul jamo
    373     cRangeSetCJK,            //uffdx, halfwidth hangul jamo
    374     cRangeSetCJK,            //uffex, fullwidth symbols
    375     cRangeSpecials,          //ufffx, Specials
    376   },
    377 };
    378 
    379 // Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80)
    380 // code points so that the number of entries in the tertiary range
    381 // table for that range is obtained by dividing (0x1700 - 0x0700) by 128.
    382 // Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal
    383 // syllabaries take multiple chunks and Ogham and Runic share a single chunk.
    384 static const unsigned cTertiaryTableSize = ((0x1700 - 0x0700) / 0x80);
    385 
    386 static const unsigned char gUnicodeTertiaryRangeTable[cTertiaryTableSize] =
    387 { //table for 0x0700 - 0x1600
    388     cRangeSyriac,            //u070x
    389     cRangeThaana,            //u078x
    390     cRangeUnassigned,        //u080x  place holder(resolved in the 2ndary tab.)
    391     cRangeUnassigned,        //u088x  place holder(resolved in the 2ndary tab.)
    392     cRangeDevanagari,        //u090x
    393     cRangeBengali,           //u098x
    394     cRangeGurmukhi,          //u0a0x
    395     cRangeGujarati,          //u0a8x
    396     cRangeOriya,             //u0b0x
    397     cRangeTamil,             //u0b8x
    398     cRangeTelugu,            //u0c0x
    399     cRangeKannada,           //u0c8x
    400     cRangeMalayalam,         //u0d0x
    401     cRangeSinhala,           //u0d8x
    402     cRangeThai,              //u0e0x
    403     cRangeLao,               //u0e8x
    404     cRangeTibetan,           //u0f0x  place holder(resolved in the 2ndary tab.)
    405     cRangeTibetan,           //u0f8x  place holder(resolved in the 2ndary tab.)
    406     cRangeMyanmar,           //u100x
    407     cRangeGeorgian,          //u108x
    408     cRangeKorean,            //u110x  place holder(resolved in the 2ndary tab.)
    409     cRangeKorean,            //u118x  place holder(resolved in the 2ndary tab.)
    410     cRangeEthiopic,          //u120x  place holder(resolved in the 2ndary tab.)
    411     cRangeEthiopic,          //u128x  place holder(resolved in the 2ndary tab.)
    412     cRangeEthiopic,          //u130x
    413     cRangeCherokee,          //u138x
    414     cRangeCanadian,          //u140x  place holder(resolved in the 2ndary tab.)
    415     cRangeCanadian,          //u148x  place holder(resolved in the 2ndary tab.)
    416     cRangeCanadian,          //u150x  place holder(resolved in the 2ndary tab.)
    417     cRangeCanadian,          //u158x  place holder(resolved in the 2ndary tab.)
    418     cRangeCanadian,          //u160x
    419     cRangeOghamRunic,        //u168x  this contains two scripts, Ogham & Runic
    420 };
    421 
    422 // A two level index is almost enough for locating a range, with the
    423 // exception of u03xx and u05xx. Since we don't really care about range for
    424 // combining diacritical marks in our font application, they are
    425 // not discriminated further.  Future adoption of this method for other use
    426 // should be aware of this limitation. The implementation can be extended if
    427 // there is such a need.
    428 // For Indic, Southeast Asian scripts and some other scripts between
    429 // U+0700 and U+16FF, it's extended to the third level.
    430 unsigned int findCharUnicodeRange(UChar32 ch)
    431 {
    432     if (ch >= 0xFFFF)
    433         return 0;
    434 
    435     unsigned int range;
    436 
    437     //search the first table
    438     range = gUnicodeSubrangeTable[0][ch >> 12];
    439 
    440     if (range < cRangeTableBase)
    441         // we try to get a specific range
    442         return range;
    443 
    444     // otherwise, we have one more table to look at
    445     range = gUnicodeSubrangeTable[range - cRangeTableBase][(ch & 0x0f00) >> 8];
    446     if (range < cRangeTableBase)
    447         return range;
    448     if (range < cRangeTertiaryTable)
    449         return gUnicodeSubrangeTable[range - cRangeTableBase][(ch & 0x00f0) >> 4];
    450 
    451     // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks
    452     return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7];
    453 }
    454 
    455 const char* langGroupFromUnicodeRange(unsigned char unicodeRange)
    456 {
    457     if (cRangeSpecificItemNum > unicodeRange)
    458         return gUnicodeRangeToLangGroupTable[unicodeRange];
    459     return 0;
    460 }
    461 
    462 }
    463