Home | History | Annotate | Download | only in text
      1 /*
      2  * Copyright (C) 2011 Google Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions are
      6  * met:
      7  *
      8  *     * Redistributions of source code must retain the above copyright
      9  * notice, this list of conditions and the following disclaimer.
     10  *     * Redistributions in binary form must reproduce the above
     11  * copyright notice, this list of conditions and the following disclaimer
     12  * in the documentation and/or other materials provided with the
     13  * distribution.
     14  *     * Neither the name of Google Inc. nor the names of its
     15  * contributors may be used to endorse or promote products derived from
     16  * this software without specific prior written permission.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 #include "config.h"
     32 #include "platform/text/LocaleToScriptMapping.h"
     33 
     34 #include "wtf/HashMap.h"
     35 #include "wtf/HashSet.h"
     36 #include "wtf/text/StringHash.h"
     37 
     38 namespace WebCore {
     39 
     40 UScriptCode scriptNameToCode(const String& scriptName)
     41 {
     42     struct ScriptNameCode {
     43         const char* name;
     44         UScriptCode code;
     45     };
     46 
     47     // This generally maps an ISO 15924 script code to its UScriptCode, but certain families of script codes are
     48     // treated as a single script for assigning a per-script font in Settings. For example, "hira" is mapped to
     49     // USCRIPT_KATAKANA_OR_HIRAGANA instead of USCRIPT_HIRAGANA, since we want all Japanese scripts to be rendered
     50     // using the same font setting.
     51     static const ScriptNameCode scriptNameCodeList[] = {
     52         { "zyyy", USCRIPT_COMMON },
     53         { "qaai", USCRIPT_INHERITED },
     54         { "arab", USCRIPT_ARABIC },
     55         { "armn", USCRIPT_ARMENIAN },
     56         { "beng", USCRIPT_BENGALI },
     57         { "bopo", USCRIPT_BOPOMOFO },
     58         { "cher", USCRIPT_CHEROKEE },
     59         { "copt", USCRIPT_COPTIC },
     60         { "cyrl", USCRIPT_CYRILLIC },
     61         { "dsrt", USCRIPT_DESERET },
     62         { "deva", USCRIPT_DEVANAGARI },
     63         { "ethi", USCRIPT_ETHIOPIC },
     64         { "geor", USCRIPT_GEORGIAN },
     65         { "goth", USCRIPT_GOTHIC },
     66         { "grek", USCRIPT_GREEK },
     67         { "gujr", USCRIPT_GUJARATI },
     68         { "guru", USCRIPT_GURMUKHI },
     69         { "hani", USCRIPT_HAN },
     70         { "hang", USCRIPT_HANGUL },
     71         { "hebr", USCRIPT_HEBREW },
     72         { "hira", USCRIPT_KATAKANA_OR_HIRAGANA },
     73         { "knda", USCRIPT_KANNADA },
     74         { "kana", USCRIPT_KATAKANA_OR_HIRAGANA },
     75         { "khmr", USCRIPT_KHMER },
     76         { "laoo", USCRIPT_LAO },
     77         { "latn", USCRIPT_LATIN },
     78         { "mlym", USCRIPT_MALAYALAM },
     79         { "mong", USCRIPT_MONGOLIAN },
     80         { "mymr", USCRIPT_MYANMAR },
     81         { "ogam", USCRIPT_OGHAM },
     82         { "ital", USCRIPT_OLD_ITALIC },
     83         { "orya", USCRIPT_ORIYA },
     84         { "runr", USCRIPT_RUNIC },
     85         { "sinh", USCRIPT_SINHALA },
     86         { "syrc", USCRIPT_SYRIAC },
     87         { "taml", USCRIPT_TAMIL },
     88         { "telu", USCRIPT_TELUGU },
     89         { "thaa", USCRIPT_THAANA },
     90         { "thai", USCRIPT_THAI },
     91         { "tibt", USCRIPT_TIBETAN },
     92         { "cans", USCRIPT_CANADIAN_ABORIGINAL },
     93         { "yiii", USCRIPT_YI },
     94         { "tglg", USCRIPT_TAGALOG },
     95         { "hano", USCRIPT_HANUNOO },
     96         { "buhd", USCRIPT_BUHID },
     97         { "tagb", USCRIPT_TAGBANWA },
     98         { "brai", USCRIPT_BRAILLE },
     99         { "cprt", USCRIPT_CYPRIOT },
    100         { "limb", USCRIPT_LIMBU },
    101         { "linb", USCRIPT_LINEAR_B },
    102         { "osma", USCRIPT_OSMANYA },
    103         { "shaw", USCRIPT_SHAVIAN },
    104         { "tale", USCRIPT_TAI_LE },
    105         { "ugar", USCRIPT_UGARITIC },
    106         { "hrkt", USCRIPT_KATAKANA_OR_HIRAGANA },
    107         { "bugi", USCRIPT_BUGINESE },
    108         { "glag", USCRIPT_GLAGOLITIC },
    109         { "khar", USCRIPT_KHAROSHTHI },
    110         { "sylo", USCRIPT_SYLOTI_NAGRI },
    111         { "talu", USCRIPT_NEW_TAI_LUE },
    112         { "tfng", USCRIPT_TIFINAGH },
    113         { "xpeo", USCRIPT_OLD_PERSIAN },
    114         { "bali", USCRIPT_BALINESE },
    115         { "batk", USCRIPT_BATAK },
    116         { "blis", USCRIPT_BLISSYMBOLS },
    117         { "brah", USCRIPT_BRAHMI },
    118         { "cham", USCRIPT_CHAM },
    119         { "cirt", USCRIPT_CIRTH },
    120         { "cyrs", USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC },
    121         { "egyd", USCRIPT_DEMOTIC_EGYPTIAN },
    122         { "egyh", USCRIPT_HIERATIC_EGYPTIAN },
    123         { "egyp", USCRIPT_EGYPTIAN_HIEROGLYPHS },
    124         { "geok", USCRIPT_KHUTSURI },
    125         { "hans", USCRIPT_SIMPLIFIED_HAN },
    126         { "hant", USCRIPT_TRADITIONAL_HAN },
    127         { "hmng", USCRIPT_PAHAWH_HMONG },
    128         { "hung", USCRIPT_OLD_HUNGARIAN },
    129         { "inds", USCRIPT_HARAPPAN_INDUS },
    130         { "java", USCRIPT_JAVANESE },
    131         { "kali", USCRIPT_KAYAH_LI },
    132         { "latf", USCRIPT_LATIN_FRAKTUR },
    133         { "latg", USCRIPT_LATIN_GAELIC },
    134         { "lepc", USCRIPT_LEPCHA },
    135         { "lina", USCRIPT_LINEAR_A },
    136         { "mand", USCRIPT_MANDAEAN },
    137         { "maya", USCRIPT_MAYAN_HIEROGLYPHS },
    138         { "mero", USCRIPT_MEROITIC },
    139         { "nkoo", USCRIPT_NKO },
    140         { "orkh", USCRIPT_ORKHON },
    141         { "perm", USCRIPT_OLD_PERMIC },
    142         { "phag", USCRIPT_PHAGS_PA },
    143         { "phnx", USCRIPT_PHOENICIAN },
    144         { "plrd", USCRIPT_PHONETIC_POLLARD },
    145         { "roro", USCRIPT_RONGORONGO },
    146         { "sara", USCRIPT_SARATI },
    147         { "syre", USCRIPT_ESTRANGELO_SYRIAC },
    148         { "syrj", USCRIPT_WESTERN_SYRIAC },
    149         { "syrn", USCRIPT_EASTERN_SYRIAC },
    150         { "teng", USCRIPT_TENGWAR },
    151         { "vaii", USCRIPT_VAI },
    152         { "visp", USCRIPT_VISIBLE_SPEECH },
    153         { "xsux", USCRIPT_CUNEIFORM },
    154         { "jpan", USCRIPT_KATAKANA_OR_HIRAGANA },
    155         { "kore", USCRIPT_HANGUL },
    156         { "zxxx", USCRIPT_UNWRITTEN_LANGUAGES },
    157         { "zzzz", USCRIPT_UNKNOWN }
    158     };
    159 
    160     typedef HashMap<String, UScriptCode> ScriptNameCodeMap;
    161     DEFINE_STATIC_LOCAL(ScriptNameCodeMap, scriptNameCodeMap, ());
    162     if (scriptNameCodeMap.isEmpty()) {
    163         for (size_t i = 0; i < sizeof(scriptNameCodeList) / sizeof(scriptNameCodeList[0]); ++i)
    164             scriptNameCodeMap.set(scriptNameCodeList[i].name, scriptNameCodeList[i].code);
    165     }
    166 
    167     HashMap<String, UScriptCode>::iterator it = scriptNameCodeMap.find(scriptName.lower());
    168     if (it != scriptNameCodeMap.end())
    169         return it->value;
    170     return USCRIPT_INVALID_CODE;
    171 }
    172 
    173 UScriptCode localeToScriptCodeForFontSelection(const String& locale)
    174 {
    175     struct LocaleScript {
    176         const char* locale;
    177         UScriptCode script;
    178     };
    179 
    180     static const LocaleScript localeScriptList[] = {
    181         { "aa", USCRIPT_LATIN },
    182         { "ab", USCRIPT_CYRILLIC },
    183         { "ady", USCRIPT_CYRILLIC },
    184         { "af", USCRIPT_LATIN },
    185         { "ak", USCRIPT_LATIN },
    186         { "am", USCRIPT_ETHIOPIC },
    187         { "ar", USCRIPT_ARABIC },
    188         { "as", USCRIPT_BENGALI },
    189         { "ast", USCRIPT_LATIN },
    190         { "av", USCRIPT_CYRILLIC },
    191         { "ay", USCRIPT_LATIN },
    192         { "az", USCRIPT_LATIN },
    193         { "ba", USCRIPT_CYRILLIC },
    194         { "be", USCRIPT_CYRILLIC },
    195         { "bg", USCRIPT_CYRILLIC },
    196         { "bi", USCRIPT_LATIN },
    197         { "bn", USCRIPT_BENGALI },
    198         { "bo", USCRIPT_TIBETAN },
    199         { "bs", USCRIPT_LATIN },
    200         { "ca", USCRIPT_LATIN },
    201         { "ce", USCRIPT_CYRILLIC },
    202         { "ceb", USCRIPT_LATIN },
    203         { "ch", USCRIPT_LATIN },
    204         { "chk", USCRIPT_LATIN },
    205         { "cs", USCRIPT_LATIN },
    206         { "cy", USCRIPT_LATIN },
    207         { "da", USCRIPT_LATIN },
    208         { "de", USCRIPT_LATIN },
    209         { "dv", USCRIPT_THAANA },
    210         { "dz", USCRIPT_TIBETAN },
    211         { "ee", USCRIPT_LATIN },
    212         { "efi", USCRIPT_LATIN },
    213         { "el", USCRIPT_GREEK },
    214         { "en", USCRIPT_LATIN },
    215         { "es", USCRIPT_LATIN },
    216         { "et", USCRIPT_LATIN },
    217         { "eu", USCRIPT_LATIN },
    218         { "fa", USCRIPT_ARABIC },
    219         { "fi", USCRIPT_LATIN },
    220         { "fil", USCRIPT_LATIN },
    221         { "fj", USCRIPT_LATIN },
    222         { "fo", USCRIPT_LATIN },
    223         { "fr", USCRIPT_LATIN },
    224         { "fur", USCRIPT_LATIN },
    225         { "fy", USCRIPT_LATIN },
    226         { "ga", USCRIPT_LATIN },
    227         { "gaa", USCRIPT_LATIN },
    228         { "gd", USCRIPT_LATIN },
    229         { "gil", USCRIPT_LATIN },
    230         { "gl", USCRIPT_LATIN },
    231         { "gn", USCRIPT_LATIN },
    232         { "gsw", USCRIPT_LATIN },
    233         { "gu", USCRIPT_GUJARATI },
    234         { "ha", USCRIPT_LATIN },
    235         { "haw", USCRIPT_LATIN },
    236         { "he", USCRIPT_HEBREW },
    237         { "hi", USCRIPT_DEVANAGARI },
    238         { "hil", USCRIPT_LATIN },
    239         { "ho", USCRIPT_LATIN },
    240         { "hr", USCRIPT_LATIN },
    241         { "ht", USCRIPT_LATIN },
    242         { "hu", USCRIPT_LATIN },
    243         { "hy", USCRIPT_ARMENIAN },
    244         { "id", USCRIPT_LATIN },
    245         { "ig", USCRIPT_LATIN },
    246         { "ii", USCRIPT_YI },
    247         { "ilo", USCRIPT_LATIN },
    248         { "inh", USCRIPT_CYRILLIC },
    249         { "is", USCRIPT_LATIN },
    250         { "it", USCRIPT_LATIN },
    251         { "iu", USCRIPT_CANADIAN_ABORIGINAL },
    252         { "ja", USCRIPT_KATAKANA_OR_HIRAGANA },
    253         { "jv", USCRIPT_LATIN },
    254         { "ka", USCRIPT_GEORGIAN },
    255         { "kaj", USCRIPT_LATIN },
    256         { "kam", USCRIPT_LATIN },
    257         { "kbd", USCRIPT_CYRILLIC },
    258         { "kha", USCRIPT_LATIN },
    259         { "kk", USCRIPT_CYRILLIC },
    260         { "kl", USCRIPT_LATIN },
    261         { "km", USCRIPT_KHMER },
    262         { "kn", USCRIPT_KANNADA },
    263         { "ko", USCRIPT_HANGUL },
    264         { "kok", USCRIPT_DEVANAGARI },
    265         { "kos", USCRIPT_LATIN },
    266         { "kpe", USCRIPT_LATIN },
    267         { "krc", USCRIPT_CYRILLIC },
    268         { "ks", USCRIPT_ARABIC },
    269         { "ku", USCRIPT_ARABIC },
    270         { "kum", USCRIPT_CYRILLIC },
    271         { "ky", USCRIPT_CYRILLIC },
    272         { "la", USCRIPT_LATIN },
    273         { "lah", USCRIPT_ARABIC },
    274         { "lb", USCRIPT_LATIN },
    275         { "lez", USCRIPT_CYRILLIC },
    276         { "ln", USCRIPT_LATIN },
    277         { "lo", USCRIPT_LAO },
    278         { "lt", USCRIPT_LATIN },
    279         { "lv", USCRIPT_LATIN },
    280         { "mai", USCRIPT_DEVANAGARI },
    281         { "mdf", USCRIPT_CYRILLIC },
    282         { "mg", USCRIPT_LATIN },
    283         { "mh", USCRIPT_LATIN },
    284         { "mi", USCRIPT_LATIN },
    285         { "mk", USCRIPT_CYRILLIC },
    286         { "ml", USCRIPT_MALAYALAM },
    287         { "mn", USCRIPT_CYRILLIC },
    288         { "mr", USCRIPT_DEVANAGARI },
    289         { "ms", USCRIPT_LATIN },
    290         { "mt", USCRIPT_LATIN },
    291         { "my", USCRIPT_MYANMAR },
    292         { "myv", USCRIPT_CYRILLIC },
    293         { "na", USCRIPT_LATIN },
    294         { "nb", USCRIPT_LATIN },
    295         { "ne", USCRIPT_DEVANAGARI },
    296         { "niu", USCRIPT_LATIN },
    297         { "nl", USCRIPT_LATIN },
    298         { "nn", USCRIPT_LATIN },
    299         { "nr", USCRIPT_LATIN },
    300         { "nso", USCRIPT_LATIN },
    301         { "ny", USCRIPT_LATIN },
    302         { "oc", USCRIPT_LATIN },
    303         { "om", USCRIPT_LATIN },
    304         { "or", USCRIPT_ORIYA },
    305         { "os", USCRIPT_CYRILLIC },
    306         { "pa", USCRIPT_GURMUKHI },
    307         { "pag", USCRIPT_LATIN },
    308         { "pap", USCRIPT_LATIN },
    309         { "pau", USCRIPT_LATIN },
    310         { "pl", USCRIPT_LATIN },
    311         { "pon", USCRIPT_LATIN },
    312         { "ps", USCRIPT_ARABIC },
    313         { "pt", USCRIPT_LATIN },
    314         { "qu", USCRIPT_LATIN },
    315         { "rm", USCRIPT_LATIN },
    316         { "rn", USCRIPT_LATIN },
    317         { "ro", USCRIPT_LATIN },
    318         { "ru", USCRIPT_CYRILLIC },
    319         { "rw", USCRIPT_LATIN },
    320         { "sa", USCRIPT_DEVANAGARI },
    321         { "sah", USCRIPT_CYRILLIC },
    322         { "sat", USCRIPT_LATIN },
    323         { "sd", USCRIPT_ARABIC },
    324         { "se", USCRIPT_LATIN },
    325         { "sg", USCRIPT_LATIN },
    326         { "si", USCRIPT_SINHALA },
    327         { "sid", USCRIPT_LATIN },
    328         { "sk", USCRIPT_LATIN },
    329         { "sl", USCRIPT_LATIN },
    330         { "sm", USCRIPT_LATIN },
    331         { "so", USCRIPT_LATIN },
    332         { "sq", USCRIPT_LATIN },
    333         { "sr", USCRIPT_CYRILLIC },
    334         { "ss", USCRIPT_LATIN },
    335         { "st", USCRIPT_LATIN },
    336         { "su", USCRIPT_LATIN },
    337         { "sv", USCRIPT_LATIN },
    338         { "sw", USCRIPT_LATIN },
    339         { "ta", USCRIPT_TAMIL },
    340         { "te", USCRIPT_TELUGU },
    341         { "tet", USCRIPT_LATIN },
    342         { "tg", USCRIPT_CYRILLIC },
    343         { "th", USCRIPT_THAI },
    344         { "ti", USCRIPT_ETHIOPIC },
    345         { "tig", USCRIPT_ETHIOPIC },
    346         { "tk", USCRIPT_LATIN },
    347         { "tkl", USCRIPT_LATIN },
    348         { "tl", USCRIPT_LATIN },
    349         { "tn", USCRIPT_LATIN },
    350         { "to", USCRIPT_LATIN },
    351         { "tpi", USCRIPT_LATIN },
    352         { "tr", USCRIPT_LATIN },
    353         { "trv", USCRIPT_LATIN },
    354         { "ts", USCRIPT_LATIN },
    355         { "tt", USCRIPT_CYRILLIC },
    356         { "tvl", USCRIPT_LATIN },
    357         { "tw", USCRIPT_LATIN },
    358         { "ty", USCRIPT_LATIN },
    359         { "tyv", USCRIPT_CYRILLIC },
    360         { "udm", USCRIPT_CYRILLIC },
    361         { "ug", USCRIPT_ARABIC },
    362         { "uk", USCRIPT_CYRILLIC },
    363         { "und", USCRIPT_LATIN },
    364         { "ur", USCRIPT_ARABIC },
    365         { "uz", USCRIPT_CYRILLIC },
    366         { "ve", USCRIPT_LATIN },
    367         { "vi", USCRIPT_LATIN },
    368         { "wal", USCRIPT_ETHIOPIC },
    369         { "war", USCRIPT_LATIN },
    370         { "wo", USCRIPT_LATIN },
    371         { "xh", USCRIPT_LATIN },
    372         { "yap", USCRIPT_LATIN },
    373         { "yo", USCRIPT_LATIN },
    374         { "za", USCRIPT_LATIN },
    375         { "zh", USCRIPT_SIMPLIFIED_HAN },
    376         { "zh_hk", USCRIPT_TRADITIONAL_HAN },
    377         { "zh_tw", USCRIPT_TRADITIONAL_HAN },
    378         { "zu", USCRIPT_LATIN }
    379     };
    380 
    381     typedef HashMap<String, UScriptCode> LocaleScriptMap;
    382     DEFINE_STATIC_LOCAL(LocaleScriptMap, localeScriptMap, ());
    383     if (localeScriptMap.isEmpty()) {
    384         for (size_t i = 0; i < sizeof(localeScriptList) / sizeof(localeScriptList[0]); ++i)
    385             localeScriptMap.set(localeScriptList[i].locale, localeScriptList[i].script);
    386     }
    387 
    388     String canonicalLocale = locale.lower().replace('-', '_');
    389     while (!canonicalLocale.isEmpty()) {
    390         HashMap<String, UScriptCode>::iterator it = localeScriptMap.find(canonicalLocale);
    391         if (it != localeScriptMap.end())
    392             return it->value;
    393         size_t pos = canonicalLocale.reverseFind('_');
    394         if (pos == kNotFound)
    395             break;
    396         UScriptCode code = scriptNameToCode(canonicalLocale.substring(pos + 1));
    397         if (code != USCRIPT_INVALID_CODE && code != USCRIPT_UNKNOWN)
    398             return code;
    399         canonicalLocale = canonicalLocale.substring(0, pos);
    400     }
    401     return USCRIPT_COMMON;
    402 }
    403 
    404 } // namespace WebCore
    405