Home | History | Annotate | Download | only in compact_lang_det
      1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // This file extends lang_enc.cc with additional languages and extended routines
      6 // It is current with Unicode 5.1 (beta Jan 2008)
      7 //
      8 
      9 #include <stdlib.h>
     10 #include <stdio.h>
     11 #include <string.h>
     12 
     13 #include "encodings/compact_lang_det/ext_lang_enc.h"
     14 #include "encodings/compact_lang_det/win/cld_macros.h"
     15 #include "encodings/compact_lang_det/win/cld_strtoint.h"
     16 
     17 // Language names above NUM_LANGUAGES
     18 // These are also the C enum declared names
     19 static const char* const kExtLanguageName[] = {
     20 "X_BORK_BORK_BORK", "X_PIG_LATIN", "X_HACKER", "X_KLINGON", "X_ELMER_FUDD",
     21 
     22 // Pseudo-languages for Unicode scripts that express a single language
     23 "X_OGHAM", "X_RUNIC", "X_YI", "X_OLD_ITALIC", "X_GOTHIC",
     24 "X_DESERET", "X_HANUNOO", "X_BUHID", "X_TAGBANWA", "X_TAI_LE",
     25 "X_LINEAR_B", "X_UGARITIC", "X_SHAVIAN", "X_OSMANYA", "X_CYPRIOT",
     26 "X_BUGINESE", "X_COPTIC", "X_NEW_TAI_LUE", "X_GLAGOLITIC", "X_TIFINAGH",
     27 "X_SYLOTI_NAGRI", "X_OLD_PERSIAN", "X_KHAROSHTHI", "X_BALINESE", "X_CUNEIFORM",
     28 "X_PHOENICIAN", "X_PHAGS_PA", "X_NKO",
     29 
     30 // Unicode 5.1
     31 "X_SUDANESE", "X_LEPCHA", "X_OL_CHIKI", "X_VAI", "X_SAURASHTRA",
     32 "X_KAYAH_LI", "X_REJANG", "X_LYCIAN", "X_CARIAN", "X_LYDIAN",
     33 "X_CHAM",
     34 };
     35 
     36 
// C enum declared names of the core languages, for programs that generate
// C code. ExtLanguageDeclaredName() indexes this table directly with the
// Language value, so entry i must be the spelling of the enumerator whose
// value is i; the trailing /* n */ comments record that index.
// A compile-time assertion following the table checks that it has exactly
// NUM_LANGUAGES entries.
static const char* const kExtLangDeclaredName[] = {
  "ENGLISH",      /* 0 */
  "DANISH",       /* 1 */
  "DUTCH",        /* 2 */
  "FINNISH",      /* 3 */
  "FRENCH",       /* 4 */
  "GERMAN",       /* 5 */
  "HEBREW",       /* 6 */
  "ITALIAN",      /* 7 */
  "JAPANESE",     /* 8 */
  "KOREAN",       /* 9 */
  "NORWEGIAN",    /* 10 */
  "POLISH",       /* 11 */
  "PORTUGUESE",   /* 12 */
  "RUSSIAN",      /* 13 */
  "SPANISH",      /* 14 */
  "SWEDISH",      /* 15 */
  "CHINESE",      /* 16 */
  "CZECH",        /* 17 */
  "GREEK",        /* 18 */
  "ICELANDIC",    /* 19 */
  "LATVIAN",      /* 20 */
  "LITHUANIAN",   /* 21 */
  "ROMANIAN",     /* 22 */
  "HUNGARIAN",    /* 23 */
  "ESTONIAN",     /* 24 */
  "TG_UNKNOWN_LANGUAGE",  /* 25 */
  "UNKNOWN_LANGUAGE",     /* 26 */
  "BULGARIAN",    /* 27 */
  "CROATIAN",     /* 28 */
  "SERBIAN",      /* 29 */
  "IRISH",        /* 30 */
  "GALICIAN",     /* 31 */
  "TAGALOG",      /* 32 */
  "TURKISH",      /* 33 */
  "UKRAINIAN",    /* 34 */
  "HINDI",        /* 35 */
  "MACEDONIAN",   /* 36 */
  "BENGALI",      /* 37 */
  "INDONESIAN",   /* 38 */
  "LATIN",        /* 39 */
  "MALAY",        /* 40 */
  "MALAYALAM",    /* 41 */
  "WELSH",        /* 42 */
  "NEPALI",       /* 43 */
  "TELUGU",       /* 44 */
  "ALBANIAN",     /* 45 */
  "TAMIL",        /* 46 */
  "BELARUSIAN",   /* 47 */
  "JAVANESE",     /* 48 */
  "OCCITAN",      /* 49 */
  "URDU",         /* 50 */
  "BIHARI",       /* 51 */
  "GUJARATI",     /* 52 */
  "THAI",         /* 53 */
  "ARABIC",       /* 54 */
  "CATALAN",      /* 55 */
  "ESPERANTO",    /* 56 */
  "BASQUE",       /* 57 */
  "INTERLINGUA",  /* 58 */
  "KANNADA",      /* 59 */
  "PUNJABI",      /* 60 */
  "SCOTS_GAELIC", /* 61 */
  "SWAHILI",      /* 62 */
  "SLOVENIAN",    /* 63 */
  "MARATHI",      /* 64 */
  "MALTESE",      /* 65 */
  "VIETNAMESE",   /* 66 */
  "FRISIAN",      /* 67 */
  "SLOVAK",       /* 68 */
  "CHINESE_T",    /* 69 */
  "FAROESE",      /* 70 */
  "SUNDANESE",    /* 71 */
  "UZBEK",        /* 72 */
  "AMHARIC",      /* 73 */
  "AZERBAIJANI",  /* 74 */
  "GEORGIAN",     /* 75 */
  "TIGRINYA",     /* 76 */
  "PERSIAN",      /* 77 */
  "BOSNIAN",      /* 78 */
  "SINHALESE",    /* 79 */
  "NORWEGIAN_N",  /* 80 */
  "PORTUGUESE_P", /* 81 */
  "PORTUGUESE_B", /* 82 */
  "XHOSA",        /* 83 */
  "ZULU",         /* 84 */
  "GUARANI",      /* 85 */
  "SESOTHO",      /* 86 */
  "TURKMEN",      /* 87 */
  "KYRGYZ",       /* 88 */
  "BRETON",       /* 89 */
  "TWI",          /* 90 */
  "YIDDISH",      /* 91 */
  "SERBO_CROATIAN",       /* 92 */
  "SOMALI",       /* 93 */
  "UIGHUR",       /* 94 */
  "KURDISH",      /* 95 */
  "MONGOLIAN",    /* 96 */
  "ARMENIAN",     /* 97 */
  "LAOTHIAN",     /* 98 */
  "SINDHI",       /* 99 */
  "RHAETO_ROMANCE",  /* 100 */
  "AFRIKAANS",    /* 101 */
  "LUXEMBOURGISH", /* 102 */
  "BURMESE",      /* 103 */
  "KHMER",        /* 104 */
  "TIBETAN",      /* 105 */
  "DHIVEHI",      /* 106 */       // sometimes spelled Divehi; lang of Maldives
  "CHEROKEE",     /* 107 */
  "SYRIAC",       /* 108 */
  "LIMBU",        /* 109 */
  "ORIYA",        /* 110 */
  "ASSAMESE",     /* 111 */
  "CORSICAN",     /* 112 */
  "INTERLINGUE",  /* 113 */
  "KAZAKH",       /* 114 */
  "LINGALA",      /* 115 */
  "MOLDAVIAN",    /* 116 */
  "PASHTO",       /* 117 */
  "QUECHUA",      /* 118 */
  "SHONA",        /* 119 */
  "TAJIK",        /* 120 */
  "TATAR",        /* 121 */
  "TONGA",        /* 122 */
  "YORUBA",       /* 123 */
  "CREOLES_AND_PIDGINS_ENGLISH_BASED",      /* 124 */
  "CREOLES_AND_PIDGINS_FRENCH_BASED",       /* 125 */
  "CREOLES_AND_PIDGINS_PORTUGUESE_BASED",   /* 126 */
  "CREOLES_AND_PIDGINS_OTHER",              /* 127 */
  "MAORI",        /* 128 */
  "WOLOF",        /* 129 */
  "ABKHAZIAN",    /* 130 */
  "AFAR",         /* 131 */
  "AYMARA",       /* 132 */
  "BASHKIR",      /* 133 */
  "BISLAMA",      /* 134 */
  "DZONGKHA",     /* 135 */
  "FIJIAN",       /* 136 */
  "GREENLANDIC",  /* 137 */
  "HAUSA",        /* 138 */
  "HAITIAN_CREOLE",  /* 139 */
  "INUPIAK",      /* 140 */
  "INUKTITUT",    /* 141 */
  "KASHMIRI",     /* 142 */
  "KINYARWANDA",  /* 143 */
  "MALAGASY",     /* 144 */
  "NAURU",        /* 145 */
  "OROMO",        /* 146 */
  "RUNDI",        /* 147 */
  "SAMOAN",       /* 148 */
  "SANGO",        /* 149 */
  "SANSKRIT",     /* 150 */
  "SISWANT",      /* 151 */
  "TSONGA",       /* 152 */
  "TSWANA",       /* 153 */
  "VOLAPUK",      /* 154 */
  "ZHUANG",       /* 155 */
  "KHASI",        /* 156 */
  "SCOTS",        /* 157 */
  "GANDA",        /* 158 */
  "MANX",         /* 159 */
  "MONTENEGRIN",  /* 160 */
  // Add new language declared names just before here
};
    202 
// Compile-time check that kExtLangDeclaredName has exactly NUM_LANGUAGES
// entries, so that indexing it with any core Language value stays in bounds.
COMPILE_ASSERT(arraysize(kExtLangDeclaredName) == NUM_LANGUAGES,
       kExtLangDeclaredName_has_incorrect_length);
    205 
    206 
    207 // Language codes above NUM_LANGUAGES
    208 // I made all these up, except Klingon from ISO-639-2 (dsites)
    209 // NOTE: zza is a standard name
    210 static const char* const kExtLanguageCode[] = {
    211   // "X_BORK_BORK_BORK", "X_PIG_LATIN", "X_HACKER", "X_KLINGON", "X_ELMER_FUDD",
    212   // All Latin script
    213   "zzb", "zzp", "zzh", "tlh", "zze",
    214 
    215   // Pseudo-languages for Unicode scripts that express a single language
    216   "xx-Ogam", "xx-Runr", "xx-Yiii", "xx-Ital", "xx-Goth",
    217   "xx-Dsrt", "xx-Hano", "xx-Buhd", "xx-Tagb", "xx-Tale",
    218   "xx-Linb", "xx-Ugar", "xx-Shaw", "xx-Osma", "xx-Cprt",
    219   "xx-Bugi", "xx-Copt", "xx-Talu", "xx-Glag", "xx-Tfng",
    220   "xx-Sylo", "xx-Xpeo", "xx-Khar", "xx-Bali", "xx-Xsux",
    221   "xx-Phnx", "xx-Phag", "xx-Nkoo",
    222 
    223   // Unicode 5.1
    224   "xx-Sund", "xx-Lepc", "xx-Olck", "xx-Vaii", "xx-Saur",
    225   "xx-Kali", "xx-Rjng", "xx-Lyci", "xx-Cari", "xx-Lydi",
    226   "xx-Cham",
    227 };
    228 
    229 
    230 // Given the Language, returns its string name used as the output by
    231 // the lang/enc identifier, e.g. "Korean"
    232 // "invalid_language" if the input is invalid.
    233 // TG_UNKNOWN_LANGUAGE is used as a placeholder for the "ignore me" language,
    234 // used to subtract out HTML, link farms, DNA strings, and alittle English porn
    235 const char* ExtLanguageName(const Language lang) {
    236   if (lang < 0) {
    237     // No-text-at-all result from a Tote
    238     return "";
    239   }
    240   // CompactLanguageDetect extension
    241   if (lang == TG_UNKNOWN_LANGUAGE) {
    242     return "Ignore";
    243   }
    244   if ((0 <= lang) && (lang < NUM_LANGUAGES)) {
    245     return LanguageName(lang);
    246   }
    247   if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) {
    248     return kExtLanguageName[lang - EXT_LANGUAGE_BASE];
    249   }
    250   return invalid_language_name();
    251 }
    252 
    253 
    254 // Given the Language, returns its Language enum spelling, for use by
    255 // programs that create C declarations, e.g. "KOREAN"
    256 // "UNKNOWN_LANGUAGE" if the input is invalid.
    257 const char* ExtLanguageDeclaredName(const Language lang) {
    258   if ((0 <= lang) && (lang < NUM_LANGUAGES)) {
    259     return kExtLangDeclaredName[lang];
    260   }
    261   if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) {
    262     return kExtLanguageName[lang - EXT_LANGUAGE_BASE];
    263   }
    264   return "UNKNOWN_LANGUAGE";
    265 }
    266 
    267 // Given the Language, return the language code, e.g. "ko"
    268 const char* ExtLanguageCode(const Language lang) {
    269   // Hack for ignore/porn pseudo-language
    270   if (lang == TG_UNKNOWN_LANGUAGE) {
    271     return "xxx";
    272   }
    273   if ((0 <= lang) && (lang < NUM_LANGUAGES)) {
    274     return LanguageCode(lang);
    275   }
    276   if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) {
    277     return kExtLanguageCode[lang - EXT_LANGUAGE_BASE];
    278   }
    279   return "??";
    280 }
    281 
    282 
    283 // Convert "en-Latn-GB" to ENGLISH
    284 // Normalize to PORTUGUESE, not PORTUGUESE_B nor PORTUGUESE_P
    285 // Consider for later: NORWEGIAN, NORWEGIAN_N
    286 // Consider for later: SCOTS, SCOTS_GAELIC
    287 // Consider for later: SERBO_CROATIAN, SERBIAN, CROATIAN, BOSNIAN
    288 //
    289 Language GetLanguageFromNumberOrName(const char* src) {
    290   if (strspn(src, "0123456789") == strlen(src)) {
    291     // All digits
    292     return static_cast<Language>(strto32(src, NULL, 10));
    293   }
    294 
    295   Language retlang = UNKNOWN_LANGUAGE;
    296   size_t len = strlen(src);
    297 
    298   if (true /*FLAGS_mergepairs*/) {
    299     // Merge sets of langauges pt-xx en-xx fr-xx, NOT bs/hr/sr
    300     if (memcmp(src, "pt-", 3) == 0) {return PORTUGUESE;}
    301     if (memcmp(src, "en-", 3) == 0) {return ENGLISH;}
    302     if (memcmp(src, "fr-", 3) == 0) {return FRENCH;}
    303     // Use NormalizeLanguage instead
    304     if (memcmp(src, "bs-", 3) == 0) {return CROATIAN;}
    305     if (memcmp(src, "hr-", 3) == 0) {return CROATIAN;}
    306     if (memcmp(src, "sr-Latn", 7) == 0) {return CROATIAN;}
    307     if (memcmp(src, "sh-Latn", 7) == 0) {return CROATIAN;}
    308     if (memcmp(src, "sr-Cyrl", 7) == 0) {return SERBIAN;}
    309     if (memcmp(src, "sh-Cyrl", 7) == 0) {return SERBIAN;}
    310   }
    311 
    312   // Extensions
    313   if (len >= 3) {
    314     // Standin for ignore/porn "language"
    315     if (memcmp(src, "xxx", 3) == 0) {return TG_UNKNOWN_LANGUAGE;}
    316 
    317     if (memcmp(src, "zzb", 3) == 0) {return X_BORK_BORK_BORK;}
    318     if (memcmp(src, "zzp", 3) == 0) {return X_PIG_LATIN;}
    319     if (memcmp(src, "zzh", 3) == 0) {return X_HACKER;}
    320     if (memcmp(src, "tlh", 3) == 0) {return X_KLINGON;}
    321     if (memcmp(src, "zze", 3) == 0) {return X_ELMER_FUDD;}
    322   }
    323 
    324   // We have a name like en-Latn-GB or pt-BR
    325   // First, get rid of some special cases
    326   if (len <= 3) {
    327     LanguageFromCode(src, &retlang);
    328   } else if (len == 7) {
    329     // More Extensions
    330     if (memcmp(src, "xx-", 3) == 0) {
    331       if (memcmp(src, "xx-Ogam", 7) == 0) {return X_OGHAM;}
    332       if (memcmp(src, "xx-Runr", 7) == 0) {return X_RUNIC;}
    333       if (memcmp(src, "xx-Yiii", 7) == 0) {return X_YI;}
    334       if (memcmp(src, "xx-Ital", 7) == 0) {return X_OLD_ITALIC;}
    335       if (memcmp(src, "xx-Goth", 7) == 0) {return X_GOTHIC;}
    336       if (memcmp(src, "xx-Dsrt", 7) == 0) {return X_DESERET;}
    337       if (memcmp(src, "xx-Hano", 7) == 0) {return X_HANUNOO;}
    338       if (memcmp(src, "xx-Buhd", 7) == 0) {return X_BUHID;}
    339       if (memcmp(src, "xx-Tagb", 7) == 0) {return X_TAGBANWA;}
    340       if (memcmp(src, "xx-Tale", 7) == 0) {return X_TAI_LE;}
    341       if (memcmp(src, "xx-Linb", 7) == 0) {return X_LINEAR_B;}
    342       if (memcmp(src, "xx-Ugar", 7) == 0) {return X_UGARITIC;}
    343       if (memcmp(src, "xx-Shaw", 7) == 0) {return X_SHAVIAN;}
    344       if (memcmp(src, "xx-Osma", 7) == 0) {return X_OSMANYA;}
    345       if (memcmp(src, "xx-Cprt", 7) == 0) {return X_CYPRIOT;}
    346       if (memcmp(src, "xx-Bugi", 7) == 0) {return X_BUGINESE;}
    347       if (memcmp(src, "xx-Copt", 7) == 0) {return X_COPTIC;}
    348       if (memcmp(src, "xx-Talu", 7) == 0) {return X_NEW_TAI_LUE;}
    349       if (memcmp(src, "xx-Glag", 7) == 0) {return X_GLAGOLITIC;}
    350       if (memcmp(src, "xx-Tfng", 7) == 0) {return X_TIFINAGH;}
    351       if (memcmp(src, "xx-Sylo", 7) == 0) {return X_SYLOTI_NAGRI;}
    352       if (memcmp(src, "xx-Xpeo", 7) == 0) {return X_OLD_PERSIAN;}
    353       if (memcmp(src, "xx-Khar", 7) == 0) {return X_KHAROSHTHI;}
    354       if (memcmp(src, "xx-Bali", 7) == 0) {return X_BALINESE;}
    355       if (memcmp(src, "xx-Xsux", 7) == 0) {return X_CUNEIFORM;}
    356       if (memcmp(src, "xx-Phnx", 7) == 0) {return X_PHOENICIAN;}
    357       if (memcmp(src, "xx-Phag", 7) == 0) {return X_PHAGS_PA;}
    358       if (memcmp(src, "xx-Nkoo", 7) == 0) {return X_NKO;}
    359 
    360       // Unicode 5.1
    361       if (memcmp(src, "xx-Sund", 7) == 0) {return X_SUDANESE;}
    362       if (memcmp(src, "xx-Lepc", 7) == 0) {return X_LEPCHA;}
    363       if (memcmp(src, "xx-Olck", 7) == 0) {return X_OL_CHIKI;}
    364       if (memcmp(src, "xx-Vaii", 7) == 0) {return X_VAI;}
    365       if (memcmp(src, "xx-Saur", 7) == 0) {return X_SAURASHTRA;}
    366       if (memcmp(src, "xx-Kali", 7) == 0) {return X_KAYAH_LI;}
    367       if (memcmp(src, "xx-Rjng", 7) == 0) {return X_REJANG;}
    368       if (memcmp(src, "xx-Lyci", 7) == 0) {return X_LYCIAN;}
    369       if (memcmp(src, "xx-Cari", 7) == 0) {return X_CARIAN;}
    370       if (memcmp(src, "xx-Lydi", 7) == 0) {return X_LYDIAN;}
    371       if (memcmp(src, "xx-Cham", 7) == 0) {return X_CHAM;}
    372     }
    373   }
    374   // Some other weird ones
    375   // Could be Latn or Limb; all our current training data is Latn
    376   if (strcmp(src, "sit-NP") == 0) {return LIMBU;}
    377   if (strcmp(src, "un-Latn") == 0) {return UNKNOWN_LANGUAGE;}
    378 
    379   // Multi-country langauges
    380   if (memcmp(src, "zh", 2) == 0) {
    381     if (memcmp(&src[len - 2], "TW", 2) == 0) {return CHINESE_T;}
    382     if (memcmp(&src[len - 2], "HK", 2) == 0) {return CHINESE_T;}
    383     return CHINESE;
    384   }
    385   if (memcmp(src, "pt", 2) == 0) {
    386     if (memcmp(&src[len - 2], "BR", 2) == 0) {return PORTUGUESE;}
    387     return PORTUGUESE;
    388   }
    389   if (memcmp(src, "fr", 2) == 0) {
    390     if (memcmp(&src[len -2], "CA", 2) == 0) {return FRENCH;}
    391     return FRENCH;
    392   }
    393 
    394   // None of the special cases matched
    395   if (src[2] == '-') {
    396     char temp[4];
    397     memcpy(temp, src, 4);
    398     temp[2] = '\0';
    399     LanguageFromCode(temp, &retlang);
    400   }
    401   if (src[3] == '-') {
    402     char temp[4];
    403     memcpy(temp, src, 4);
    404     temp[3] = '\0';
    405     LanguageFromCode(temp, &retlang);
    406   }
    407   if (retlang != UNKNOWN_LANGUAGE) {
    408     return retlang;
    409   }
    410 
    411   return retlang;
    412 }
    413 
    414 typedef struct {
    415   const char* name;
    416   UnicodeLScript lscript;
    417 } NameScriptPair;
    418 
// Four-letter script code -> UnicodeLScript table.
// Must stay in ascending strcmp() order of the code: the lookup in
// GetLScriptFromNumberOrName() is a binary search over this array.
static const NameScriptPair kNameScriptPair[] = {
  // Rows tagged "Unicode 5.1" are the scripts added for that version
  {"Arab", ULScript_Arabic},
  {"Armn", ULScript_Armenian},
  {"Bali", ULScript_Balinese},
  {"Beng", ULScript_Bengali},
  {"Bugi", ULScript_Buginese},
  {"Buhd", ULScript_Buhid},
  {"Cans", ULScript_Canadian_Aboriginal},
  {"Cari", ULScript_Carian},      // Unicode 5.1
  {"Cham", ULScript_Cham},        // Unicode 5.1
  {"Cher", ULScript_Cherokee},
  {"Copt", ULScript_Coptic},
  {"Cprt", ULScript_Cypriot},
  {"Cyrl", ULScript_Cyrillic},
  {"Deva", ULScript_Devanagari},
  {"Dsrt", ULScript_Deseret},
  {"Ethi", ULScript_Ethiopic},
  {"Geor", ULScript_Georgian},
  {"Glag", ULScript_Glagolitic},
  {"Goth", ULScript_Gothic},
  {"Grek", ULScript_Greek},
  {"Gujr", ULScript_Gujarati},
  {"Guru", ULScript_Gurmukhi},
  {"Hani", ULScript_HanCJK},
  {"Hano", ULScript_Hanunoo},
  {"Hebr", ULScript_Hebrew},
  {"Ital", ULScript_Old_Italic},
  {"Kali", ULScript_Kayah_Li},    // Unicode 5.1
  {"Khar", ULScript_Kharoshthi},
  {"Khmr", ULScript_Khmer},
  {"Knda", ULScript_Kannada},
  {"Laoo", ULScript_Lao},
  {"Latn", ULScript_Latin},
  {"Lepc", ULScript_Lepcha},      // Unicode 5.1
  {"Limb", ULScript_Limbu},
  {"Linb", ULScript_Linear_B},
  {"Lyci", ULScript_Lycian},      // Unicode 5.1
  {"Lydi", ULScript_Lydian},      // Unicode 5.1
  {"Mlym", ULScript_Malayalam},
  {"Mong", ULScript_Mongolian},
  {"Mymr", ULScript_Myanmar},
  {"Nkoo", ULScript_Nko},
  {"Ogam", ULScript_Ogham},
  {"Olck", ULScript_Ol_Chiki},    // Unicode 5.1
  {"Orya", ULScript_Oriya},
  {"Osma", ULScript_Osmanya},
  {"Phag", ULScript_Phags_Pa},
  {"Phnx", ULScript_Phoenician},
  {"Rjng", ULScript_Rejang},      // Unicode 5.1
  {"Runr", ULScript_Runic},
  {"Saur", ULScript_Saurashtra},  // Unicode 5.1
  {"Shaw", ULScript_Shavian},
  {"Sinh", ULScript_Sinhala},
  {"Sund", ULScript_Sundanese},   // Unicode 5.1
  {"Sylo", ULScript_Syloti_Nagri},
  {"Syrc", ULScript_Syriac},
  {"Tagb", ULScript_Tagbanwa},
  {"Tale", ULScript_Tai_Le},
  {"Talu", ULScript_New_Tai_Lue},
  {"Taml", ULScript_Tamil},
  {"Telu", ULScript_Telugu},
  {"Tfng", ULScript_Tifinagh},
  {"Tglg", ULScript_Tagalog},
  {"Thaa", ULScript_Thaana},
  {"Thai", ULScript_Thai},
  {"Tibt", ULScript_Tibetan},
  {"Ugar", ULScript_Ugaritic},
  {"Vaii", ULScript_Vai},         // Unicode 5.1 // NOTE: apparently 'Vai '
  {"Xpeo", ULScript_Old_Persian},
  {"Xsux", ULScript_Cuneiform},
  {"Yiii", ULScript_Yi},
  {"Zyyy", ULScript_Common},
  // NOTE(review): ISO 15924 uses "Zinh" for Inherited and "Zzzz" for
  // unknown/uncoded script -- confirm this mapping is intentional.
  {"Zzzz", ULScript_Inherited},
};
    495 
    496 // Convert "en-Latn-GB" to ULScript_Latin
    497 UnicodeLScript GetLScriptFromNumberOrName(const char* src) {
    498   if (strspn(src, "0123456789") == strlen(src)) {
    499     // All digits
    500     return static_cast<UnicodeLScript>(strto32(src, NULL, 10));
    501   }
    502 
    503   if (strcmp(src, "zh-TW") == 0) {return ULScript_HanCJK;}
    504   if (strcmp(src, "zh-CN") == 0) {return ULScript_HanCJK;}
    505   if (strcmp(src, "pt-BR") == 0) {return ULScript_Latin;}
    506   if (strcmp(src, "pt-PT") == 0) {return ULScript_Latin;}
    507   // Could be Latn or Limb; all our current training data is Latn
    508   if (strcmp(src, "sit-NP") == 0) {return ULScript_Latin;}
    509 
    510   // Isolate just the script field
    511   char temp[5];
    512   const char* src2 = strchr(src, '-');
    513   if (src2 == NULL) {return ULScript_Latin;}
    514   src2 += 1;      // over the -
    515   memcpy(temp, src2, 4);
    516   temp[4] = '\0';
    517 
    518   int lo = 0;
    519   int hi = ULScript_NUM_SCRIPTS;
    520   while (lo < hi) {
    521     int mid = (lo + hi) >> 1;
    522     if (strcmp(temp, kNameScriptPair[mid].name) < 0) {
    523       hi = mid;
    524     } else if (strcmp(temp, kNameScriptPair[mid].name) > 0) {
    525       lo = mid + 1;
    526     } else {
    527       return kNameScriptPair[mid].lscript;
    528     }
    529   }
    530   return ULScript_Latin;
    531 }
    532 
    533 
    534 // Merge together some languages, such as bo/hr/sr
    535 // Croatian Latin and Serbian Cyrillic now.
    536 Language NormalizeLanguage(Language lang) {
    537   if (lang == BOSNIAN) {return CROATIAN;}
    538   if (lang == SERBO_CROATIAN) {return SERBIAN;}
    539 
    540   if (lang == PORTUGUESE_P) {return PORTUGUESE;}
    541   if (lang == PORTUGUESE_B) {return PORTUGUESE;}
    542 
    543   return lang;
    544 }
    545 
    546