Home | History | Annotate | Download | only in common
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 1997-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *
      9 * File ULOC.CPP
     10 *
     11 * Modification History:
     12 *
     13 *   Date        Name        Description
     14 *   04/01/97    aliu        Creation.
     15 *   08/21/98    stephen     JDK 1.2 sync
     16 *   12/08/98    rtg         New Locale implementation and C API
     17 *   03/15/99    damiba      overhaul.
     18 *   04/06/99    stephen     changed setDefault() to realloc and copy
     19 *   06/14/99    stephen     Changed calls to ures_open for new params
     20 *   07/21/99    stephen     Modified setDefault() to propagate to C++
     21 *   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
     22 *                           brought canonicalization code into line with spec
     23 *****************************************************************************/
     24 
     25 /*
     26    POSIX's locale format, from putil.c: [no spaces]
     27 
     28      ll [ _CC ] [ . MM ] [ @ VV]
     29 
     30      l = lang, C = ctry, M = charmap, V = variant
     31 */
     32 
     33 #include "unicode/utypes.h"
     34 #include "unicode/ustring.h"
     35 #include "unicode/uloc.h"
     36 
     37 #include "putilimp.h"
     38 #include "ustr_imp.h"
     39 #include "ulocimp.h"
     40 #include "umutex.h"
     41 #include "cstring.h"
     42 #include "cmemory.h"
     43 #include "locmap.h"
     44 #include "uarrsort.h"
     45 #include "uenumimp.h"
     46 #include "uassert.h"
     47 #include "charstr.h"
     48 
     49 #include <stdio.h> /* for sprintf */
     50 
     51 U_NAMESPACE_USE
     52 
     53 /* ### Declarations **************************************************/
     54 
     55 /* Locale stuff from locid.cpp */
     56 U_CFUNC void locale_set_default(const char *id);
     57 U_CFUNC const char *locale_get_default(void);
     58 U_CFUNC int32_t
     59 locale_getKeywords(const char *localeID,
     60             char prev,
     61             char *keywords, int32_t keywordCapacity,
     62             char *values, int32_t valuesCapacity, int32_t *valLen,
     63             UBool valuesToo,
     64             UErrorCode *status);
     65 
     66 /* ### Data tables **************************************************/
     67 
     68 /**
     69  * Table of language codes, both 2- and 3-letter, with preference
     70  * given to 2-letter codes where possible.  Includes 3-letter codes
     71  * that lack a 2-letter equivalent.
     72  *
     73  * This list must be in sorted order.  This list is returned directly
     74  * to the user by some API.
     75  *
     76  * This list must be kept in sync with LANGUAGES_3, with corresponding
     77  * entries matched.
     78  *
     79  * This table should be terminated with a NULL entry, followed by a
     80  * second list, and another NULL entry.  The first list is visible to
     81  * user code when this array is returned by API.  The second list
     82  * contains codes we support, but do not expose through user API.
     83  *
     84  * Notes
     85  *
     86  * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
     87  * include the revisions up to 2001/7/27 *CWB*
     88  *
     89  * The 3 character codes are the terminology codes like RFC 3066.  This
     90  * is compatible with prior ICU codes
     91  *
     92  * "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
     93  * table but now at the end of the table because 3 character codes are
     94  * duplicates.  This avoids bad searches going from 3 to 2 character
     95  * codes.
     96  *
     97  * The range qaa-qtz is reserved for local use
     98  */
     99 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
    100 /* ISO639 table version is 20150505 */
    101 /* Subsequent hand addition of selected languages */
    102 static const char * const LANGUAGES[] = { /* entry i pairs with LANGUAGES_3[i] */
    103     "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "aeb",
    104     "af",  "afh", "agq", "ain", "ak",  "akk", "akz", "ale",
    105     "aln", "alt", "am",  "an",  "ang", "anp", "ar",  "arc",
    106     "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
    107     "asa", "ase", "ast", "av",  "avk", "awa", "ay",  "az",
    108     "ba",  "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    109     "be",  "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
    110     "bgn", "bho", "bi",  "bik", "bin", "bjn", "bkm", "bla",
    111     "bm",  "bn",  "bo",  "bpy", "bqi", "br",  "bra", "brh",
    112     "brx", "bs",  "bss", "bua", "bug", "bum", "byn", "byv",
    113     "ca",  "cad", "car", "cay", "cch", "ccp", "ce",  "ceb", "cgg",
    114     "ch",  "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    115     "chr", "chy", "ckb", "co",  "cop", "cps", "cr",  "crh",
    116     "cs",  "csb", "cu",  "cv",  "cy",
    117     "da",  "dak", "dar", "dav", "de",  "del", "den", "dgr",
    118     "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
    119     "dyo", "dyu", "dz",  "dzg",
    120     "ebu", "ee",  "efi", "egl", "egy", "eka", "el",  "elx",
    121     "en",  "enm", "eo",  "es",  "esu", "et",  "eu",  "ewo",
    122     "ext",
    123     "fa",  "fan", "fat", "ff",  "fi",  "fil", "fit", "fj",
    124     "fo",  "fon", "fr",  "frc", "frm", "fro", "frp", "frr",
    125     "frs", "fur", "fy",
    126     "ga",  "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
    127     "gez", "gil", "gl",  "glk", "gmh", "gn",  "goh", "gom",
    128     "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guc",
    129     "gur", "guz", "gv",  "gwi",
    130     "ha",  "hai", "hak", "haw", "he",  "hi",  "hif", "hil",
    131     "hit", "hmn", "ho",  "hr",  "hsb", "hsn", "ht",  "hu",
    132     "hup", "hy",  "hz",
    133     "ia",  "iba", "ibb", "id",  "ie",  "ig",  "ii",  "ik",
    134     "ilo", "inh", "io",  "is",  "it",  "iu",  "izh",
    135     "ja",  "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    136     "jv",
    137     "ka",  "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    138     "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg",  "kgp",
    139     "kha", "kho", "khq", "khw", "ki",  "kiu", "kj",  "kk",
    140     "kkj", "kl",  "kln", "km",  "kmb", "kn",  "ko",  "koi",
    141     "kok", "kos", "kpe", "kr",  "krc", "kri", "krj", "krl",
    142     "kru", "ks",  "ksb", "ksf", "ksh", "ku",  "kum", "kut",
    143     "kv",  "kw",  "ky",
    144     "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lfn",
    145     "lg",  "li",  "lij", "liv", "lkt", "lmo", "ln",  "lo",
    146     "lol", "loz", "lrc", "lt",  "ltg", "lu",  "lua", "lui",
    147     "lun", "luo", "lus", "luy", "lv",  "lzh", "lzz",
    148     "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    149     "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg",  "mga",
    150     "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
    151     "ml",  "mn",  "mnc", "mni", "moh", "mos", "mr",  "mrj",
    152     "ms",  "mt",  "mua", "mul", "mus", "mwl", "mwr", "mwv",
    153     "my",  "mye", "myv", "mzn",
    154     "na",  "nan", "nap", "naq", "nb",  "nd",  "nds", "ne",
    155     "new", "ng",  "nia", "niu", "njo", "nl",  "nmg", "nn",
    156     "nnh", "no",  "nog", "non", "nov", "nqo", "nr",  "nso",
    157     "nus", "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi",
    158     "oc",  "oj",  "om",  "or",  "os",  "osa", "ota",
    159     "pa",  "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
    160     "pdt", "peo", "pfl", "phn", "pi",  "pl",  "pms", "pnt",
    161     "pon", "prg", "pro", "ps",  "pt",
    162     "qu",  "quc", "qug",
    163     "raj", "rap", "rar", "rgn", "rif", "rm",  "rn",  "ro",
    164     "rof", "rom", "rtm", "ru",  "rue", "rug", "rup",
    165     "rw",  "rwk",
    166     "sa",  "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    167     "sba", "sbp", "sc",  "scn", "sco", "sd",  "sdc", "sdh",
    168     "se",  "see", "seh", "sei", "sel", "ses", "sg",  "sga",
    169     "sgs", "shi", "shn", "shu", "si",  "sid", "sk",
    170     "sl",  "sli", "sly", "sm",  "sma", "smj", "smn", "sms",
    171     "sn",  "snk", "so",  "sog", "sq",  "sr",  "srn", "srr",
    172     "ss",  "ssy", "st",  "stq", "su",  "suk", "sus", "sux",
    173     "sv",  "sw",  "swb", "swc", "syc", "syr", "szl",
    174     "ta",  "tcy", "te",  "tem", "teo", "ter", "tet", "tg",
    175     "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr", "tl",
    176     "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tpi",
    177     "tr",  "tru", "trv", "ts",  "tsd", "tsi", "tt",  "ttt",
    178     "tum", "tvl", "tw",  "twq", "ty",  "tyv", "tzm",
    179     "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
    180     "vai", "ve",  "vec", "vep", "vi",  "vls", "vmf", "vo",
    181     "vot", "vro", "vun",
    182     "wa",  "wae", "wal", "war", "was", "wbp", "wo",  "wuu",
    183     "xal", "xh",  "xmf", "xog",
    184     "yao", "yap", "yav", "ybb", "yi",  "yo",  "yrl", "yue",
    185     "za",  "zap", "zbl", "zea", "zen", "zgh", "zh",  "zu",
    186     "zun", "zxx", "zza",
    187 NULL, /* terminates the first list (the one visible to user code) */
    188     "in",  "iw",  "ji",  "jw",  "sh",    /* obsolete language codes */
    189 NULL /* terminates the second list (supported but not exposed through API) */
    190 };
    191 
    192 static const char* const DEPRECATED_LANGUAGES[]={ /* deprecated 2-letter language codes, ended by two NULL entries */
        /* NOTE(review): "sh" is in the obsolete part of LANGUAGES but is not listed here — confirm that is intended. */
    193     "in", "iw", "ji", "jw", NULL, NULL
    194 };
    195 static const char* const REPLACEMENT_LANGUAGES[]={ /* presumably index-aligned with DEPRECATED_LANGUAGES (in->id, iw->he, ji->yi, jw->jv) — confirm against the lookup code */
    196     "id", "he", "yi", "jv", NULL, NULL
    197 };
    198 
    199 /**
    200  * Table of 3-letter language codes.
    201  *
    202  * This is a lookup table used to convert 3-letter language codes to
    203  * their 2-letter equivalent, where possible.  It must be kept in sync
    204  * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
    205  * same language as LANGUAGES_3[i].  The commented-out lines are
    206  * copied from LANGUAGES to make eyeballing this baby easier.
    207  *
    208  * Where a 3-letter language code has no 2-letter equivalent, the
    209  * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
    210  *
    211  * This table should be terminated with a NULL entry, followed by a
    212  * second list, and another NULL entry.  The two lists correspond to
    213  * the two lists in LANGUAGES.
    214  */
    215 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
    216 /* ISO639 table version is 20150505 */
    217 /* Subsequent hand addition of selected languages */
    218 static const char * const LANGUAGES_3[] = { /* entry i pairs with LANGUAGES[i] */
    219     "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
    220     "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
    221     "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
    222     "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
    223     "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
    224     "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    225     "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
    226     "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
    227     "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
    228     "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
    229     "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
    230     "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    231     "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
    232     "ces", "csb", "chu", "chv", "cym",
    233     "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
    234     "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
    235     "dyo", "dyu", "dzo", "dzg",
    236     "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
    237     "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
    238     "ext",
    239     "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
    240     "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
    241     "frs", "fur", "fry",
    242     "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
    243     "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
    244     "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
    245     "gur", "guz", "glv", "gwi",
    246     "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
    247     "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
    248     "hup", "hye", "her",
    249     "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
    250     "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
    251     "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    252     "jav",
    253     "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    254     "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
    255     "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
    256     "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
    257     "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
    258     "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
    259     "kom", "cor", "kir",
    260     "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
    261     "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
    262     "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
    263     "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
    264     "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    265     "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
    266     "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
    267     "mal", "mon", "mnc", "mni", "moh", "mos", "mar", "mrj",
    268     "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    269     "mya", "mye", "myv", "mzn",
    270     "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
    271     "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
    272     "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
    273     "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
    274     "oci", "oji", "orm", "ori", "oss", "osa", "ota",
    275     "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
    276     "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
    277     "pon", "prg", "pro", "pus", "por",
    278     "que", "quc", "qug",
    279     "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
    280     "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
    281     "kin", "rwk",
    282     "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    283     "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
    284     "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
    285     "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
    286     "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
    287     "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
    288     "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
    289     "swe", "swa", "swb", "swc", "syc", "syr", "szl",
    290     "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
    291     "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", "tgl",
    292     "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
    293     "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
    294     "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
    295     "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
    296     "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
    297     "vot", "vro", "vun",
    298     "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
    299     "xal", "xho", "xmf", "xog",
    300     "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
    301     "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
    302     "zun", "zxx", "zza",
    303 NULL, /* terminates the first list */
    304 /*  "in",  "iw",  "ji",  "jw",  "sh",                          */
        /* NOTE(review): "jw" maps to "jaw" here, while "jv" maps to "jav" in the first list — confirm "jaw" is intentional. */
    305     "ind", "heb", "yid", "jaw", "srp",
    306 NULL /* terminates the second (obsolete-code) list */
    307 };
    308 
    309 /**
    310  * Table of 2-letter country codes.
    311  *
    312  * This list must be in sorted order.  This list is returned directly
    313  * to the user by some API.
    314  *
    315  * This list must be kept in sync with COUNTRIES_3, with corresponding
    316  * entries matched.
    317  *
    318  * This table should be terminated with a NULL entry, followed by a
    319  * second list, and another NULL entry.  The first list is visible to
    320  * user code when this array is returned by API.  The second list
    321  * contains codes we support, but do not expose through user API.
    322  *
    323  * Notes:
    324  *
    325  * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
    326  * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
    327  * new codes keeping the old ones for compatibility updated to include
    328  * 1999/12/03 revisions *CWB*
    329  *
    330  * RO(ROM) is now RO(ROU) according to
    331  * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
    332  */
    333 static const char * const COUNTRIES[] = { /* entry i pairs with COUNTRIES_3[i] */
    334     "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",
    335     "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
    336     "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
    337     "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
    338     "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
    339     "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",
    340     "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",
    341     "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",
    342     "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
    343     "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
    344     "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
    345     "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
    346     "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
    347     "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
    348     "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
    349     "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
    350     "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
    351     "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
    352     "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
    353     "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
    354     "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
    355     "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
    356     "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
    357     "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
    358     "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",
    359     "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
    360     "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
    361     "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
    362     "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
    363     "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
    364 NULL, /* terminates the first list (the one visible to user code) */
        /* "RO" appears in both lists: the entry above pairs with "ROU", the one below with the older "ROM". */
    365     "AN",  "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",   /* obsolete country codes */
    366 NULL /* terminates the second list (supported but not exposed through API) */
    367 };
    368 
    369 static const char* const DEPRECATED_COUNTRIES[] = { /* deprecated 2-letter country codes, ended by two NULL entries */
    370     "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", NULL, NULL /* deprecated country list */
    371 };
    372 static const char* const REPLACEMENT_COUNTRIES[] = { /* same length as DEPRECATED_COUNTRIES; the commented row below shows the deprecated code above each replacement */
    373 /*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
    374     "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", NULL, NULL  /* replacement country codes */
    375 };
    376 
    377 /**
    378  * Table of 3-letter country codes.
    379  *
    380  * This is a lookup table used to convert 3-letter country codes to
    381  * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
    382  * For all valid i, COUNTRIES[i] must refer to the same country as
    383  * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
    384  * to make eyeballing this baby easier.
    385  *
    386  * This table should be terminated with a NULL entry, followed by a
    387  * second list, and another NULL entry.  The two lists correspond to
    388  * the two lists in COUNTRIES.
    389  */
    390 static const char * const COUNTRIES_3[] = { /* entry i pairs with COUNTRIES[i]; commented rows repeat the 2-letter codes */
    391 /*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
    392     "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
    393 /*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
    394     "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
    395 /*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
    396     "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
    397 /*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
    398     "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
    399 /*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
    400     "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
    401 /*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",     */
    402     "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
    403 /*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",     */
    404     "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK",
    405 /*  "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",     */
    406     "DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI",
    407 /*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
    408     "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
    409 /*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
    410     "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
    411 /*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
    412     "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
    413 /*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
    414     "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
    415 /*  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
    416     "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
    417 /*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
    418     "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
    419 /*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
    420     "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
    421 /*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
    422     "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
    423 /*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
    424     "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
    425 /*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
    426     "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
    427 /*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
    428     "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
    429 /*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
    430     "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
    431 /*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
    432     "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
    433 /*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
    434     "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
    435 /*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
    436     "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
    437 /*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
    438     "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
    439 /*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
    440     "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
    441 /*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
    442     "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
    443 /*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
    444     "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
    445 /*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
    446     "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
    447 /*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
    448     "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
    449 /*  "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
    450     "WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
    451 NULL, /* terminates the first list */
    452 /*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
    453     "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
    454 NULL /* terminates the second (obsolete-code) list */
    455 };
    456 
    457 typedef struct CanonicalizationMap { /* one row of CANONICALIZE_MAP: an input ID, its canonical form, and an optional keyword/value pair */
    458     const char *id;          /* input ID */
    459     const char *canonicalID; /* canonicalized output ID */
    460     const char *keyword;     /* keyword, or NULL if none */
    461     const char *value;       /* keyword value, or NULL if keyword==NULL */
    462 } CanonicalizationMap;
    463 
    464 /**
    465  * A map to canonicalize locale IDs.  This handles a variety of
    466  * different semantic kinds of transformations.
    467  */
    468 static const CanonicalizationMap CANONICALIZE_MAP[] = { /* fields per row: id, canonicalID, keyword, value */
    469     { "",               "en_US_POSIX", NULL, NULL }, /* .NET name */
    470     { "c",              "en_US_POSIX", NULL, NULL }, /* POSIX name */
    471     { "posix",          "en_US_POSIX", NULL, NULL }, /* POSIX name (alias of C) */
    472     { "art_LOJBAN",     "jbo", NULL, NULL }, /* registered name */
    473     { "az_AZ_CYRL",     "az_Cyrl_AZ", NULL, NULL }, /* .NET name */
    474     { "az_AZ_LATN",     "az_Latn_AZ", NULL, NULL }, /* .NET name */
        /* The *_PREEURO rows below each pair the base locale with a "currency" keyword value. */
    475     { "ca_ES_PREEURO",  "ca_ES", "currency", "ESP" },
    476     { "de__PHONEBOOK",  "de", "collation", "phonebook" }, /* Old ICU name */
    477     { "de_AT_PREEURO",  "de_AT", "currency", "ATS" },
    478     { "de_DE_PREEURO",  "de_DE", "currency", "DEM" },
    479     { "de_LU_PREEURO",  "de_LU", "currency", "LUF" },
    480     { "el_GR_PREEURO",  "el_GR", "currency", "GRD" },
    481     { "en_BE_PREEURO",  "en_BE", "currency", "BEF" },
    482     { "en_IE_PREEURO",  "en_IE", "currency", "IEP" },
    483     { "es__TRADITIONAL", "es", "collation", "traditional" }, /* Old ICU name */
    484     { "es_ES_PREEURO",  "es_ES", "currency", "ESP" },
    485     { "eu_ES_PREEURO",  "eu_ES", "currency", "ESP" },
    486     { "fi_FI_PREEURO",  "fi_FI", "currency", "FIM" },
    487     { "fr_BE_PREEURO",  "fr_BE", "currency", "BEF" },
    488     { "fr_FR_PREEURO",  "fr_FR", "currency", "FRF" },
    489     { "fr_LU_PREEURO",  "fr_LU", "currency", "LUF" },
    490     { "ga_IE_PREEURO",  "ga_IE", "currency", "IEP" },
    491     { "gl_ES_PREEURO",  "gl_ES", "currency", "ESP" },
    492     { "hi__DIRECT",     "hi", "collation", "direct" }, /* Old ICU name */
    493     { "it_IT_PREEURO",  "it_IT", "currency", "ITL" },
    494     { "ja_JP_TRADITIONAL", "ja_JP", "calendar", "japanese" }, /* Old ICU name */
    495     { "nb_NO_NY",       "nn_NO", NULL, NULL },  /* "markus said this was ok" :-) */
    496     { "nl_BE_PREEURO",  "nl_BE", "currency", "BEF" },
    497     { "nl_NL_PREEURO",  "nl_NL", "currency", "NLG" },
    498     { "pt_PT_PREEURO",  "pt_PT", "currency", "PTE" },
    499     { "sr_SP_CYRL",     "sr_Cyrl_RS", NULL, NULL }, /* .NET name */
    500     { "sr_SP_LATN",     "sr_Latn_RS", NULL, NULL }, /* .NET name */
    501     { "sr_YU_CYRILLIC", "sr_Cyrl_RS", NULL, NULL }, /* Linux name */
    502     { "th_TH_TRADITIONAL", "th_TH", "calendar", "buddhist" }, /* Old ICU name */
    503     { "uz_UZ_CYRILLIC", "uz_Cyrl_UZ", NULL, NULL }, /* Linux name */
    504     { "uz_UZ_CYRL",     "uz_Cyrl_UZ", NULL, NULL }, /* .NET name */
    505     { "uz_UZ_LATN",     "uz_Latn_UZ", NULL, NULL }, /* .NET name */
    506     { "zh_CHS",         "zh_Hans", NULL, NULL }, /* .NET name */
    507     { "zh_CHT",         "zh_Hant", NULL, NULL }, /* .NET name */
    508     { "zh_GAN",         "gan", NULL, NULL }, /* registered name */
    509     { "zh_GUOYU",       "zh", NULL, NULL }, /* registered name */
    510     { "zh_HAKKA",       "hak", NULL, NULL }, /* registered name */
    511     { "zh_MIN_NAN",     "nan", NULL, NULL }, /* registered name */
    512     { "zh_WUU",         "wuu", NULL, NULL }, /* registered name */
    513     { "zh_XIANG",       "hsn", NULL, NULL }, /* registered name */
    514     { "zh_YUE",         "yue", NULL, NULL }, /* registered name */
    515 };
    516 
    517 typedef struct VariantMap { /* one row of VARIANT_MAP: a variant name and the keyword/value pair associated with it */
    518     const char *variant;          /* input variant name */
    519     const char *keyword;     /* keyword, or NULL if none */
    520     const char *value;       /* keyword value, or NULL if keyword==NULL */
    521 } VariantMap;
    522 
    523 static const VariantMap VARIANT_MAP[] = { /* fields per row: variant, keyword, value */
    524     { "EURO",   "currency", "EUR" },
    525     { "PINYIN", "collation", "pinyin" }, /* Solaris variant */
    526     { "STROKE", "collation", "stroke" }  /* Solaris variant */
    527 };
    528 
    529 /* ### BCP47 Conversion *******************************************/
    530 /* Test if the locale id has BCP47 u extension and does not have '@' */
    531 #define _hasBCP47Extension(id) (id && uprv_strstr(id, "@") == NULL && getShortestSubtagLength(localeID) == 1)
    532 /* Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails.
         *
         * Calls uloc_forLanguageTag(id, buffer, length, NULL, err). finalID is set to id
         * (the unconverted input) when that call returns <= 0, when *err is a failure,
         * or when *err is U_STRING_NOT_TERMINATED_WARNING; in the last case *err is also
         * changed to U_BUFFER_OVERFLOW_ERROR. Otherwise finalID is set to buffer.
         *
         * Expands to a bare if/else statement (not an expression) and evaluates id and
         * err more than once.
         */
    533 #define _ConvertBCP47(finalID, id, buffer, length,err) \
    534         if (uloc_forLanguageTag(id, buffer, length, NULL, err) <= 0 ||  \
    535                 U_FAILURE(*err) || *err == U_STRING_NOT_TERMINATED_WARNING) { \
    536             finalID=id; \
    537             if (*err == U_STRING_NOT_TERMINATED_WARNING) { *err = U_BUFFER_OVERFLOW_ERROR; } \
    538         } else { \
    539             finalID=buffer; \
    540         }
    541 /* Gets the size of the shortest subtag in the given localeID. */
    542 static int32_t getShortestSubtagLength(const char *localeID) {
    543     int32_t localeIDLength = static_cast<int32_t>(uprv_strlen(localeID));
    544     int32_t length = localeIDLength;
    545     int32_t tmpLength = 0;
    546     int32_t i;
    547     UBool reset = TRUE;
    548 
    549     for (i = 0; i < localeIDLength; i++) {
    550         if (localeID[i] != '_' && localeID[i] != '-') {
    551             if (reset) {
    552                 tmpLength = 0;
    553                 reset = FALSE;
    554             }
    555             tmpLength++;
    556         } else {
    557             if (tmpLength != 0 && tmpLength < length) {
    558                 length = tmpLength;
    559             }
    560             reset = TRUE;
    561         }
    562     }
    563 
    564     return length;
    565 }
    566 
    567 /* ### Keywords **************************************************/
    568 #define UPRV_ISDIGIT(c) (((c) >= '0') && ((c) <= '9')) /* true for '0'..'9'; evaluates c twice */
        /* True when c is accepted by uprv_isASCIILetter() or is a digit; evaluates c more than once. */
    569 #define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c) )
    570 /* Punctuation/symbols allowed in legacy key values */
    571 #define UPRV_OK_VALUE_PUNCTUATION(c) ((c) == '_' || (c) == '-' || (c) == '+' || (c) == '/')
    572 
        /* Size of the buffer that locale_canonKeywordName() fills with a canonicalized keyword name. */
    573 #define ULOC_KEYWORD_BUFFER_LEN 25
        /* NOTE(review): not used in the visible part of the file; by its name, an upper bound on the number of keywords handled — confirm at the use site. */
    574 #define ULOC_MAX_NO_KEYWORDS 25
    575 
    576 U_CAPI const char * U_EXPORT2
    577 locale_getKeywordsStart(const char *localeID) {
    578     const char *result = NULL;
    579     if((result = uprv_strchr(localeID, '@')) != NULL) {
    580         return result;
    581     }
    582 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
    583     else {
    584         /* We do this because the @ sign is variant, and the @ sign used on one
    585         EBCDIC machine won't be compiled the same way on other EBCDIC based
    586         machines. */
    587         static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
    588         const uint8_t *charToFind = ebcdicSigns;
    589         while(*charToFind) {
    590             if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
    591                 return result;
    592             }
    593             charToFind++;
    594         }
    595     }
    596 #endif
    597     return NULL;
    598 }
    599 
    600 /**
    601  * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
    602  * @param keywordName incoming name to be canonicalized
    603  * @param status return status (keyword too long)
    604  * @return length of the keyword name
    605  */
    606 static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
    607 {
    608   int32_t keywordNameLen = 0;
    609 
    610   for (; *keywordName != 0; keywordName++) {
    611     if (!UPRV_ISALPHANUM(*keywordName)) {
    612       *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
    613       return 0;
    614     }
    615     if (keywordNameLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
    616       buf[keywordNameLen++] = uprv_tolower(*keywordName);
    617     } else {
    618       /* keyword name too long for internal buffer */
    619       *status = U_INTERNAL_PROGRAM_ERROR;
    620       return 0;
    621     }
    622   }
    623   if (keywordNameLen == 0) {
    624     *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
    625     return 0;
    626   }
    627   buf[keywordNameLen] = 0; /* terminate */
    628 
    629   return keywordNameLen;
    630 }
    631 
    632 typedef struct {
    633     char keyword[ULOC_KEYWORD_BUFFER_LEN];
    634     int32_t keywordLen;
    635     const char *valueStart;
    636     int32_t valueLen;
    637 } KeywordStruct;
    638 
    639 static int32_t U_CALLCONV
    640 compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
    641     const char* leftString = ((const KeywordStruct *)left)->keyword;
    642     const char* rightString = ((const KeywordStruct *)right)->keyword;
    643     return uprv_strcmp(leftString, rightString);
    644 }
    645 
    646 /**
    647  * Both addKeyword and addValue must already be in canonical form.
    648  * Either both addKeyword and addValue are NULL, or neither is NULL.
    649  * If they are not NULL they must be zero terminated.
    650  * If addKeyword is not NULL is must have length small enough to fit in KeywordStruct.keyword.
    651  */
    652 static int32_t
    653 _getKeywords(const char *localeID,
    654              char prev,
    655              char *keywords, int32_t keywordCapacity,
    656              char *values, int32_t valuesCapacity, int32_t *valLen,
    657              UBool valuesToo,
    658              const char* addKeyword,
    659              const char* addValue,
    660              UErrorCode *status)
    661 {
    662     KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
    663 
    664     int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
    665     int32_t numKeywords = 0;
    666     const char* pos = localeID;
    667     const char* equalSign = NULL;
    668     const char* semicolon = NULL;
    669     int32_t i = 0, j, n;
    670     int32_t keywordsLen = 0;
    671     int32_t valuesLen = 0;
    672 
    673     if(prev == '@') { /* start of keyword definition */
    674         /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
    675         do {
    676             UBool duplicate = FALSE;
    677             /* skip leading spaces */
    678             while(*pos == ' ') {
    679                 pos++;
    680             }
    681             if (!*pos) { /* handle trailing "; " */
    682                 break;
    683             }
    684             if(numKeywords == maxKeywords) {
    685                 *status = U_INTERNAL_PROGRAM_ERROR;
    686                 return 0;
    687             }
    688             equalSign = uprv_strchr(pos, '=');
    689             semicolon = uprv_strchr(pos, ';');
    690             /* lack of '=' [foo@currency] is illegal */
    691             /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
    692             if(!equalSign || (semicolon && semicolon<equalSign)) {
    693                 *status = U_INVALID_FORMAT_ERROR;
    694                 return 0;
    695             }
    696             /* need to normalize both keyword and keyword name */
    697             if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
    698                 /* keyword name too long for internal buffer */
    699                 *status = U_INTERNAL_PROGRAM_ERROR;
    700                 return 0;
    701             }
    702             for(i = 0, n = 0; i < equalSign - pos; ++i) {
    703                 if (pos[i] != ' ') {
    704                     keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
    705                 }
    706             }
    707 
    708             /* zero-length keyword is an error. */
    709             if (n == 0) {
    710                 *status = U_INVALID_FORMAT_ERROR;
    711                 return 0;
    712             }
    713 
    714             keywordList[numKeywords].keyword[n] = 0;
    715             keywordList[numKeywords].keywordLen = n;
    716             /* now grab the value part. First we skip the '=' */
    717             equalSign++;
    718             /* then we leading spaces */
    719             while(*equalSign == ' ') {
    720                 equalSign++;
    721             }
    722 
    723             /* Premature end or zero-length value */
    724             if (!*equalSign || equalSign == semicolon) {
    725                 *status = U_INVALID_FORMAT_ERROR;
    726                 return 0;
    727             }
    728 
    729             keywordList[numKeywords].valueStart = equalSign;
    730 
    731             pos = semicolon;
    732             i = 0;
    733             if(pos) {
    734                 while(*(pos - i - 1) == ' ') {
    735                     i++;
    736                 }
    737                 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
    738                 pos++;
    739             } else {
    740                 i = (int32_t)uprv_strlen(equalSign);
    741                 while(i && equalSign[i-1] == ' ') {
    742                     i--;
    743                 }
    744                 keywordList[numKeywords].valueLen = i;
    745             }
    746             /* If this is a duplicate keyword, then ignore it */
    747             for (j=0; j<numKeywords; ++j) {
    748                 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
    749                     duplicate = TRUE;
    750                     break;
    751                 }
    752             }
    753             if (!duplicate) {
    754                 ++numKeywords;
    755             }
    756         } while(pos);
    757 
    758         /* Handle addKeyword/addValue. */
    759         if (addKeyword != NULL) {
    760             UBool duplicate = FALSE;
    761             U_ASSERT(addValue != NULL);
    762             /* Search for duplicate; if found, do nothing. Explicit keyword
    763                overrides addKeyword. */
    764             for (j=0; j<numKeywords; ++j) {
    765                 if (uprv_strcmp(keywordList[j].keyword, addKeyword) == 0) {
    766                     duplicate = TRUE;
    767                     break;
    768                 }
    769             }
    770             if (!duplicate) {
    771                 if (numKeywords == maxKeywords) {
    772                     *status = U_INTERNAL_PROGRAM_ERROR;
    773                     return 0;
    774                 }
    775                 uprv_strcpy(keywordList[numKeywords].keyword, addKeyword);
    776                 keywordList[numKeywords].keywordLen = (int32_t)uprv_strlen(addKeyword);
    777                 keywordList[numKeywords].valueStart = addValue;
    778                 keywordList[numKeywords].valueLen = (int32_t)uprv_strlen(addValue);
    779                 ++numKeywords;
    780             }
    781         } else {
    782             U_ASSERT(addValue == NULL);
    783         }
    784 
    785         /* now we have a list of keywords */
    786         /* we need to sort it */
    787         uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
    788 
    789         /* Now construct the keyword part */
    790         for(i = 0; i < numKeywords; i++) {
    791             if(keywordsLen + keywordList[i].keywordLen + 1< keywordCapacity) {
    792                 uprv_strcpy(keywords+keywordsLen, keywordList[i].keyword);
    793                 if(valuesToo) {
    794                     keywords[keywordsLen + keywordList[i].keywordLen] = '=';
    795                 } else {
    796                     keywords[keywordsLen + keywordList[i].keywordLen] = 0;
    797                 }
    798             }
    799             keywordsLen += keywordList[i].keywordLen + 1;
    800             if(valuesToo) {
    801                 if(keywordsLen + keywordList[i].valueLen < keywordCapacity) {
    802                     uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
    803                 }
    804                 keywordsLen += keywordList[i].valueLen;
    805 
    806                 if(i < numKeywords - 1) {
    807                     if(keywordsLen < keywordCapacity) {
    808                         keywords[keywordsLen] = ';';
    809                     }
    810                     keywordsLen++;
    811                 }
    812             }
    813             if(values) {
    814                 if(valuesLen + keywordList[i].valueLen + 1< valuesCapacity) {
    815                     uprv_strcpy(values+valuesLen, keywordList[i].valueStart);
    816                     values[valuesLen + keywordList[i].valueLen] = 0;
    817                 }
    818                 valuesLen += keywordList[i].valueLen + 1;
    819             }
    820         }
    821         if(values) {
    822             values[valuesLen] = 0;
    823             if(valLen) {
    824                 *valLen = valuesLen;
    825             }
    826         }
    827         return u_terminateChars(keywords, keywordCapacity, keywordsLen, status);
    828     } else {
    829         return 0;
    830     }
    831 }
    832 
    833 U_CFUNC int32_t
    834 locale_getKeywords(const char *localeID,
    835                    char prev,
    836                    char *keywords, int32_t keywordCapacity,
    837                    char *values, int32_t valuesCapacity, int32_t *valLen,
    838                    UBool valuesToo,
    839                    UErrorCode *status) {
    840     return _getKeywords(localeID, prev, keywords, keywordCapacity,
    841                         values, valuesCapacity, valLen, valuesToo,
    842                         NULL, NULL, status);
    843 }
    844 
    845 U_CAPI int32_t U_EXPORT2
    846 uloc_getKeywordValue(const char* localeID,
    847                      const char* keywordName,
    848                      char* buffer, int32_t bufferCapacity,
    849                      UErrorCode* status)
    850 {
    851     const char* startSearchHere = NULL;
    852     const char* nextSeparator = NULL;
    853     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
    854     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
    855     int32_t result = 0;
    856 
    857     if(status && U_SUCCESS(*status) && localeID) {
    858       char tempBuffer[ULOC_FULLNAME_CAPACITY];
    859       const char* tmpLocaleID;
    860 
    861       if (keywordName == NULL || keywordName[0] == 0) {
    862         *status = U_ILLEGAL_ARGUMENT_ERROR;
    863         return 0;
    864       }
    865 
    866       locale_canonKeywordName(keywordNameBuffer, keywordName, status);
    867       if(U_FAILURE(*status)) {
    868         return 0;
    869       }
    870 
    871       if (_hasBCP47Extension(localeID)) {
    872           _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
    873       } else {
    874           tmpLocaleID=localeID;
    875       }
    876 
    877       startSearchHere = locale_getKeywordsStart(tmpLocaleID);
    878       if(startSearchHere == NULL) {
    879           /* no keywords, return at once */
    880           return 0;
    881       }
    882 
    883       /* find the first keyword */
    884       while(startSearchHere) {
    885           const char* keyValueTail;
    886           int32_t keyValueLen;
    887 
    888           startSearchHere++; /* skip @ or ; */
    889           nextSeparator = uprv_strchr(startSearchHere, '=');
    890           if(!nextSeparator) {
    891               *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
    892               return 0;
    893           }
    894           /* strip leading & trailing spaces (TC decided to tolerate these) */
    895           while(*startSearchHere == ' ') {
    896               startSearchHere++;
    897           }
    898           keyValueTail = nextSeparator;
    899           while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') {
    900               keyValueTail--;
    901           }
    902           /* now keyValueTail points to first char after the keyName */
    903           /* copy & normalize keyName from locale */
    904           if (startSearchHere == keyValueTail) {
    905               *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
    906               return 0;
    907           }
    908           keyValueLen = 0;
    909           while (startSearchHere < keyValueTail) {
    910             if (!UPRV_ISALPHANUM(*startSearchHere)) {
    911               *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
    912               return 0;
    913             }
    914             if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
    915               localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*startSearchHere++);
    916             } else {
    917               /* keyword name too long for internal buffer */
    918               *status = U_INTERNAL_PROGRAM_ERROR;
    919               return 0;
    920             }
    921           }
    922           localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
    923 
    924           startSearchHere = uprv_strchr(nextSeparator, ';');
    925 
    926           if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
    927                /* current entry matches the keyword. */
    928              nextSeparator++; /* skip '=' */
    929               /* First strip leading & trailing spaces (TC decided to tolerate these) */
    930               while(*nextSeparator == ' ') {
    931                 nextSeparator++;
    932               }
    933               keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
    934               while(keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') {
    935                 keyValueTail--;
    936               }
    937               /* Now copy the value, but check well-formedness */
    938               if (nextSeparator == keyValueTail) {
    939                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
    940                 return 0;
    941               }
    942               keyValueLen = 0;
    943               while (nextSeparator < keyValueTail) {
    944                 if (!UPRV_ISALPHANUM(*nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator)) {
    945                   *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
    946                   return 0;
    947                 }
    948                 if (keyValueLen < bufferCapacity) {
    949                   /* Should we lowercase value to return here? Tests expect as-is. */
    950                   buffer[keyValueLen++] = *nextSeparator++;
    951                 } else { /* keep advancing so we return correct length in case of overflow */
    952                   keyValueLen++;
    953                   nextSeparator++;
    954                 }
    955               }
    956               result = u_terminateChars(buffer, bufferCapacity, keyValueLen, status);
    957               return result;
    958           }
    959       }
    960     }
    961     return 0;
    962 }
    963 
    964 U_CAPI int32_t U_EXPORT2
    965 uloc_setKeywordValue(const char* keywordName,
    966                      const char* keywordValue,
    967                      char* buffer, int32_t bufferCapacity,
    968                      UErrorCode* status)
    969 {
    970     /* TODO: sorting. removal. */
    971     int32_t keywordNameLen;
    972     int32_t keywordValueLen;
    973     int32_t bufLen;
    974     int32_t needLen = 0;
    975     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
    976     char keywordValueBuffer[ULOC_KEYWORDS_CAPACITY+1];
    977     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
    978     int32_t rc;
    979     char* nextSeparator = NULL;
    980     char* nextEqualsign = NULL;
    981     char* startSearchHere = NULL;
    982     char* keywordStart = NULL;
    983     CharString updatedKeysAndValues;
    984     int32_t updatedKeysAndValuesLen;
    985     UBool handledInputKeyAndValue = FALSE;
    986     char keyValuePrefix = '@';
    987 
    988     if(U_FAILURE(*status)) {
    989         return -1;
    990     }
    991     if (keywordName == NULL || keywordName[0] == 0 || bufferCapacity <= 1) {
    992         *status = U_ILLEGAL_ARGUMENT_ERROR;
    993         return 0;
    994     }
    995     bufLen = (int32_t)uprv_strlen(buffer);
    996     if(bufferCapacity<bufLen) {
    997         /* The capacity is less than the length?! Is this NULL terminated? */
    998         *status = U_ILLEGAL_ARGUMENT_ERROR;
    999         return 0;
   1000     }
   1001     keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
   1002     if(U_FAILURE(*status)) {
   1003         return 0;
   1004     }
   1005 
   1006     keywordValueLen = 0;
   1007     if(keywordValue) {
   1008         while (*keywordValue != 0) {
   1009             if (!UPRV_ISALPHANUM(*keywordValue) && !UPRV_OK_VALUE_PUNCTUATION(*keywordValue)) {
   1010                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
   1011                 return 0;
   1012             }
   1013             if (keywordValueLen < ULOC_KEYWORDS_CAPACITY) {
   1014                 /* Should we force lowercase in value to set? */
   1015                 keywordValueBuffer[keywordValueLen++] = *keywordValue++;
   1016             } else {
   1017                 /* keywordValue too long for internal buffer */
   1018                 *status = U_INTERNAL_PROGRAM_ERROR;
   1019                 return 0;
   1020             }
   1021         }
   1022     }
   1023     keywordValueBuffer[keywordValueLen] = 0; /* terminate */
   1024 
   1025     startSearchHere = (char*)locale_getKeywordsStart(buffer);
   1026     if(startSearchHere == NULL || (startSearchHere[1]==0)) {
   1027         if(keywordValueLen == 0) { /* no keywords = nothing to remove */
   1028             return bufLen;
   1029         }
   1030 
   1031         needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
   1032         if(startSearchHere) { /* had a single @ */
   1033             needLen--; /* already had the @ */
   1034             /* startSearchHere points at the @ */
   1035         } else {
   1036             startSearchHere=buffer+bufLen;
   1037         }
   1038         if(needLen >= bufferCapacity) {
   1039             *status = U_BUFFER_OVERFLOW_ERROR;
   1040             return needLen; /* no change */
   1041         }
   1042         *startSearchHere++ = '@';
   1043         uprv_strcpy(startSearchHere, keywordNameBuffer);
   1044         startSearchHere += keywordNameLen;
   1045         *startSearchHere++ = '=';
   1046         uprv_strcpy(startSearchHere, keywordValueBuffer);
   1047         return needLen;
   1048     } /* end shortcut - no @ */
   1049 
   1050     keywordStart = startSearchHere;
   1051     /* search for keyword */
   1052     while(keywordStart) {
   1053         const char* keyValueTail;
   1054         int32_t keyValueLen;
   1055 
   1056         keywordStart++; /* skip @ or ; */
   1057         nextEqualsign = uprv_strchr(keywordStart, '=');
   1058         if (!nextEqualsign) {
   1059             *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
   1060             return 0;
   1061         }
   1062         /* strip leading & trailing spaces (TC decided to tolerate these) */
   1063         while(*keywordStart == ' ') {
   1064             keywordStart++;
   1065         }
   1066         keyValueTail = nextEqualsign;
   1067         while (keyValueTail > keywordStart && *(keyValueTail-1) == ' ') {
   1068             keyValueTail--;
   1069         }
   1070         /* now keyValueTail points to first char after the keyName */
   1071         /* copy & normalize keyName from locale */
   1072         if (keywordStart == keyValueTail) {
   1073             *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
   1074             return 0;
   1075         }
   1076         keyValueLen = 0;
   1077         while (keywordStart < keyValueTail) {
   1078             if (!UPRV_ISALPHANUM(*keywordStart)) {
   1079                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
   1080                 return 0;
   1081             }
   1082             if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
   1083                 localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*keywordStart++);
   1084             } else {
   1085                 /* keyword name too long for internal buffer */
   1086                 *status = U_INTERNAL_PROGRAM_ERROR;
   1087                 return 0;
   1088             }
   1089         }
   1090         localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
   1091 
   1092         nextSeparator = uprv_strchr(nextEqualsign, ';');
   1093 
   1094         /* start processing the value part */
   1095         nextEqualsign++; /* skip '=' */
   1096         /* First strip leading & trailing spaces (TC decided to tolerate these) */
   1097         while(*nextEqualsign == ' ') {
   1098             nextEqualsign++;
   1099         }
   1100         keyValueTail = (nextSeparator)? nextSeparator: nextEqualsign + uprv_strlen(nextEqualsign);
   1101         while(keyValueTail > nextEqualsign && *(keyValueTail-1) == ' ') {
   1102             keyValueTail--;
   1103         }
   1104         if (nextEqualsign == keyValueTail) {
   1105             *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
   1106             return 0;
   1107         }
   1108 
   1109         rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
   1110         if(rc == 0) {
   1111             /* Current entry matches the input keyword. Update the entry */
   1112             if(keywordValueLen > 0) { /* updating a value */
   1113                 updatedKeysAndValues.append(keyValuePrefix, *status);
   1114                 keyValuePrefix = ';'; /* for any subsequent key-value pair */
   1115                 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
   1116                 updatedKeysAndValues.append('=', *status);
   1117                 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
   1118             } /* else removing this entry, don't emit anything */
   1119             handledInputKeyAndValue = TRUE;
   1120         } else {
   1121            /* input keyword sorts earlier than current entry, add before current entry */
   1122             if (rc < 0 && keywordValueLen > 0 && !handledInputKeyAndValue) {
   1123                 /* insert new entry at this location */
   1124                 updatedKeysAndValues.append(keyValuePrefix, *status);
   1125                 keyValuePrefix = ';'; /* for any subsequent key-value pair */
   1126                 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
   1127                 updatedKeysAndValues.append('=', *status);
   1128                 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
   1129                 handledInputKeyAndValue = TRUE;
   1130             }
   1131             /* copy the current entry */
   1132             updatedKeysAndValues.append(keyValuePrefix, *status);
   1133             keyValuePrefix = ';'; /* for any subsequent key-value pair */
   1134             updatedKeysAndValues.append(localeKeywordNameBuffer, keyValueLen, *status);
   1135             updatedKeysAndValues.append('=', *status);
   1136             updatedKeysAndValues.append(nextEqualsign, keyValueTail-nextEqualsign, *status);
   1137         }
   1138         if (!nextSeparator && keywordValueLen > 0 && !handledInputKeyAndValue) {
   1139             /* append new entry at the end, it sorts later than existing entries */
   1140             updatedKeysAndValues.append(keyValuePrefix, *status);
   1141             /* skip keyValuePrefix update, no subsequent key-value pair */
   1142             updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
   1143             updatedKeysAndValues.append('=', *status);
   1144             updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
   1145             handledInputKeyAndValue = TRUE;
   1146         }
   1147         keywordStart = nextSeparator;
   1148     } /* end loop searching */
   1149 
   1150     /* Any error from updatedKeysAndValues.append above would be internal and not due to
   1151      * problems with the passed-in locale. So if we did encounter problems with the
   1152      * passed-in locale above, those errors took precedence and overrode any error
   1153      * status from updatedKeysAndValues.append, and also caused a return of 0. If there
   1154      * are errors here they are from updatedKeysAndValues.append; they do cause an
   1155      * error return but the passed-in locale is unmodified and the original bufLen is
   1156      * returned.
   1157      */
   1158     if (!handledInputKeyAndValue || U_FAILURE(*status)) {
   1159         /* if input key/value specified removal of a keyword not present in locale, or
   1160          * there was an error in CharString.append, leave original locale alone. */
   1161         return bufLen;
   1162     }
   1163 
   1164     updatedKeysAndValuesLen = updatedKeysAndValues.length();
   1165     /* needLen = length of the part before '@' + length of updated key-value part including '@' */
   1166     needLen = (int32_t)(startSearchHere - buffer) + updatedKeysAndValuesLen;
   1167     if(needLen >= bufferCapacity) {
   1168         *status = U_BUFFER_OVERFLOW_ERROR;
   1169         return needLen; /* no change */
   1170     }
   1171     if (updatedKeysAndValuesLen > 0) {
   1172         uprv_strncpy(startSearchHere, updatedKeysAndValues.data(), updatedKeysAndValuesLen);
   1173     }
   1174     buffer[needLen]=0;
   1175     return needLen;
   1176 }
   1177 
   1178 /* ### ID parsing implementation **************************************************/
   1179 
   1180 #define _isPrefixLetter(a) ((a=='x')||(a=='X')||(a=='i')||(a=='I'))
   1181 
   1182 /*returns TRUE if one of the special prefixes is here (s=string)
   1183   'x-' or 'i-' */
   1184 #define _isIDPrefix(s) (_isPrefixLetter(s[0])&&_isIDSeparator(s[1]))
   1185 
   1186 /* Dot terminates it because of POSIX form  where dot precedes the codepage
   1187  * except for variant
   1188  */
   1189 #define _isTerminator(a)  ((a==0)||(a=='.')||(a=='@'))
   1190 
   1191 static char* _strnchr(const char* str, int32_t len, char c) {
   1192     U_ASSERT(str != 0 && len >= 0);
   1193     while (len-- != 0) {
   1194         char d = *str;
   1195         if (d == c) {
   1196             return (char*) str;
   1197         } else if (d == 0) {
   1198             break;
   1199         }
   1200         ++str;
   1201     }
   1202     return NULL;
   1203 }
   1204 
   1205 /**
   1206  * Lookup 'key' in the array 'list'.  The array 'list' should contain
   1207  * a NULL entry, followed by more entries, and a second NULL entry.
   1208  *
   1209  * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
   1210  * COUNTRIES_3.
   1211  */
   1212 static int16_t _findIndex(const char* const* list, const char* key)
   1213 {
   1214     const char* const* anchor = list;
   1215     int32_t pass = 0;
   1216 
   1217     /* Make two passes through two NULL-terminated arrays at 'list' */
   1218     while (pass++ < 2) {
   1219         while (*list) {
   1220             if (uprv_strcmp(key, *list) == 0) {
   1221                 return (int16_t)(list - anchor);
   1222             }
   1223             list++;
   1224         }
   1225         ++list;     /* skip final NULL *CWB*/
   1226     }
   1227     return -1;
   1228 }
   1229 
   1230 /* count the length of src while copying it to dest; return strlen(src) */
   1231 static inline int32_t
   1232 _copyCount(char *dest, int32_t destCapacity, const char *src) {
   1233     const char *anchor;
   1234     char c;
   1235 
   1236     anchor=src;
   1237     for(;;) {
   1238         if((c=*src)==0) {
   1239             return (int32_t)(src-anchor);
   1240         }
   1241         if(destCapacity<=0) {
   1242             return (int32_t)((src-anchor)+uprv_strlen(src));
   1243         }
   1244         ++src;
   1245         *dest++=c;
   1246         --destCapacity;
   1247     }
   1248 }
   1249 
   1250 U_CFUNC const char*
   1251 uloc_getCurrentCountryID(const char* oldID){
   1252     int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
   1253     if (offset >= 0) {
   1254         return REPLACEMENT_COUNTRIES[offset];
   1255     }
   1256     return oldID;
   1257 }
   1258 U_CFUNC const char*
   1259 uloc_getCurrentLanguageID(const char* oldID){
   1260     int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
   1261     if (offset >= 0) {
   1262         return REPLACEMENT_LANGUAGES[offset];
   1263     }
   1264     return oldID;
   1265 }
   1266 /*
   1267  * the internal functions _getLanguage(), _getCountry(), _getVariant()
   1268  * avoid duplicating code to handle the earlier locale ID pieces
   1269  * in the functions for the later ones by
   1270  * setting the *pEnd pointer to where they stopped parsing
   1271  *
   1272  * TODO try to use this in Locale
   1273  */
   1274 U_CFUNC int32_t
   1275 ulocimp_getLanguage(const char *localeID,
   1276                     char *language, int32_t languageCapacity,
   1277                     const char **pEnd) {
   1278     int32_t i=0;
   1279     int32_t offset;
   1280     char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
   1281 
   1282     /* if it starts with i- or x- then copy that prefix */
   1283     if(_isIDPrefix(localeID)) {
   1284         if(i<languageCapacity) {
   1285             language[i]=(char)uprv_tolower(*localeID);
   1286         }
   1287         if(i<languageCapacity) {
   1288             language[i+1]='-';
   1289         }
   1290         i+=2;
   1291         localeID+=2;
   1292     }
   1293 
   1294     /* copy the language as far as possible and count its length */
   1295     while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
   1296         if(i<languageCapacity) {
   1297             language[i]=(char)uprv_tolower(*localeID);
   1298         }
   1299         if(i<3) {
   1300             U_ASSERT(i>=0);
   1301             lang[i]=(char)uprv_tolower(*localeID);
   1302         }
   1303         i++;
   1304         localeID++;
   1305     }
   1306 
   1307     if(i==3) {
   1308         /* convert 3 character code to 2 character code if possible *CWB*/
   1309         offset=_findIndex(LANGUAGES_3, lang);
   1310         if(offset>=0) {
   1311             i=_copyCount(language, languageCapacity, LANGUAGES[offset]);
   1312         }
   1313     }
   1314 
   1315     if(pEnd!=NULL) {
   1316         *pEnd=localeID;
   1317     }
   1318     return i;
   1319 }
   1320 
   1321 U_CFUNC int32_t
   1322 ulocimp_getScript(const char *localeID,
   1323                   char *script, int32_t scriptCapacity,
   1324                   const char **pEnd)
   1325 {
   1326     int32_t idLen = 0;
   1327 
   1328     if (pEnd != NULL) {
   1329         *pEnd = localeID;
   1330     }
   1331 
   1332     /* copy the second item as far as possible and count its length */
   1333     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
   1334             && uprv_isASCIILetter(localeID[idLen])) {
   1335         idLen++;
   1336     }
   1337 
   1338     /* If it's exactly 4 characters long, then it's a script and not a country. */
   1339     if (idLen == 4) {
   1340         int32_t i;
   1341         if (pEnd != NULL) {
   1342             *pEnd = localeID+idLen;
   1343         }
   1344         if(idLen > scriptCapacity) {
   1345             idLen = scriptCapacity;
   1346         }
   1347         if (idLen >= 1) {
   1348             script[0]=(char)uprv_toupper(*(localeID++));
   1349         }
   1350         for (i = 1; i < idLen; i++) {
   1351             script[i]=(char)uprv_tolower(*(localeID++));
   1352         }
   1353     }
   1354     else {
   1355         idLen = 0;
   1356     }
   1357     return idLen;
   1358 }
   1359 
   1360 U_CFUNC int32_t
   1361 ulocimp_getCountry(const char *localeID,
   1362                    char *country, int32_t countryCapacity,
   1363                    const char **pEnd)
   1364 {
   1365     int32_t idLen=0;
   1366     char cnty[ULOC_COUNTRY_CAPACITY]={ 0, 0, 0, 0 };
   1367     int32_t offset;
   1368 
   1369     /* copy the country as far as possible and count its length */
   1370     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
   1371         if(idLen<(ULOC_COUNTRY_CAPACITY-1)) {   /*CWB*/
   1372             cnty[idLen]=(char)uprv_toupper(localeID[idLen]);
   1373         }
   1374         idLen++;
   1375     }
   1376 
   1377     /* the country should be either length 2 or 3 */
   1378     if (idLen == 2 || idLen == 3) {
   1379         UBool gotCountry = FALSE;
   1380         /* convert 3 character code to 2 character code if possible *CWB*/
   1381         if(idLen==3) {
   1382             offset=_findIndex(COUNTRIES_3, cnty);
   1383             if(offset>=0) {
   1384                 idLen=_copyCount(country, countryCapacity, COUNTRIES[offset]);
   1385                 gotCountry = TRUE;
   1386             }
   1387         }
   1388         if (!gotCountry) {
   1389             int32_t i = 0;
   1390             for (i = 0; i < idLen; i++) {
   1391                 if (i < countryCapacity) {
   1392                     country[i]=(char)uprv_toupper(localeID[i]);
   1393                 }
   1394             }
   1395         }
   1396         localeID+=idLen;
   1397     } else {
   1398         idLen = 0;
   1399     }
   1400 
   1401     if(pEnd!=NULL) {
   1402         *pEnd=localeID;
   1403     }
   1404 
   1405     return idLen;
   1406 }
   1407 
   1408 /**
   1409  * @param needSeparator if true, then add leading '_' if any variants
   1410  * are added to 'variant'
   1411  */
   1412 static int32_t
   1413 _getVariantEx(const char *localeID,
   1414               char prev,
   1415               char *variant, int32_t variantCapacity,
   1416               UBool needSeparator) {
   1417     int32_t i=0;
   1418 
   1419     /* get one or more variant tags and separate them with '_' */
   1420     if(_isIDSeparator(prev)) {
   1421         /* get a variant string after a '-' or '_' */
   1422         while(!_isTerminator(*localeID)) {
   1423             if (needSeparator) {
   1424                 if (i<variantCapacity) {
   1425                     variant[i] = '_';
   1426                 }
   1427                 ++i;
   1428                 needSeparator = FALSE;
   1429             }
   1430             if(i<variantCapacity) {
   1431                 variant[i]=(char)uprv_toupper(*localeID);
   1432                 if(variant[i]=='-') {
   1433                     variant[i]='_';
   1434                 }
   1435             }
   1436             i++;
   1437             localeID++;
   1438         }
   1439     }
   1440 
   1441     /* if there is no variant tag after a '-' or '_' then look for '@' */
   1442     if(i==0) {
   1443         if(prev=='@') {
   1444             /* keep localeID */
   1445         } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
   1446             ++localeID; /* point after the '@' */
   1447         } else {
   1448             return 0;
   1449         }
   1450         while(!_isTerminator(*localeID)) {
   1451             if (needSeparator) {
   1452                 if (i<variantCapacity) {
   1453                     variant[i] = '_';
   1454                 }
   1455                 ++i;
   1456                 needSeparator = FALSE;
   1457             }
   1458             if(i<variantCapacity) {
   1459                 variant[i]=(char)uprv_toupper(*localeID);
   1460                 if(variant[i]=='-' || variant[i]==',') {
   1461                     variant[i]='_';
   1462                 }
   1463             }
   1464             i++;
   1465             localeID++;
   1466         }
   1467     }
   1468 
   1469     return i;
   1470 }
   1471 
   1472 static int32_t
   1473 _getVariant(const char *localeID,
   1474             char prev,
   1475             char *variant, int32_t variantCapacity) {
   1476     return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
   1477 }
   1478 
   1479 /**
   1480  * Delete ALL instances of a variant from the given list of one or
   1481  * more variants.  Example: "FOO_EURO_BAR_EURO" => "FOO_BAR".
   1482  * @param variants the source string of one or more variants,
   1483  * separated by '_'.  This will be MODIFIED IN PLACE.  Not zero
   1484  * terminated; if it is, trailing zero will NOT be maintained.
   1485  * @param variantsLen length of variants
   1486  * @param toDelete variant to delete, without separators, e.g.  "EURO"
   1487  * or "PREEURO"; not zero terminated
   1488  * @param toDeleteLen length of toDelete
   1489  * @return number of characters deleted from variants
   1490  */
   1491 static int32_t
   1492 _deleteVariant(char* variants, int32_t variantsLen,
   1493                const char* toDelete, int32_t toDeleteLen)
   1494 {
   1495     int32_t delta = 0; /* number of chars deleted */
   1496     for (;;) {
   1497         UBool flag = FALSE;
   1498         if (variantsLen < toDeleteLen) {
   1499             return delta;
   1500         }
   1501         if (uprv_strncmp(variants, toDelete, toDeleteLen) == 0 &&
   1502             (variantsLen == toDeleteLen ||
   1503              (flag=(variants[toDeleteLen] == '_'))))
   1504         {
   1505             int32_t d = toDeleteLen + (flag?1:0);
   1506             variantsLen -= d;
   1507             delta += d;
   1508             if (variantsLen > 0) {
   1509                 uprv_memmove(variants, variants+d, variantsLen);
   1510             }
   1511         } else {
   1512             char* p = _strnchr(variants, variantsLen, '_');
   1513             if (p == NULL) {
   1514                 return delta;
   1515             }
   1516             ++p;
   1517             variantsLen -= (int32_t)(p - variants);
   1518             variants = p;
   1519         }
   1520     }
   1521 }
   1522 
   1523 /* Keyword enumeration */
   1524 
        /*
         * Iteration state stored in the 'context' of a keyword UEnumeration.
         * The keyword list is a sequence of NUL-terminated strings that ends
         * with an empty string (see uloc_kw_countKeywords/uloc_kw_nextKeyword).
         */
   1525 typedef struct UKeywordsContext {
   1526     char* keywords;  /* start of the list; allocated in uloc_openKeywordList, freed in uloc_kw_closeKeywords */
   1527     char* current;   /* next keyword to hand out; set back to 'keywords' on reset */
   1528 } UKeywordsContext;
   1529 
   1530 U_CDECL_BEGIN
   1531 
   1532 static void U_CALLCONV
   1533 uloc_kw_closeKeywords(UEnumeration *enumerator) {
   1534     uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
   1535     uprv_free(enumerator->context);
   1536     uprv_free(enumerator);
   1537 }
   1538 
   1539 static int32_t U_CALLCONV
   1540 uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
   1541     char *kw = ((UKeywordsContext *)en->context)->keywords;
   1542     int32_t result = 0;
   1543     while(*kw) {
   1544         result++;
   1545         kw += uprv_strlen(kw)+1;
   1546     }
   1547     return result;
   1548 }
   1549 
   1550 static const char * U_CALLCONV
   1551 uloc_kw_nextKeyword(UEnumeration* en,
   1552                     int32_t* resultLength,
   1553                     UErrorCode* /*status*/) {
   1554     const char* result = ((UKeywordsContext *)en->context)->current;
   1555     int32_t len = 0;
   1556     if(*result) {
   1557         len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
   1558         ((UKeywordsContext *)en->context)->current += len+1;
   1559     } else {
   1560         result = NULL;
   1561     }
   1562     if (resultLength) {
   1563         *resultLength = len;
   1564     }
   1565     return result;
   1566 }
   1567 
   1568 static void U_CALLCONV
   1569 uloc_kw_resetKeywords(UEnumeration* en,
   1570                       UErrorCode* /*status*/) {
   1571     ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
   1572 }
   1573 
   1574 U_CDECL_END
   1575 
   1576 
   1577 static const UEnumeration gKeywordsEnum = {   /* template copied into each new enumeration by uloc_openKeywordList */
   1578     NULL,
   1579     NULL,                     /* NOTE(review): presumably the 'context' slot, which uloc_openKeywordList fills in per instance -- confirm against uenumimp.h */
   1580     uloc_kw_closeKeywords,
   1581     uloc_kw_countKeywords,
   1582     uenum_unextDefault,
   1583     uloc_kw_nextKeyword,
   1584     uloc_kw_resetKeywords
   1585 };
   1586 
   1587 U_CAPI UEnumeration* U_EXPORT2
   1588 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
   1589 {
   1590     UKeywordsContext *myContext = NULL;
   1591     UEnumeration *result = NULL;
   1592 
   1593     if(U_FAILURE(*status)) {
   1594         return NULL;
   1595     }
   1596     result = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
   1597     /* Null pointer test */
   1598     if (result == NULL) {
   1599         *status = U_MEMORY_ALLOCATION_ERROR;
   1600         return NULL;
   1601     }
   1602     uprv_memcpy(result, &gKeywordsEnum, sizeof(UEnumeration));
   1603     myContext = static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext)));
   1604     if (myContext == NULL) {
   1605         *status = U_MEMORY_ALLOCATION_ERROR;
   1606         uprv_free(result);
   1607         return NULL;
   1608     }
   1609     myContext->keywords = (char *)uprv_malloc(keywordListSize+1);
   1610     uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
   1611     myContext->keywords[keywordListSize] = 0;
   1612     myContext->current = myContext->keywords;
   1613     result->context = myContext;
   1614     return result;
   1615 }
   1616 
   1617 U_CAPI UEnumeration* U_EXPORT2
   1618 uloc_openKeywords(const char* localeID,
   1619                         UErrorCode* status)
   1620 {
   1621     int32_t i=0;
   1622     char keywords[256];
   1623     int32_t keywordsCapacity = 256;
   1624     char tempBuffer[ULOC_FULLNAME_CAPACITY];
   1625     const char* tmpLocaleID;
   1626 
   1627     if(status==NULL || U_FAILURE(*status)) {
   1628         return 0;
   1629     }
   1630 
   1631     if (_hasBCP47Extension(localeID)) {
   1632         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
   1633     } else {
   1634         if (localeID==NULL) {
   1635            localeID=uloc_getDefault();
   1636         }
   1637         tmpLocaleID=localeID;
   1638     }
   1639 
   1640     /* Skip the language */
   1641     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
   1642     if(_isIDSeparator(*tmpLocaleID)) {
   1643         const char *scriptID;
   1644         /* Skip the script if available */
   1645         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
   1646         if(scriptID != tmpLocaleID+1) {
   1647             /* Found optional script */
   1648             tmpLocaleID = scriptID;
   1649         }
   1650         /* Skip the Country */
   1651         if (_isIDSeparator(*tmpLocaleID)) {
   1652             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
   1653             if(_isIDSeparator(*tmpLocaleID)) {
   1654                 _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
   1655             }
   1656         }
   1657     }
   1658 
   1659     /* keywords are located after '@' */
   1660     if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
   1661         i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, NULL, 0, NULL, FALSE, status);
   1662     }
   1663 
   1664     if(i) {
   1665         return uloc_openKeywordList(keywords, i, status);
   1666     } else {
   1667         return NULL;
   1668     }
   1669 }
   1670 
   1671 
   1672 /* bit-flags for 'options' parameter of _canonicalize */
   1673 #define _ULOC_STRIP_KEYWORDS 0x2
   1674 #define _ULOC_CANONICALIZE   0x1
   1675 
   1676 #define OPTION_SET(options, mask) ((options & mask) != 0)
   1677 
   1678 static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};  /* the ID "i-default" as 9 chars with no terminating NUL; _canonicalize compares it via uprv_strncmp with I_DEFAULT_LENGTH */
   1679 #define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)  /* NOTE(review): presumably the element count of i_default (9) -- confirm UPRV_LENGTHOF */
   1680 
   1681 /**
   1682  * Canonicalize the given localeID, to level 1 or to level 2,
   1683  * depending on the options.  To specify level 1, pass in options=0.
   1684  * To specify level 2, pass in options=_ULOC_CANONICALIZE.
   1685  *
   1686  * This is the code underlying uloc_getName and uloc_canonicalize.
         *
         * @param localeID       ID to process; NULL means uloc_getDefault(). IDs for
         *                       which _hasBCP47Extension() is true go through
         *                       _ConvertBCP47 first.
         * @param result         output buffer; may be NULL. When it is NULL or
         *                       smaller than ULOC_FULLNAME_CAPACITY the ID is built
         *                       in a local buffer and copied out at the end.
         * @param resultCapacity capacity of result
         * @param options        bit set of _ULOC_CANONICALIZE and _ULOC_STRIP_KEYWORDS
         * @param err            must not be NULL; nothing is done if it already
         *                       holds a failure
         * @return 0 if *err was already a failure; otherwise the value returned by
         *         u_terminateChars() for the computed length, which is counted even
         *         for characters that did not fit
   1687  */
   1688 static int32_t
   1689 _canonicalize(const char* localeID,
   1690               char* result,
   1691               int32_t resultCapacity,
   1692               uint32_t options,
   1693               UErrorCode* err) {
   1694     int32_t j, len, fieldCount=0, scriptSize=0, variantSize=0, nameCapacity;
   1695     char localeBuffer[ULOC_FULLNAME_CAPACITY];
   1696     char tempBuffer[ULOC_FULLNAME_CAPACITY];
   1697     const char* origLocaleID;
   1698     const char* tmpLocaleID;
   1699     const char* keywordAssign = NULL;
   1700     const char* separatorIndicator = NULL;
   1701     const char* addKeyword = NULL;
   1702     const char* addValue = NULL;
   1703     char* name;
   1704     char* variant = NULL; /* pointer into name, or NULL */
   1705 
   1706     if (U_FAILURE(*err)) {
   1707         return 0;
   1708     }
   1709 
   1710     if (_hasBCP47Extension(localeID)) {
   1711         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
   1712     } else {
   1713         if (localeID==NULL) {
   1714            localeID=uloc_getDefault();
   1715         }
   1716         tmpLocaleID=localeID;
   1717     }
   1718 
   1719     origLocaleID=tmpLocaleID;
   1720 
   1721     /* if we are doing a full canonicalization, then put results in
   1722        localeBuffer, if necessary; otherwise send them to result. */
            /* With the option test commented out below, the local buffer is used for
               every option value whenever result is NULL or too small. */
   1723     if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
   1724         (result == NULL || resultCapacity < (int32_t)sizeof(localeBuffer))) {
   1725         name = localeBuffer;
   1726         nameCapacity = (int32_t)sizeof(localeBuffer);
   1727     } else {
   1728         name = result;
   1729         nameCapacity = resultCapacity;
   1730     }
   1731 
   1732     /* get all pieces, one after another, and separate with '_' */
            /* Throughout, len counts every character of the ID while writes into
               name are guarded by len<nameCapacity, so len may exceed nameCapacity. */
   1733     len=ulocimp_getLanguage(tmpLocaleID, name, nameCapacity, &tmpLocaleID);
   1734 
   1735     if(len == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, len) == 0) {
                /* "i-default": substitute the default locale ID */
   1736         const char *d = uloc_getDefault();
   1737 
   1738         len = (int32_t)uprv_strlen(d);
   1739 
   1740         if (name != NULL) {
                    /* NOTE(review): the copy length is strlen(d), not bounded by
                       nameCapacity -- assumes the default locale ID fits; confirm */
   1741             uprv_strncpy(name, d, len);
   1742         }
   1743     } else if(_isIDSeparator(*tmpLocaleID)) {
   1744         const char *scriptID;
   1745 
   1746         ++fieldCount;
   1747         if(len<nameCapacity) {
   1748             name[len]='_';
   1749         }
   1750         ++len;
   1751 
   1752         scriptSize=ulocimp_getScript(tmpLocaleID+1,
   1753             (len<nameCapacity ? name+len : NULL), nameCapacity-len, &scriptID);
   1754         if(scriptSize > 0) {
   1755             /* Found optional script */
   1756             tmpLocaleID = scriptID;
   1757             ++fieldCount;
   1758             len+=scriptSize;
   1759             if (_isIDSeparator(*tmpLocaleID)) {
   1760                 /* If there is something else, then we add the _ */
   1761                 if(len<nameCapacity) {
   1762                     name[len]='_';
   1763                 }
   1764                 ++len;
   1765             }
   1766         }
   1767 
   1768         if (_isIDSeparator(*tmpLocaleID)) {
   1769             const char *cntryID;
   1770             int32_t cntrySize = ulocimp_getCountry(tmpLocaleID+1,
   1771                 (len<nameCapacity ? name+len : NULL), nameCapacity-len, &cntryID);
   1772             if (cntrySize > 0) {
   1773                 /* Found optional country */
   1774                 tmpLocaleID = cntryID;
   1775                 len+=cntrySize;
   1776             }
   1777             if(_isIDSeparator(*tmpLocaleID)) {
   1778                 /* If there is something else, then we add the _  if we found country before. */
   1779                 if (cntrySize >= 0 && ! _isIDSeparator(*(tmpLocaleID+1)) ) {
   1780                     ++fieldCount;
   1781                     if(len<nameCapacity) {
   1782                         name[len]='_';
   1783                     }
   1784                     ++len;
   1785                 }
   1786 
   1787                 variantSize = _getVariant(tmpLocaleID+1, *tmpLocaleID,
   1788                     (len<nameCapacity ? name+len : NULL), nameCapacity-len);
   1789                 if (variantSize > 0) {
                            /* remember where the variant starts for _deleteVariant below */
   1790                     variant = len<nameCapacity ? name+len : NULL;
   1791                     len += variantSize;
   1792                     tmpLocaleID += variantSize + 1; /* skip '_' and variant */
   1793                 }
   1794             }
   1795         }
   1796     }
   1797 
   1798     /* Copy POSIX-style charset specifier, if any [mr.utf8] */
   1799     if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
   1800         UBool done = FALSE;
   1801         do {
   1802             char c = *tmpLocaleID;
   1803             switch (c) {
   1804             case 0:
   1805             case '@':
   1806                 done = TRUE;
   1807                 break;
   1808             default:
   1809                 if (len<nameCapacity) {
   1810                     name[len] = c;
   1811                 }
   1812                 ++len;
   1813                 ++tmpLocaleID;
   1814                 break;
   1815             }
   1816         } while (!done);
   1817     }
   1818 
   1819     /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
   1820        After this, tmpLocaleID either points to '@' or is NULL */
   1821     if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
   1822         keywordAssign = uprv_strchr(tmpLocaleID, '=');
   1823         separatorIndicator = uprv_strchr(tmpLocaleID, ';');
   1824     }
   1825 
   1826     /* Copy POSIX-style variant, if any [mr@FOO] */
            /* Level 1 only: with no '=' after '@', the '@' and the rest are copied as is. */
   1827     if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
   1828         tmpLocaleID != NULL && keywordAssign == NULL) {
   1829         for (;;) {
   1830             char c = *tmpLocaleID;
   1831             if (c == 0) {
   1832                 break;
   1833             }
   1834             if (len<nameCapacity) {
   1835                 name[len] = c;
   1836             }
   1837             ++len;
   1838             ++tmpLocaleID;
   1839         }
   1840     }
   1841 
   1842     if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
   1843         /* Handle @FOO variant if @ is present and not followed by = */
   1844         if (tmpLocaleID!=NULL && keywordAssign==NULL) {
   1845             int32_t posixVariantSize;
   1846             /* Add missing '_' if needed */
   1847             if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
   1848                 do {
   1849                     if(len<nameCapacity) {
   1850                         name[len]='_';
   1851                     }
   1852                     ++len;
   1853                     ++fieldCount;
   1854                 } while(fieldCount<2);
   1855             }
                    /* the POSIX variant is appended after any variant already present,
                       with a leading '_' only in that case */
   1856             posixVariantSize = _getVariantEx(tmpLocaleID+1, '@', name+len, nameCapacity-len,
   1857                                              (UBool)(variantSize > 0));
   1858             if (posixVariantSize > 0) {
   1859                 if (variant == NULL) {
   1860                     variant = name+len;
   1861                 }
   1862                 len += posixVariantSize;
   1863                 variantSize += posixVariantSize;
   1864             }
   1865         }
   1866 
   1867         /* Handle generic variants first */
                /* The first VARIANT_MAP variant found in the ID is deleted from it and
                   its keyword/value pair is remembered to be appended further down. */
   1868         if (variant) {
   1869             for (j=0; j<UPRV_LENGTHOF(VARIANT_MAP); j++) {
   1870                 const char* variantToCompare = VARIANT_MAP[j].variant;
   1871                 int32_t n = (int32_t)uprv_strlen(variantToCompare);
   1872                 int32_t variantLen = _deleteVariant(variant, uprv_min(variantSize, (nameCapacity-len)), variantToCompare, n);
   1873                 len -= variantLen;
   1874                 if (variantLen > 0) {
   1875                     if (len > 0 && name[len-1] == '_') { /* delete trailing '_' */
   1876                         --len;
   1877                     }
   1878                     addKeyword = VARIANT_MAP[j].keyword;
   1879                     addValue = VARIANT_MAP[j].value;
   1880                     break;
   1881                 }
   1882             }
   1883             if (len > 0 && len <= nameCapacity && name[len-1] == '_') { /* delete trailing '_' */
   1884                 --len;
   1885             }
   1886         }
   1887 
   1888         /* Look up the ID in the canonicalization map */
                /* Only an exact match of the whole name built so far is remapped. */
   1889         for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
   1890             const char* id = CANONICALIZE_MAP[j].id;
   1891             int32_t n = (int32_t)uprv_strlen(id);
   1892             if (len == n && uprv_strncmp(name, id, n) == 0) {
   1893                 if (n == 0 && tmpLocaleID != NULL) {
   1894                     break; /* Don't remap "" if keywords present */
   1895                 }
   1896                 len = _copyCount(name, nameCapacity, CANONICALIZE_MAP[j].canonicalID);
   1897                 if (CANONICALIZE_MAP[j].keyword) {
   1898                     addKeyword = CANONICALIZE_MAP[j].keyword;
   1899                     addValue = CANONICALIZE_MAP[j].value;
   1900                 }
   1901                 break;
   1902             }
   1903         }
   1904     }
   1905 
   1906     if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
                /* Keywords from the ID are emitted only if an '=' exists and no ';'
                   comes before the first '='. */
   1907         if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
   1908             (!separatorIndicator || separatorIndicator > keywordAssign)) {
   1909             if(len<nameCapacity) {
   1910                 name[len]='@';
   1911             }
   1912             ++len;
   1913             ++fieldCount;
   1914             len += _getKeywords(tmpLocaleID+1, '@', (len<nameCapacity ? name+len : NULL), nameCapacity-len,
   1915                                 NULL, 0, NULL, TRUE, addKeyword, addValue, err);
   1916         } else if (addKeyword != NULL) {
   1917             U_ASSERT(addValue != NULL && len < nameCapacity);
   1918             /* inelegant but works -- later make _getKeywords do this? */
   1919             len += _copyCount(name+len, nameCapacity-len, "@");
   1920             len += _copyCount(name+len, nameCapacity-len, addKeyword);
   1921             len += _copyCount(name+len, nameCapacity-len, "=");
   1922             len += _copyCount(name+len, nameCapacity-len, addValue);
   1923         }
   1924     }
   1925 
            /* if the ID was built in the local buffer, copy what fits into result */
   1926     if (U_SUCCESS(*err) && result != NULL && name == localeBuffer) {
   1927         uprv_strncpy(result, localeBuffer, (len > resultCapacity) ? resultCapacity : len);
   1928     }
   1929 
   1930     return u_terminateChars(result, resultCapacity, len, err);
   1931 }
   1932 
   1933 /* ### ID parsing API **************************************************/
   1934 
   1935 U_CAPI int32_t  U_EXPORT2
   1936 uloc_getParent(const char*    localeID,
   1937                char* parent,
   1938                int32_t parentCapacity,
   1939                UErrorCode* err)
   1940 {
   1941     const char *lastUnderscore;
   1942     int32_t i;
   1943 
   1944     if (U_FAILURE(*err))
   1945         return 0;
   1946 
   1947     if (localeID == NULL)
   1948         localeID = uloc_getDefault();
   1949 
   1950     lastUnderscore=uprv_strrchr(localeID, '_');
   1951     if(lastUnderscore!=NULL) {
   1952         i=(int32_t)(lastUnderscore-localeID);
   1953     } else {
   1954         i=0;
   1955     }
   1956 
   1957     if(i>0 && parent != localeID) {
   1958         uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
   1959     }
   1960     return u_terminateChars(parent, parentCapacity, i, err);
   1961 }
   1962 
   1963 U_CAPI int32_t U_EXPORT2
   1964 uloc_getLanguage(const char*    localeID,
   1965          char* language,
   1966          int32_t languageCapacity,
   1967          UErrorCode* err)
   1968 {
   1969     /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
   1970     int32_t i=0;
   1971 
   1972     if (err==NULL || U_FAILURE(*err)) {
   1973         return 0;
   1974     }
   1975 
   1976     if(localeID==NULL) {
   1977         localeID=uloc_getDefault();
   1978     }
   1979 
   1980     i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
   1981     return u_terminateChars(language, languageCapacity, i, err);
   1982 }
   1983 
   1984 U_CAPI int32_t U_EXPORT2
   1985 uloc_getScript(const char*    localeID,
   1986          char* script,
   1987          int32_t scriptCapacity,
   1988          UErrorCode* err)
   1989 {
   1990     int32_t i=0;
   1991 
   1992     if(err==NULL || U_FAILURE(*err)) {
   1993         return 0;
   1994     }
   1995 
   1996     if(localeID==NULL) {
   1997         localeID=uloc_getDefault();
   1998     }
   1999 
   2000     /* skip the language */
   2001     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
   2002     if(_isIDSeparator(*localeID)) {
   2003         i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
   2004     }
   2005     return u_terminateChars(script, scriptCapacity, i, err);
   2006 }
   2007 
   2008 U_CAPI int32_t  U_EXPORT2
   2009 uloc_getCountry(const char* localeID,
   2010             char* country,
   2011             int32_t countryCapacity,
   2012             UErrorCode* err)
   2013 {
   2014     int32_t i=0;
   2015 
   2016     if(err==NULL || U_FAILURE(*err)) {
   2017         return 0;
   2018     }
   2019 
   2020     if(localeID==NULL) {
   2021         localeID=uloc_getDefault();
   2022     }
   2023 
   2024     /* Skip the language */
   2025     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
   2026     if(_isIDSeparator(*localeID)) {
   2027         const char *scriptID;
   2028         /* Skip the script if available */
   2029         ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
   2030         if(scriptID != localeID+1) {
   2031             /* Found optional script */
   2032             localeID = scriptID;
   2033         }
   2034         if(_isIDSeparator(*localeID)) {
   2035             i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
   2036         }
   2037     }
   2038     return u_terminateChars(country, countryCapacity, i, err);
   2039 }
   2040 
   2041 U_CAPI int32_t  U_EXPORT2
   2042 uloc_getVariant(const char* localeID,
   2043                 char* variant,
   2044                 int32_t variantCapacity,
   2045                 UErrorCode* err)
   2046 {
   2047     char tempBuffer[ULOC_FULLNAME_CAPACITY];
   2048     const char* tmpLocaleID;
   2049     int32_t i=0;
   2050 
   2051     if(err==NULL || U_FAILURE(*err)) {
   2052         return 0;
   2053     }
   2054 
   2055     if (_hasBCP47Extension(localeID)) {
   2056         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
   2057     } else {
   2058         if (localeID==NULL) {
   2059            localeID=uloc_getDefault();
   2060         }
   2061         tmpLocaleID=localeID;
   2062     }
   2063 
   2064     /* Skip the language */
   2065     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
   2066     if(_isIDSeparator(*tmpLocaleID)) {
   2067         const char *scriptID;
   2068         /* Skip the script if available */
   2069         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
   2070         if(scriptID != tmpLocaleID+1) {
   2071             /* Found optional script */
   2072             tmpLocaleID = scriptID;
   2073         }
   2074         /* Skip the Country */
   2075         if (_isIDSeparator(*tmpLocaleID)) {
   2076             const char *cntryID;
   2077             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
   2078             if (cntryID != tmpLocaleID+1) {
   2079                 /* Found optional country */
   2080                 tmpLocaleID = cntryID;
   2081             }
   2082             if(_isIDSeparator(*tmpLocaleID)) {
   2083                 /* If there was no country ID, skip a possible extra IDSeparator */
   2084                 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
   2085                     tmpLocaleID++;
   2086                 }
   2087                 i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
   2088             }
   2089         }
   2090     }
   2091 
   2092     /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
   2093     /* if we do not have a variant tag yet then try a POSIX variant after '@' */
   2094 /*
   2095     if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
   2096         i=_getVariant(localeID+1, '@', variant, variantCapacity);
   2097     }
   2098 */
   2099     return u_terminateChars(variant, variantCapacity, i, err);
   2100 }
   2101 
   2102 U_CAPI int32_t  U_EXPORT2
   2103 uloc_getName(const char* localeID,
   2104              char* name,
   2105              int32_t nameCapacity,
   2106              UErrorCode* err)
   2107 {
   2108     return _canonicalize(localeID, name, nameCapacity, 0, err);
   2109 }
   2110 
   2111 U_CAPI int32_t  U_EXPORT2
   2112 uloc_getBaseName(const char* localeID,
   2113                  char* name,
   2114                  int32_t nameCapacity,
   2115                  UErrorCode* err)
   2116 {
   2117     return _canonicalize(localeID, name, nameCapacity, _ULOC_STRIP_KEYWORDS, err);
   2118 }
   2119 
   2120 U_CAPI int32_t  U_EXPORT2
   2121 uloc_canonicalize(const char* localeID,
   2122                   char* name,
   2123                   int32_t nameCapacity,
   2124                   UErrorCode* err)
   2125 {
   2126     return _canonicalize(localeID, name, nameCapacity, _ULOC_CANONICALIZE, err);
   2127 }
   2128 
   2129 U_CAPI const char*  U_EXPORT2
   2130 uloc_getISO3Language(const char* localeID)
   2131 {
   2132     int16_t offset;
   2133     char lang[ULOC_LANG_CAPACITY];
   2134     UErrorCode err = U_ZERO_ERROR;
   2135 
   2136     if (localeID == NULL)
   2137     {
   2138         localeID = uloc_getDefault();
   2139     }
   2140     uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
   2141     if (U_FAILURE(err))
   2142         return "";
   2143     offset = _findIndex(LANGUAGES, lang);
   2144     if (offset < 0)
   2145         return "";
   2146     return LANGUAGES_3[offset];
   2147 }
   2148 
   2149 U_CAPI const char*  U_EXPORT2
   2150 uloc_getISO3Country(const char* localeID)
   2151 {
   2152     int16_t offset;
   2153     char cntry[ULOC_LANG_CAPACITY];
   2154     UErrorCode err = U_ZERO_ERROR;
   2155 
   2156     if (localeID == NULL)
   2157     {
   2158         localeID = uloc_getDefault();
   2159     }
   2160     uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
   2161     if (U_FAILURE(err))
   2162         return "";
   2163     offset = _findIndex(COUNTRIES, cntry);
   2164     if (offset < 0)
   2165         return "";
   2166 
   2167     return COUNTRIES_3[offset];
   2168 }
   2169 
   2170 U_CAPI uint32_t  U_EXPORT2
   2171 uloc_getLCID(const char* localeID)
   2172 {
   2173     UErrorCode status = U_ZERO_ERROR;
   2174     char       langID[ULOC_FULLNAME_CAPACITY];
   2175     uint32_t   lcid = 0;
   2176 
   2177     /* Check for incomplete id. */
   2178     if (!localeID || uprv_strlen(localeID) < 2) {
   2179         return 0;
   2180     }
   2181 
   2182     // Attempt platform lookup if available
   2183     lcid = uprv_convertToLCIDPlatform(localeID);
   2184     if (lcid > 0)
   2185     {
   2186         // Windows found an LCID, return that
   2187         return lcid;
   2188     }
   2189 
   2190     uloc_getLanguage(localeID, langID, sizeof(langID), &status);
   2191     if (U_FAILURE(status)) {
   2192         return 0;
   2193     }
   2194 
   2195     if (uprv_strchr(localeID, '@')) {
   2196         // uprv_convertToLCID does not support keywords other than collation.
   2197         // Remove all keywords except collation.
   2198         int32_t len;
   2199         char collVal[ULOC_KEYWORDS_CAPACITY];
   2200         char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
   2201 
   2202         len = uloc_getKeywordValue(localeID, "collation", collVal,
   2203             UPRV_LENGTHOF(collVal) - 1, &status);
   2204 
   2205         if (U_SUCCESS(status) && len > 0) {
   2206             collVal[len] = 0;
   2207 
   2208             len = uloc_getBaseName(localeID, tmpLocaleID,
   2209                 UPRV_LENGTHOF(tmpLocaleID) - 1, &status);
   2210 
   2211             if (U_SUCCESS(status) && len > 0) {
   2212                 tmpLocaleID[len] = 0;
   2213 
   2214                 len = uloc_setKeywordValue("collation", collVal, tmpLocaleID,
   2215                     UPRV_LENGTHOF(tmpLocaleID) - len - 1, &status);
   2216 
   2217                 if (U_SUCCESS(status) && len > 0) {
   2218                     tmpLocaleID[len] = 0;
   2219                     return uprv_convertToLCID(langID, tmpLocaleID, &status);
   2220                 }
   2221             }
   2222         }
   2223 
   2224         // fall through - all keywords are simply ignored
   2225         status = U_ZERO_ERROR;
   2226     }
   2227 
   2228     return uprv_convertToLCID(langID, localeID, &status);
   2229 }
   2230 
   2231 U_CAPI int32_t U_EXPORT2
   2232 uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
   2233                 UErrorCode *status)
   2234 {
   2235     return uprv_convertToPosix(hostid, locale, localeCapacity, status);
   2236 }
   2237 
   2238 /* ### Default locale **************************************************/
   2239 
   2240 U_CAPI const char*  U_EXPORT2
   2241 uloc_getDefault()
   2242 {
   2243     return locale_get_default();
   2244 }
   2245 
   2246 U_CAPI void  U_EXPORT2
   2247 uloc_setDefault(const char*   newDefaultLocale,
   2248              UErrorCode* err)
   2249 {
   2250     if (U_FAILURE(*err))
   2251         return;
   2252     /* the error code isn't currently used for anything by this function*/
   2253 
   2254     /* propagate change to C++ */
   2255     locale_set_default(newDefaultLocale);
   2256 }
   2257 
   2258 /**
   2259  * Returns a list of all 2-letter language codes defined in ISO 639.  This is a pointer
   2260  * to an array of pointers to arrays of char.  All of these pointers are owned
   2261  * by ICU-- do not delete them, and do not write through them.  The array is
   2262  * terminated with a null pointer.
   2263  */
   2264 U_CAPI const char* const*  U_EXPORT2
   2265 uloc_getISOLanguages()
   2266 {
   2267     return LANGUAGES;
   2268 }
   2269 
   2270 /**
   2271  * Returns a list of all 2-letter country codes defined in ISO 639.  This is a
   2272  * pointer to an array of pointers to arrays of char.  All of these pointers are
   2273  * owned by ICU-- do not delete them, and do not write through them.  The array is
   2274  * terminated with a null pointer.
   2275  */
   2276 U_CAPI const char* const*  U_EXPORT2
   2277 uloc_getISOCountries()
   2278 {
   2279     return COUNTRIES;
   2280 }
   2281 
   2282 
   2283 /* this function to be moved into cstring.c later */
   2284 static char gDecimal = 0;
   2285 
   2286 static /* U_CAPI */
   2287 double
   2288 /* U_EXPORT2 */
   2289 _uloc_strtod(const char *start, char **end) {
   2290     char *decimal;
   2291     char *myEnd;
   2292     char buf[30];
   2293     double rv;
   2294     if (!gDecimal) {
   2295         char rep[5];
   2296         /* For machines that decide to change the decimal on you,
   2297         and try to be too smart with localization.
   2298         This normally should be just a '.'. */
   2299         sprintf(rep, "%+1.1f", 1.0);
   2300         gDecimal = rep[2];
   2301     }
   2302 
   2303     if(gDecimal == '.') {
   2304         return uprv_strtod(start, end); /* fall through to OS */
   2305     } else {
   2306         uprv_strncpy(buf, start, 29);
   2307         buf[29]=0;
   2308         decimal = uprv_strchr(buf, '.');
   2309         if(decimal) {
   2310             *decimal = gDecimal;
   2311         } else {
   2312             return uprv_strtod(start, end); /* no decimal point */
   2313         }
   2314         rv = uprv_strtod(buf, &myEnd);
   2315         if(end) {
   2316             *end = (char*)(start+(myEnd-buf)); /* cast away const (to follow uprv_strtod API.) */
   2317         }
   2318         return rv;
   2319     }
   2320 }
   2321 
   2322 typedef struct {
   2323     float q;
   2324     int32_t dummy;  /* to avoid uninitialized memory copy from qsort */
   2325     char locale[ULOC_FULLNAME_CAPACITY+1];
   2326 } _acceptLangItem;
   2327 
   2328 static int32_t U_CALLCONV
   2329 uloc_acceptLanguageCompare(const void * /*context*/, const void *a, const void *b)
   2330 {
   2331     const _acceptLangItem *aa = (const _acceptLangItem*)a;
   2332     const _acceptLangItem *bb = (const _acceptLangItem*)b;
   2333 
   2334     int32_t rc = 0;
   2335     if(bb->q < aa->q) {
   2336         rc = -1;  /* A > B */
   2337     } else if(bb->q > aa->q) {
   2338         rc = 1;   /* A < B */
   2339     } else {
   2340         rc = 0;   /* A = B */
   2341     }
   2342 
   2343     if(rc==0) {
   2344         rc = uprv_stricmp(aa->locale, bb->locale);
   2345     }
   2346 
   2347 #if defined(ULOC_DEBUG)
   2348     /*  fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
   2349     aa->locale, aa->q,
   2350     bb->locale, bb->q,
   2351     rc);*/
   2352 #endif
   2353 
   2354     return rc;
   2355 }
   2356 
   2357 /*
   2358 mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
   2359 */
   2360 
   2361 U_CAPI int32_t U_EXPORT2
   2362 uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult,
   2363                             const char *httpAcceptLanguage,
   2364                             UEnumeration* availableLocales,
   2365                             UErrorCode *status)
   2366 {
   2367   MaybeStackArray<_acceptLangItem, 4> items; // Struct for collecting items.
   2368     char tmp[ULOC_FULLNAME_CAPACITY +1];
   2369     int32_t n = 0;
   2370     const char *itemEnd;
   2371     const char *paramEnd;
   2372     const char *s;
   2373     const char *t;
   2374     int32_t res;
   2375     int32_t i;
   2376     int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
   2377 
   2378     if(U_FAILURE(*status)) {
   2379         return -1;
   2380     }
   2381 
   2382     for(s=httpAcceptLanguage;s&&*s;) {
   2383         while(isspace(*s)) /* eat space at the beginning */
   2384             s++;
   2385         itemEnd=uprv_strchr(s,',');
   2386         paramEnd=uprv_strchr(s,';');
   2387         if(!itemEnd) {
   2388             itemEnd = httpAcceptLanguage+l; /* end of string */
   2389         }
   2390         if(paramEnd && paramEnd<itemEnd) {
   2391             /* semicolon (;) is closer than end (,) */
   2392             t = paramEnd+1;
   2393             if(*t=='q') {
   2394                 t++;
   2395             }
   2396             while(isspace(*t)) {
   2397                 t++;
   2398             }
   2399             if(*t=='=') {
   2400                 t++;
   2401             }
   2402             while(isspace(*t)) {
   2403                 t++;
   2404             }
   2405             items[n].q = (float)_uloc_strtod(t,NULL);
   2406         } else {
   2407             /* no semicolon - it's 1.0 */
   2408             items[n].q = 1.0f;
   2409             paramEnd = itemEnd;
   2410         }
   2411         items[n].dummy=0;
   2412         /* eat spaces prior to semi */
   2413         for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
   2414             ;
   2415         int32_t slen = ((t+1)-s);
   2416         if(slen > ULOC_FULLNAME_CAPACITY) {
   2417           *status = U_BUFFER_OVERFLOW_ERROR;
   2418           return -1; // too big
   2419         }
   2420         uprv_strncpy(items[n].locale, s, slen);
   2421         items[n].locale[slen]=0; // terminate
   2422         int32_t clen = uloc_canonicalize(items[n].locale, tmp, UPRV_LENGTHOF(tmp)-1, status);
   2423         if(U_FAILURE(*status)) return -1;
   2424         if((clen!=slen) || (uprv_strncmp(items[n].locale, tmp, slen))) {
   2425             // canonicalization had an effect- copy back
   2426             uprv_strncpy(items[n].locale, tmp, clen);
   2427             items[n].locale[clen] = 0; // terminate
   2428         }
   2429 #if defined(ULOC_DEBUG)
   2430         /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
   2431 #endif
   2432         n++;
   2433         s = itemEnd;
   2434         while(*s==',') { /* eat duplicate commas */
   2435             s++;
   2436         }
   2437         if(n>=items.getCapacity()) { // If we need more items
   2438           if(NULL == items.resize(items.getCapacity()*2, items.getCapacity())) {
   2439               *status = U_MEMORY_ALLOCATION_ERROR;
   2440               return -1;
   2441           }
   2442 #if defined(ULOC_DEBUG)
   2443           fprintf(stderr,"malloced at size %d\n", items.getCapacity());
   2444 #endif
   2445         }
   2446     }
   2447     uprv_sortArray(items.getAlias(), n, sizeof(items[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
   2448     if (U_FAILURE(*status)) {
   2449         return -1;
   2450     }
   2451     LocalMemory<const char*> strs(NULL);
   2452     if (strs.allocateInsteadAndReset(n) == NULL) {
   2453         *status = U_MEMORY_ALLOCATION_ERROR;
   2454         return -1;
   2455     }
   2456     for(i=0;i<n;i++) {
   2457 #if defined(ULOC_DEBUG)
   2458         /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
   2459 #endif
   2460         strs[i]=items[i].locale;
   2461     }
   2462     res =  uloc_acceptLanguage(result, resultAvailable, outResult,
   2463                                strs.getAlias(), n, availableLocales, status);
   2464     return res;
   2465 }
   2466 
   2467 
   2468 U_CAPI int32_t U_EXPORT2
   2469 uloc_acceptLanguage(char *result, int32_t resultAvailable,
   2470                     UAcceptResult *outResult, const char **acceptList,
   2471                     int32_t acceptListCount,
   2472                     UEnumeration* availableLocales,
   2473                     UErrorCode *status)
   2474 {
   2475     int32_t i,j;
   2476     int32_t len;
   2477     int32_t maxLen=0;
   2478     char tmp[ULOC_FULLNAME_CAPACITY+1];
   2479     const char *l;
   2480     char **fallbackList;
   2481     if(U_FAILURE(*status)) {
   2482         return -1;
   2483     }
   2484     fallbackList = static_cast<char **>(uprv_malloc((size_t)(sizeof(fallbackList[0])*acceptListCount)));
   2485     if(fallbackList==NULL) {
   2486         *status = U_MEMORY_ALLOCATION_ERROR;
   2487         return -1;
   2488     }
   2489     for(i=0;i<acceptListCount;i++) {
   2490 #if defined(ULOC_DEBUG)
   2491         fprintf(stderr,"%02d: %s\n", i, acceptList[i]);
   2492 #endif
   2493         while((l=uenum_next(availableLocales, NULL, status)) != NULL) {
   2494 #if defined(ULOC_DEBUG)
   2495             fprintf(stderr,"  %s\n", l);
   2496 #endif
   2497             len = (int32_t)uprv_strlen(l);
   2498             if(!uprv_strcmp(acceptList[i], l)) {
   2499                 if(outResult) {
   2500                     *outResult = ULOC_ACCEPT_VALID;
   2501                 }
   2502 #if defined(ULOC_DEBUG)
   2503                 fprintf(stderr, "MATCH! %s\n", l);
   2504 #endif
   2505                 if(len>0) {
   2506                     uprv_strncpy(result, l, uprv_min(len, resultAvailable));
   2507                 }
   2508                 for(j=0;j<i;j++) {
   2509                     uprv_free(fallbackList[j]);
   2510                 }
   2511                 uprv_free(fallbackList);
   2512                 return u_terminateChars(result, resultAvailable, len, status);
   2513             }
   2514             if(len>maxLen) {
   2515                 maxLen = len;
   2516             }
   2517         }
   2518         uenum_reset(availableLocales, status);
   2519         /* save off parent info */
   2520         if(uloc_getParent(acceptList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
   2521             fallbackList[i] = uprv_strdup(tmp);
   2522         } else {
   2523             fallbackList[i]=0;
   2524         }
   2525     }
   2526 
   2527     for(maxLen--;maxLen>0;maxLen--) {
   2528         for(i=0;i<acceptListCount;i++) {
   2529             if(fallbackList[i] && ((int32_t)uprv_strlen(fallbackList[i])==maxLen)) {
   2530 #if defined(ULOC_DEBUG)
   2531                 fprintf(stderr,"Try: [%s]", fallbackList[i]);
   2532 #endif
   2533                 while((l=uenum_next(availableLocales, NULL, status)) != NULL) {
   2534 #if defined(ULOC_DEBUG)
   2535                     fprintf(stderr,"  %s\n", l);
   2536 #endif
   2537                     len = (int32_t)uprv_strlen(l);
   2538                     if(!uprv_strcmp(fallbackList[i], l)) {
   2539                         if(outResult) {
   2540                             *outResult = ULOC_ACCEPT_FALLBACK;
   2541                         }
   2542 #if defined(ULOC_DEBUG)
   2543                         fprintf(stderr, "fallback MATCH! %s\n", l);
   2544 #endif
   2545                         if(len>0) {
   2546                             uprv_strncpy(result, l, uprv_min(len, resultAvailable));
   2547                         }
   2548                         for(j=0;j<acceptListCount;j++) {
   2549                             uprv_free(fallbackList[j]);
   2550                         }
   2551                         uprv_free(fallbackList);
   2552                         return u_terminateChars(result, resultAvailable, len, status);
   2553                     }
   2554                 }
   2555                 uenum_reset(availableLocales, status);
   2556 
   2557                 if(uloc_getParent(fallbackList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
   2558                     uprv_free(fallbackList[i]);
   2559                     fallbackList[i] = uprv_strdup(tmp);
   2560                 } else {
   2561                     uprv_free(fallbackList[i]);
   2562                     fallbackList[i]=0;
   2563                 }
   2564             }
   2565         }
   2566         if(outResult) {
   2567             *outResult = ULOC_ACCEPT_FAILED;
   2568         }
   2569     }
   2570     for(i=0;i<acceptListCount;i++) {
   2571         uprv_free(fallbackList[i]);
   2572     }
   2573     uprv_free(fallbackList);
   2574     return -1;
   2575 }
   2576 
   2577 U_CAPI const char* U_EXPORT2
   2578 uloc_toUnicodeLocaleKey(const char* keyword)
   2579 {
   2580     const char* bcpKey = ulocimp_toBcpKey(keyword);
   2581     if (bcpKey == NULL && ultag_isUnicodeLocaleKey(keyword, -1)) {
   2582         // unknown keyword, but syntax is fine..
   2583         return keyword;
   2584     }
   2585     return bcpKey;
   2586 }
   2587 
   2588 U_CAPI const char* U_EXPORT2
   2589 uloc_toUnicodeLocaleType(const char* keyword, const char* value)
   2590 {
   2591     const char* bcpType = ulocimp_toBcpType(keyword, value, NULL, NULL);
   2592     if (bcpType == NULL && ultag_isUnicodeLocaleType(value, -1)) {
   2593         // unknown keyword, but syntax is fine..
   2594         return value;
   2595     }
   2596     return bcpType;
   2597 }
   2598 
   2599 static UBool
   2600 isWellFormedLegacyKey(const char* legacyKey)
   2601 {
   2602     const char* p = legacyKey;
   2603     while (*p) {
   2604         if (!UPRV_ISALPHANUM(*p)) {
   2605             return FALSE;
   2606         }
   2607         p++;
   2608     }
   2609     return TRUE;
   2610 }
   2611 
   2612 static UBool
   2613 isWellFormedLegacyType(const char* legacyType)
   2614 {
   2615     const char* p = legacyType;
   2616     int32_t alphaNumLen = 0;
   2617     while (*p) {
   2618         if (*p == '_' || *p == '/' || *p == '-') {
   2619             if (alphaNumLen == 0) {
   2620                 return FALSE;
   2621             }
   2622             alphaNumLen = 0;
   2623         } else if (UPRV_ISALPHANUM(*p)) {
   2624             alphaNumLen++;
   2625         } else {
   2626             return FALSE;
   2627         }
   2628         p++;
   2629     }
   2630     return (alphaNumLen != 0);
   2631 }
   2632 
   2633 U_CAPI const char* U_EXPORT2
   2634 uloc_toLegacyKey(const char* keyword)
   2635 {
   2636     const char* legacyKey = ulocimp_toLegacyKey(keyword);
   2637     if (legacyKey == NULL) {
   2638         // Checks if the specified locale key is well-formed with the legacy locale syntax.
   2639         //
   2640         // Note:
   2641         //  LDML/CLDR provides some definition of keyword syntax in
   2642         //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
   2643         //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
   2644         //  Keys can only consist of [0-9a-zA-Z].
   2645         if (isWellFormedLegacyKey(keyword)) {
   2646             return keyword;
   2647         }
   2648     }
   2649     return legacyKey;
   2650 }
   2651 
   2652 U_CAPI const char* U_EXPORT2
   2653 uloc_toLegacyType(const char* keyword, const char* value)
   2654 {
   2655     const char* legacyType = ulocimp_toLegacyType(keyword, value, NULL, NULL);
   2656     if (legacyType == NULL) {
   2657         // Checks if the specified locale type is well-formed with the legacy locale syntax.
   2658         //
   2659         // Note:
   2660         //  LDML/CLDR provides some definition of keyword syntax in
   2661         //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
   2662         //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
   2663         //  Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
   2664         //  we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
   2665         if (isWellFormedLegacyType(value)) {
   2666             return value;
   2667         }
   2668     }
   2669     return legacyType;
   2670 }
   2671 
   2672 /*eof*/
   2673