Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2001-2010, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  ucol_tok.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created 02/22/2001
     14 *   created by: Vladimir Weinstein
     15 *
     16 * This module reads a tailoring rule string and produces a list of
     17 * tokens that will be turned into collation elements
     18 *
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 
     23 #if !UCONFIG_NO_COLLATION
     24 
     25 #include "unicode/ustring.h"
     26 #include "unicode/uchar.h"
     27 #include "unicode/uniset.h"
     28 
     29 #include "ucol_tok.h"
     30 #include "ucol_bld.h"
     31 #include "cmemory.h"
     32 #include "../common/util.h"
     33 
     34 U_CDECL_BEGIN
     35 static int32_t U_CALLCONV
     36 uhash_hashTokens(const UHashTok k)
     37 {
     38     int32_t hash = 0;
     39     //uint32_t key = (uint32_t)k.integer;
     40     UColToken *key = (UColToken *)k.pointer;
     41     if (key != 0) {
     42         //int32_t len = (key & 0xFF000000)>>24;
     43         int32_t len = (key->source & 0xFF000000)>>24;
     44         int32_t inc = ((len - 32) / 32) + 1;
     45 
     46         //const UChar *p = (key & 0x00FFFFFF) + rulesToParse;
     47         const UChar *p = (key->source & 0x00FFFFFF) + key->rulesToParse;
     48         const UChar *limit = p + len;
     49 
     50         while (p<limit) {
     51             hash = (hash * 37) + *p;
     52             p += inc;
     53         }
     54     }
     55     return hash;
     56 }
     57 
     58 static UBool U_CALLCONV
     59 uhash_compareTokens(const UHashTok key1, const UHashTok key2)
     60 {
     61     //uint32_t p1 = (uint32_t) key1.integer;
     62     //uint32_t p2 = (uint32_t) key2.integer;
     63     UColToken *p1 = (UColToken *)key1.pointer;
     64     UColToken *p2 = (UColToken *)key2.pointer;
     65     const UChar *s1 = (p1->source & 0x00FFFFFF) + p1->rulesToParse;
     66     const UChar *s2 = (p2->source & 0x00FFFFFF) + p2->rulesToParse;
     67     uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
     68     uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
     69     const UChar *end = s1+s1L-1;
     70 
     71     if (p1 == p2) {
     72         return TRUE;
     73     }
     74     if (p1->source == 0 || p2->source == 0) {
     75         return FALSE;
     76     }
     77     if(s1L != s2L) {
     78         return FALSE;
     79     }
     80     if(p1->source == p2->source) {
     81         return TRUE;
     82     }
     83     while((s1 < end) && *s1 == *s2) {
     84         ++s1;
     85         ++s2;
     86     }
     87     if(*s1 == *s2) {
     88         return TRUE;
     89     } else {
     90         return FALSE;
     91     }
     92 }
     93 U_CDECL_END
     94 
     95 /*static inline void U_CALLCONV
     96 uhash_freeBlockWrapper(void *obj) {
     97     uhash_freeBlock(obj);
     98 }*/
     99 
    100 
/* One entry of the indirect-positioning table (ucolIndirectBoundaries).   */
/* Entries are filled in by setIndirectBoundaries().                       */
typedef struct {
    uint32_t startCE;      /* first CE of the position (start[0])                      */
    uint32_t startContCE;  /* second CE of the position (start[1]); ucol_tok_doSetTop  */
                           /* emits it only when it is non-zero                        */
    uint32_t limitCE;      /* first CE of the upper limit (end[0]), 0 if no limit given */
    uint32_t limitContCE;  /* second CE of the upper limit (end[1]), 0 if no limit given */
} indirectBoundaries;
    107 
/* These values are used for finding CE values for indirect positioning.  */
/* Indirect positioning is a mechanism that allows resets on symbolic     */
/* values. It only works for resets, and indirect names cannot themselves */
/* be tailored. An indirect name can define either an anchor point or a   */
/* range. An anchor point behaves exactly like a code point in a reset,   */
/* except that it cannot be tailored. A range (currently only the [top]   */
/* range is known) explicitly sets the upper bound for generated CEs,     */
/* which gives better control over how many CEs can be squeezed into the  */
/* range without a performance penalty. In that respect, [top] is used    */
/* for tailoring locales that use CJK characters. The other indirect      */
/* values are currently a pure convenience: they can be used to ensure    */
/* that CEs are always positioned in the same place relative to a point   */
/* with known properties (e.g. first primary ignorable).                  */
/*                                                                        */
/* Index 0 is used for [top]; [first X] / [last X] use indexes 1..14      */
/* (computed in ucol_uprv_tok_readAndSetOption), hence 15 entries.        */
static indirectBoundaries ucolIndirectBoundaries[15];
    123 /*
    124 static indirectBoundaries ucolIndirectBoundaries[11] = {
    125 { UCOL_RESET_TOP_VALUE,               0,
    126 UCOL_NEXT_TOP_VALUE,                0 },
    127 { UCOL_FIRST_PRIMARY_IGNORABLE,       0,
    128 0,                                  0 },
    129 { UCOL_LAST_PRIMARY_IGNORABLE,        UCOL_LAST_PRIMARY_IGNORABLE_CONT,
    130 0,                                  0 },
    131 { UCOL_FIRST_SECONDARY_IGNORABLE,     0,
    132 0,                                  0 },
    133 { UCOL_LAST_SECONDARY_IGNORABLE,      0,
    134 0,                                  0 },
    135 { UCOL_FIRST_TERTIARY_IGNORABLE,      0,
    136 0,                                  0 },
    137 { UCOL_LAST_TERTIARY_IGNORABLE,       0,
    138 0,                                  0 },
    139 { UCOL_FIRST_VARIABLE,                0,
    140 0,                                  0 },
    141 { UCOL_LAST_VARIABLE,                 0,
    142 0,                                  0 },
    143 { UCOL_FIRST_NON_VARIABLE,            0,
    144 0,                                  0 },
    145 { UCOL_LAST_NON_VARIABLE,             0,
    146 0,                                  0 },
    147 };
    148 */
    149 
    150 static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
    151 
    152     // Set values for the top - TODO: once we have values for all the indirects, we are going
    153     // to initalize here.
    154     ucolIndirectBoundaries[indexR].startCE = start[0];
    155     ucolIndirectBoundaries[indexR].startContCE = start[1];
    156     if(end) {
    157         ucolIndirectBoundaries[indexR].limitCE = end[0];
    158         ucolIndirectBoundaries[indexR].limitContCE = end[1];
    159     } else {
    160         ucolIndirectBoundaries[indexR].limitCE = 0;
    161         ucolIndirectBoundaries[indexR].limitContCE = 0;
    162     }
    163 }
    164 
    165 
    166 static inline
    167 void syntaxError(const UChar* rules,
    168                  int32_t pos,
    169                  int32_t rulesLen,
    170                  UParseError* parseError)
    171 {
    172     parseError->offset = pos;
    173     parseError->line = 0 ; /* we are not using line numbers */
    174 
    175     // for pre-context
    176     int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
    177     int32_t stop  = pos;
    178 
    179     u_memcpy(parseError->preContext,rules+start,stop-start);
    180     //null terminate the buffer
    181     parseError->preContext[stop-start] = 0;
    182 
    183     //for post-context
    184     start = pos+1;
    185     stop  = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
    186     rulesLen;
    187 
    188     if(start < stop) {
    189         u_memcpy(parseError->postContext,rules+start,stop-start);
    190         //null terminate the buffer
    191         parseError->postContext[stop-start]= 0;
    192     } else {
    193         parseError->postContext[0] = 0;
    194     }
    195 }
    196 
    197 static
    198 void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
    199     switch(attrib) {
    200     case UCOL_HIRAGANA_QUATERNARY_MODE:
    201         opts->hiraganaQ = value;
    202         break;
    203     case UCOL_FRENCH_COLLATION:
    204         opts->frenchCollation = value;
    205         break;
    206     case UCOL_ALTERNATE_HANDLING:
    207         opts->alternateHandling = value;
    208         break;
    209     case UCOL_CASE_FIRST:
    210         opts->caseFirst = value;
    211         break;
    212     case UCOL_CASE_LEVEL:
    213         opts->caseLevel = value;
    214         break;
    215     case UCOL_NORMALIZATION_MODE:
    216         opts->normalizationMode = value;
    217         break;
    218     case UCOL_STRENGTH:
    219         opts->strength = value;
    220         break;
    221     case UCOL_NUMERIC_COLLATION:
    222         opts->numericCollation = value;
    223         break;
    224     case UCOL_ATTRIBUTE_COUNT:
    225     default:
    226         break;
    227     }
    228 }
    229 
/* Number of entries in the rulesOptions table. */
#define UTOK_OPTION_COUNT 20

/* Set to TRUE by ucol_uprv_tok_initData() once the strings below have been
   initialized with U_STRING_INIT. */
static UBool didInit = FALSE;
/* we can be strict, or we can be lenient */
/* I'd surely be lenient with the option arguments */
/* maybe even with options */
/* Sub-option (argument) names. Each U_STRING_DECL here has a matching
   U_STRING_INIT with the same text and length in ucol_uprv_tok_initData(). */
U_STRING_DECL(suboption_00, "non-ignorable", 13);
U_STRING_DECL(suboption_01, "shifted",        7);

U_STRING_DECL(suboption_02, "lower",          5);
U_STRING_DECL(suboption_03, "upper",          5);
U_STRING_DECL(suboption_04, "off",            3);
U_STRING_DECL(suboption_05, "on",             2);
U_STRING_DECL(suboption_06, "1",              1);
U_STRING_DECL(suboption_07, "2",              1);
U_STRING_DECL(suboption_08, "3",              1);
U_STRING_DECL(suboption_09, "4",              1);
U_STRING_DECL(suboption_10, "I",              1);

U_STRING_DECL(suboption_11, "primary",        7);
U_STRING_DECL(suboption_12, "secondary",      9);
U_STRING_DECL(suboption_13, "tertiary",       8);
U_STRING_DECL(suboption_14, "variable",       8);
U_STRING_DECL(suboption_15, "regular",        7);
U_STRING_DECL(suboption_16, "implicit",       8);
U_STRING_DECL(suboption_17, "trailing",       8);


/* Option names. The numbering here is declaration order only; the order in
   which options are matched is the order of the rulesOptions table. */
U_STRING_DECL(option_00,    "undefined",      9);
U_STRING_DECL(option_01,    "rearrange",      9);
U_STRING_DECL(option_02,    "alternate",      9);
U_STRING_DECL(option_03,    "backwards",      9);
U_STRING_DECL(option_04,    "variable top",  12);
U_STRING_DECL(option_05,    "top",            3);
U_STRING_DECL(option_06,    "normalization", 13);
U_STRING_DECL(option_07,    "caseLevel",      9);
U_STRING_DECL(option_08,    "caseFirst",      9);
U_STRING_DECL(option_09,    "scriptOrder",   11);
U_STRING_DECL(option_10,    "charsetname",   11);
U_STRING_DECL(option_11,    "charset",        7);
U_STRING_DECL(option_12,    "before",         6);
U_STRING_DECL(option_13,    "hiraganaQ",      9);
U_STRING_DECL(option_14,    "strength",       8);
U_STRING_DECL(option_15,    "first",          5);
U_STRING_DECL(option_16,    "last",           4);
U_STRING_DECL(option_17,    "optimize",       8);
U_STRING_DECL(option_18,    "suppressContractions",         20);
U_STRING_DECL(option_19,    "numericOrdering",              15);
    278 
    279 
    280 /*
    281 [last variable] last variable value
    282 [last primary ignorable] largest CE for primary ignorable
    283 [last secondary ignorable] largest CE for secondary ignorable
    284 [last tertiary ignorable] largest CE for tertiary ignorable
    285 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
    286 */
    287 
    288 
/* Sub-option tables. Each entry is {name, name length, attribute value};  */
/* the names are the suboption_NN strings declared above.                  */

/* arguments of "alternate" */
static const ucolTokSuboption alternateSub[2] = {
    {suboption_00, 13, UCOL_NON_IGNORABLE},
    {suboption_01,  7, UCOL_SHIFTED}
};

/* arguments of "caseFirst": "lower", "upper", "off" */
static const ucolTokSuboption caseFirstSub[3] = {
    {suboption_02, 5, UCOL_LOWER_FIRST},
    {suboption_03,  5, UCOL_UPPER_FIRST},
    {suboption_04,  3, UCOL_OFF},
};

/* arguments of the on/off options: "off", "on" */
static const ucolTokSuboption onOffSub[2] = {
    {suboption_04, 3, UCOL_OFF},
    {suboption_05, 2, UCOL_ON}
};

/* argument of "backwards": only "2" is accepted and it maps to UCOL_ON */
static const ucolTokSuboption frenchSub[1] = {
    {suboption_07, 1, UCOL_ON}
};

/* arguments of "before": "1", "2", "3" */
static const ucolTokSuboption beforeSub[3] = {
    {suboption_06, 1, UCOL_PRIMARY},
    {suboption_07, 1, UCOL_SECONDARY},
    {suboption_08, 1, UCOL_TERTIARY}
};

/* arguments of "strength": "1", "2", "3", "4", "I" */
static const ucolTokSuboption strengthSub[5] = {
    {suboption_06, 1, UCOL_PRIMARY},
    {suboption_07, 1, UCOL_SECONDARY},
    {suboption_08, 1, UCOL_TERTIARY},
    {suboption_09, 1, UCOL_QUATERNARY},
    {suboption_10, 1, UCOL_IDENTICAL},
};

/* arguments of "first" / "last". ucol_uprv_tok_readAndSetOption uses only */
/* the position of the matching entry (to compute the indirect index), not */
/* the attribute value, so every entry carries UCOL_PRIMARY.               */
static const ucolTokSuboption firstLastSub[7] = {
    {suboption_11, 7, UCOL_PRIMARY},
    {suboption_12, 9, UCOL_PRIMARY},
    {suboption_13, 8, UCOL_PRIMARY},
    {suboption_14, 8, UCOL_PRIMARY},
    {suboption_15, 7, UCOL_PRIMARY},
    {suboption_16, 8, UCOL_PRIMARY},
    {suboption_17, 8, UCOL_PRIMARY},
};
    332 
    333 enum OptionNumber {
    334     OPTION_ALTERNATE_HANDLING = 0,
    335     OPTION_FRENCH_COLLATION,
    336     OPTION_CASE_LEVEL,
    337     OPTION_CASE_FIRST,
    338     OPTION_NORMALIZATION_MODE,
    339     OPTION_HIRAGANA_QUATERNARY,
    340     OPTION_STRENGTH,
    341     OPTION_NUMERIC_COLLATION,
    342     OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,
    343     OPTION_VARIABLE_TOP,
    344     OPTION_REARRANGE,
    345     OPTION_BEFORE,
    346     OPTION_TOP,
    347     OPTION_FIRST,
    348     OPTION_LAST,
    349     OPTION_OPTIMIZE,
    350     OPTION_SUPPRESS_CONTRACTIONS,
    351     OPTION_UNDEFINED,
    352     OPTION_SCRIPT_ORDER,
    353     OPTION_CHARSET_NAME,
    354     OPTION_CHARSET
    355 } ;
    356 
/* Table of recognized rule options, indexed by enum OptionNumber.          */
/* Each entry appears to be {name, name length, sub-option table,           */
/* sub-option count, attribute}, matching the optionName / optionLen /      */
/* subopts / subSize / attr members read by the parsing functions.          */
/* Options are matched in table order with a prefix comparison, so          */
/* "charsetname" has to stay ahead of "charset".                            */
static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
    /*00*/ {option_02,  9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
    /*01*/ {option_03,  9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards"      */
    /*02*/ {option_07,  9, onOffSub, 2, UCOL_CASE_LEVEL},  /*"caseLevel"      */
    /*03*/ {option_08,  9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst"   */
    /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
    /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
    /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
    /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION},  /*"numericOrdering"*/
    /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top"   */
    /*09*/ {option_01,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange"      */
    /*10*/ {option_12,  6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before"    */
    /*11*/ {option_05,  3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top"            */
    /*12*/ {option_15,  5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
    /*13*/ {option_16,  4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
    /*14*/ {option_17,  8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize"      */
    /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions"      */
    /*16*/ {option_00,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined"      */
    /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder"    */
    /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname"    */
    /*19*/ {option_11,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT}  /*"charset"        */
};
    379 
    380 static
    381 int32_t u_strncmpNoCase(const UChar     *s1,
    382                         const UChar     *s2,
    383                         int32_t     n)
    384 {
    385     if(n > 0) {
    386         int32_t rc;
    387         for(;;) {
    388             rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
    389             if(rc != 0 || *s1 == 0 || --n == 0) {
    390                 return rc;
    391             }
    392             ++s1;
    393             ++s2;
    394         }
    395     }
    396     return 0;
    397 }
    398 
    399 static
    400 void ucol_uprv_tok_initData() {
    401     if(!didInit) {
    402         U_STRING_INIT(suboption_00, "non-ignorable", 13);
    403         U_STRING_INIT(suboption_01, "shifted",        7);
    404 
    405         U_STRING_INIT(suboption_02, "lower",          5);
    406         U_STRING_INIT(suboption_03, "upper",          5);
    407         U_STRING_INIT(suboption_04, "off",            3);
    408         U_STRING_INIT(suboption_05, "on",             2);
    409 
    410         U_STRING_INIT(suboption_06, "1",              1);
    411         U_STRING_INIT(suboption_07, "2",              1);
    412         U_STRING_INIT(suboption_08, "3",              1);
    413         U_STRING_INIT(suboption_09, "4",              1);
    414         U_STRING_INIT(suboption_10, "I",              1);
    415 
    416         U_STRING_INIT(suboption_11, "primary",        7);
    417         U_STRING_INIT(suboption_12, "secondary",      9);
    418         U_STRING_INIT(suboption_13, "tertiary",       8);
    419         U_STRING_INIT(suboption_14, "variable",       8);
    420         U_STRING_INIT(suboption_15, "regular",        7);
    421         U_STRING_INIT(suboption_16, "implicit",       8);
    422         U_STRING_INIT(suboption_17, "trailing",       8);
    423 
    424 
    425         U_STRING_INIT(option_00, "undefined",      9);
    426         U_STRING_INIT(option_01, "rearrange",      9);
    427         U_STRING_INIT(option_02, "alternate",      9);
    428         U_STRING_INIT(option_03, "backwards",      9);
    429         U_STRING_INIT(option_04, "variable top",  12);
    430         U_STRING_INIT(option_05, "top",            3);
    431         U_STRING_INIT(option_06, "normalization", 13);
    432         U_STRING_INIT(option_07, "caseLevel",      9);
    433         U_STRING_INIT(option_08, "caseFirst",      9);
    434         U_STRING_INIT(option_09, "scriptOrder",   11);
    435         U_STRING_INIT(option_10, "charsetname",   11);
    436         U_STRING_INIT(option_11, "charset",        7);
    437         U_STRING_INIT(option_12, "before",         6);
    438         U_STRING_INIT(option_13, "hiraganaQ",      9);
    439         U_STRING_INIT(option_14, "strength",       8);
    440         U_STRING_INIT(option_15, "first",          5);
    441         U_STRING_INIT(option_16, "last",           4);
    442         U_STRING_INIT(option_17, "optimize",       8);
    443         U_STRING_INIT(option_18, "suppressContractions",         20);
    444         U_STRING_INIT(option_19, "numericOrdering",      15);
    445         didInit = TRUE;
    446     }
    447 }
    448 
    449 
    450 // This function reads basic options to set in the runtime collator
    451 // used by data driven tests. Should not support build time options
    452 U_CAPI const UChar * U_EXPORT2
    453 ucol_tok_getNextArgument(const UChar *start, const UChar *end,
    454                          UColAttribute *attrib, UColAttributeValue *value,
    455                          UErrorCode *status)
    456 {
    457     uint32_t i = 0;
    458     int32_t j=0;
    459     UBool foundOption = FALSE;
    460     const UChar *optionArg = NULL;
    461 
    462     ucol_uprv_tok_initData();
    463 
    464     while(start < end && (u_isWhitespace(*start) || uprv_isRuleWhiteSpace(*start))) { /* eat whitespace */
    465         start++;
    466     }
    467     if(start >= end) {
    468         return NULL;
    469     }
    470     /* skip opening '[' */
    471     if(*start == 0x005b) {
    472         start++;
    473     } else {
    474         *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
    475         return NULL;
    476     }
    477 
    478     while(i < UTOK_OPTION_COUNT) {
    479         if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
    480             foundOption = TRUE;
    481             if(end - start > rulesOptions[i].optionLen) {
    482                 optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
    483                 while(u_isWhitespace(*optionArg) || uprv_isRuleWhiteSpace(*optionArg)) { /* eat whitespace */
    484                     optionArg++;
    485                 }
    486             }
    487             break;
    488         }
    489         i++;
    490     }
    491 
    492     if(!foundOption) {
    493         *status = U_ILLEGAL_ARGUMENT_ERROR;
    494         return NULL;
    495     }
    496 
    497     if(optionArg) {
    498         for(j = 0; j<rulesOptions[i].subSize; j++) {
    499             if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
    500                 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
    501                 *attrib = rulesOptions[i].attr;
    502                 *value = rulesOptions[i].subopts[j].attrVal;
    503                 optionArg += rulesOptions[i].subopts[j].subLen;
    504                 while(u_isWhitespace(*optionArg) || uprv_isRuleWhiteSpace(*optionArg)) { /* eat whitespace */
    505                     optionArg++;
    506                 }
    507                 if(*optionArg == 0x005d) {
    508                     optionArg++;
    509                     return optionArg;
    510                 } else {
    511                     *status = U_ILLEGAL_ARGUMENT_ERROR;
    512                     return NULL;
    513                 }
    514             }
    515         }
    516     }
    517     *status = U_ILLEGAL_ARGUMENT_ERROR;
    518     return NULL;
    519 }
    520 
    521 static
    522 USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
    523     while(*start != 0x005b) { /* advance while we find the first '[' */
    524         start++;
    525     }
    526     // now we need to get a balanced set of '[]'. The problem is that a set can have
    527     // many, and *end point to the first closing '['
    528     int32_t noOpenBraces = 1;
    529     int32_t current = 1; // skip the opening brace
    530     while(start+current < end && noOpenBraces != 0) {
    531         if(start[current] == 0x005b) {
    532             noOpenBraces++;
    533         } else if(start[current] == 0x005D) { // closing brace
    534             noOpenBraces--;
    535         }
    536         current++;
    537     }
    538 
    539     if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
    540         *status = U_ILLEGAL_ARGUMENT_ERROR;
    541         return NULL;
    542     }
    543     return uset_openPattern(start, current, status);
    544 }
    545 
    546 static
    547 int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
    548     int32_t i = 0;
    549     ucol_uprv_tok_initData();
    550 
    551     while(u_isWhitespace(*start) || uprv_isRuleWhiteSpace(*start)) { /* eat whitespace */
    552         start++;
    553     }
    554     while(i < UTOK_OPTION_COUNT) {
    555         if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
    556             if(end - start > rulesOptions[i].optionLen) {
    557                 *optionArg = start+rulesOptions[i].optionLen; /* start of the options*/
    558                 while(u_isWhitespace(**optionArg) || uprv_isRuleWhiteSpace(**optionArg)) { /* eat whitespace */
    559                     (*optionArg)++;
    560                 }
    561             }
    562             break;
    563         }
    564         i++;
    565     }
    566     if(i == UTOK_OPTION_COUNT) {
    567         i = -1; // didn't find an option
    568     }
    569     return i;
    570 }
    571 
    572 
    573 // reads and conforms to various options in rules
    574 // end is the position of the first closing ']'
    575 // However, some of the options take an UnicodeSet definition
    576 // which needs to duplicate the closing ']'
    577 // for example: '[copy [\uAC00-\uD7FF]]'
    578 // These options will move end to the second ']' and the
    579 // caller will set the current to it.
    580 static
    581 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
    582     const UChar* start = src->current;
    583     int32_t i = 0;
    584     int32_t j=0;
    585     const UChar *optionArg = NULL;
    586 
    587     uint8_t result = 0;
    588 
    589     start++; /*skip opening '['*/
    590     i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
    591     if(optionArg) {
    592         src->current = optionArg;
    593     }
    594 
    595     if(i < 0) {
    596         *status = U_ILLEGAL_ARGUMENT_ERROR;
    597     } else {
    598         int32_t noOpenBraces = 1;
    599         switch(i) {
    600     case OPTION_ALTERNATE_HANDLING:
    601     case OPTION_FRENCH_COLLATION:
    602     case OPTION_CASE_LEVEL:
    603     case OPTION_CASE_FIRST:
    604     case OPTION_NORMALIZATION_MODE:
    605     case OPTION_HIRAGANA_QUATERNARY:
    606     case OPTION_STRENGTH:
    607     case OPTION_NUMERIC_COLLATION:
    608         if(optionArg) {
    609             for(j = 0; j<rulesOptions[i].subSize; j++) {
    610                 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
    611                     ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
    612                     result =  UCOL_TOK_SUCCESS;
    613                 }
    614             }
    615         }
    616         if(result == 0) {
    617             *status = U_ILLEGAL_ARGUMENT_ERROR;
    618         }
    619         break;
    620     case OPTION_VARIABLE_TOP:
    621         result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
    622         break;
    623     case OPTION_REARRANGE:
    624         result = UCOL_TOK_SUCCESS;
    625         break;
    626     case OPTION_BEFORE:
    627         if(optionArg) {
    628             for(j = 0; j<rulesOptions[i].subSize; j++) {
    629                 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
    630                     result = UCOL_TOK_SUCCESS | rulesOptions[i].subopts[j].attrVal + 1;
    631                 }
    632             }
    633         }
    634         if(result == 0) {
    635             *status = U_ILLEGAL_ARGUMENT_ERROR;
    636         }
    637         break;
    638     case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
    639         /* index to this array will be src->parsedToken.indirectIndex*/
    640         src->parsedToken.indirectIndex = 0;
    641         result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
    642         break;
    643     case OPTION_FIRST:
    644     case OPTION_LAST: /* first, last */
    645         for(j = 0; j<rulesOptions[i].subSize; j++) {
    646             if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
    647                 // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
    648                 // element of indirect boundaries is reserved for top.
    649                 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
    650                 result =  UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
    651             }
    652         }
    653         if(result == 0) {
    654             *status = U_ILLEGAL_ARGUMENT_ERROR;
    655         }
    656         break;
    657     case OPTION_OPTIMIZE:
    658     case OPTION_SUPPRESS_CONTRACTIONS:  // copy and remove are handled before normalization
    659         // we need to move end here
    660         src->current++; // skip opening brace
    661         while(src->current < src->end && noOpenBraces != 0) {
    662             if(*src->current == 0x005b) {
    663                 noOpenBraces++;
    664             } else if(*src->current == 0x005D) { // closing brace
    665                 noOpenBraces--;
    666             }
    667             src->current++;
    668         }
    669         result = UCOL_TOK_SUCCESS;
    670         break;
    671     default:
    672         *status = U_UNSUPPORTED_ERROR;
    673         break;
    674         }
    675     }
    676     src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current));
    677     return result;
    678 }
    679 
    680 
    681 inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
    682     if (stuff == NULL || len <= 0) {
    683         return;
    684     }
    685     UChar *tempStuff = (UChar *)stuff;
    686     if(src->extraCurrent+len >= src->extraEnd) {
    687         /* reallocate */
    688         if (stuff >= src->source && stuff <= src->end) {
    689           // Copy stuff to a new buffer if stuff points to an address within
    690           // src->source buffer.
    691           tempStuff = (UChar*)uprv_malloc(len*sizeof(UChar));
    692           if (tempStuff == NULL) {
    693             *status = U_MEMORY_ALLOCATION_ERROR;
    694             return;
    695           }
    696           uprv_memcpy(tempStuff, stuff, len*sizeof(UChar));
    697         }
    698         UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));
    699         if(newSrc != NULL) {
    700             src->current = newSrc + (src->current - src->source);
    701             src->extraCurrent = newSrc + (src->extraCurrent - src->source);
    702             src->end = newSrc + (src->end - src->source);
    703             src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
    704             src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
    705             src->source = newSrc;
    706         } else {
    707             *status = U_MEMORY_ALLOCATION_ERROR;
    708             if (tempStuff != stuff) {
    709                 uprv_free(tempStuff);
    710             }
    711             return;
    712         }
    713     }
    714     if(len == 1) {
    715         *src->extraCurrent++ = *tempStuff;
    716     } else {
    717         uprv_memcpy(src->extraCurrent, tempStuff, len*sizeof(UChar));
    718         src->extraCurrent += len;
    719     }
    720     if (tempStuff != stuff) {
    721         uprv_free(tempStuff);
    722     }
    723 }
    724 
    725 inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
    726     /*
    727     top = TRUE;
    728     */
    729     UChar buff[5];
    730     src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
    731     buff[0] = 0xFFFE;
    732     buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
    733     buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
    734     if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
    735         src->parsedToken.charsLen = 3;
    736         ucol_tok_addToExtraCurrent(src, buff, 3, status);
    737     } else {
    738         buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
    739         buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
    740         src->parsedToken.charsLen = 5;
    741         ucol_tok_addToExtraCurrent(src, buff, 5, status);
    742     }
    743     return TRUE;
    744 }
    745 
    746 static UBool isCharNewLine(UChar c){
    747     switch(c){
    748     case 0x000A: /* LF  */
    749     case 0x000D: /* CR  */
    750     case 0x000C: /* FF  */
    751     case 0x0085: /* NEL */
    752     case 0x2028: /* LS  */
    753     case 0x2029: /* PS  */
    754         return TRUE;
    755     default:
    756         return FALSE;
    757     }
    758 }
    759 
    /**
     * Reads the next token from the rule text, starting at src->current and
     * never looking at loop positions at or beyond src->end, and stores the
     * result in src->parsedToken (strength, character run, optional prefix,
     * optional expansion, flags).
     *
     * A token consists of a relation ('&', '=', ',', ';', '<', '<<', '<<<',
     * optionally followed by '*') and the characters that follow it. Scanning
     * stops when the next relation is met (src->current is then left ON that
     * relation so the following call starts there), when an option or a
     * starred relation completes the token early, or when the input ends.
     * Quoted text and text after a '|' is copied into the parser's extra
     * buffer through ucol_tok_addToExtraCurrent; offsets stored in the parsed
     * token are relative to src->source.
     *
     * @param src          parser state; current, prevStrength, parsedToken,
     *                     the extra buffer and (for '@') opts may be modified
     * @param startOfRules when TRUE and a strength relation is met first, the
     *                     token becomes a reset to indirect boundary index 5
     * @param parseError   filled in through syntaxError() on syntax errors
     * @param status       set to U_INVALID_FORMAT_ERROR on syntax errors
     * @return src->current after the token, or NULL when no relation was
     *         found before the end of input or when a syntax error was
     *         reported
     */
    760 U_CAPI const UChar* U_EXPORT2
    761 ucol_tok_parseNextToken(UColTokenParser *src,
    762                         UBool startOfRules,
    763                         UParseError *parseError,
    764                         UErrorCode *status)
    765 {
    766     /* parsing part */
            // variableTop / top : the token was produced by an option flagged
            //                     UCOL_TOK_VARIABLE_TOP / by ucol_tok_doSetTop
            // inChars           : TRUE while reading the token's characters,
            //                     FALSE once '/' switched to the expansion
            // inQuote           : currently between two apostrophes
            // wasInQuote        : characters are being copied to the extra buffer
            // before            : UCOL_TOK_BEFORE bits returned by the option reader
            // isEscaped         : the previous character was a backslash
    767     UBool variableTop = FALSE;
    768     UBool top = FALSE;
    769     UBool inChars = TRUE;
    770     UBool inQuote = FALSE;
    771     UBool wasInQuote = FALSE;
    772     uint8_t before = 0;
    773     UBool isEscaped = FALSE;
    774     // TODO: replace these variables with src->parsedToken counterparts
    775     // no need to use them anymore since we have src->parsedToken.
    776     // Ideally, token parser would be a nice class... Once, when I have
    777     // more time (around 2020 probably).
    778     uint32_t newExtensionLen = 0;
    779     uint32_t extensionOffset = 0;
    780     uint32_t newStrength = UCOL_TOK_UNSET;
    781     UChar buff[10];
    782     UChar32 codepoint;
    783
            // Start from an empty parsed token.
    784     src->parsedToken.charsOffset = 0;  src->parsedToken.charsLen = 0;
    785     src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
    786     src->parsedToken.indirectIndex = 0;
    787
            // Scan one code unit at a time until the token is complete or the input ends.
    788     while (src->current < src->end) {
    789         UChar ch = *(src->current);
    790
    791         if (inQuote) {
                    // Inside quotes: an apostrophe closes the quoted run, anything else is
                    // counted as a token or expansion character. The character itself is
                    // copied to the extra buffer by the wasInQuote block at the bottom of
                    // the loop.
    792             if (ch == 0x0027/*'\''*/) {
    793                 inQuote = FALSE;
    794             } else {
    795                 if ((src->parsedToken.charsLen == 0) || inChars) {
    796                     if(src->parsedToken.charsLen == 0) {
    797                         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
    798                     }
    799                     src->parsedToken.charsLen++;
    800                 } else {
    801                     if(newExtensionLen == 0) {
    802                         extensionOffset = (uint32_t)(src->extraCurrent - src->source);
    803                     }
    804                     newExtensionLen++;
    805                 }
    806             }
    807         }else if(isEscaped){
                    // Character following a backslash: counted literally, but only
                    // after a relation has been seen.
    808             isEscaped =FALSE;
    809             if (newStrength == UCOL_TOK_UNSET) {
    810                 *status = U_INVALID_FORMAT_ERROR;
    811                 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
    812                 return NULL;
    813                 // enabling rules to start with non-tokens a < b
    814                 // newStrength = UCOL_TOK_RESET;
    815             }
    816             if(ch != 0x0000  && src->current != src->end) {
    817                 if (inChars) {
    818                     if(src->parsedToken.charsLen == 0) {
    819                         src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
    820                     }
    821                     src->parsedToken.charsLen++;
    822                 } else {
    823                     if(newExtensionLen == 0) {
    824                         extensionOffset = (uint32_t)(src->current - src->source);
    825                     }
    826                     newExtensionLen++;
    827                 }
    828             }
    829         }else {
                    // Unquoted, unescaped text: rule white space is skipped, every
                    // other character is dispatched on below.
    830             if(!uprv_isRuleWhiteSpace(ch)) {
    831                 /* Sets the strength for this entry */
    832                 switch (ch) {
    833                 case 0x003D/*'='*/ :
                            // A relation met while one is already set ends the current token.
                            // src->current is NOT advanced, so the next call starts on this
                            // relation. The ',', ';', '<' and '&' cases use the same pattern.
    834                     if (newStrength != UCOL_TOK_UNSET) {
    835                         goto EndOfLoop;
    836                     }
    837
    838                     /* if we start with strength, we'll reset to top */
    839                     if(startOfRules == TRUE) {
    840                         src->parsedToken.indirectIndex = 5;
    841                         top = ucol_tok_doSetTop(src, status);
    842                         newStrength = UCOL_TOK_RESET;
    843                         goto EndOfLoop;
    844                     }
    845                     newStrength = UCOL_IDENTICAL;
                            // "=*": consume the '*' and remember the strength in prevStrength so
                            // that the following code points each become a token of their own
                            // (see the default case).
    846                     if(*(src->current+1) == 0x002A) {/*'*'*/
    847                         src->current++;
    848                         src->prevStrength = newStrength;
    849                     }else{
    850                         src->prevStrength = UCOL_TOK_UNSET;
    851                     }
    852                     break;
    853
    854                 case 0x002C/*','*/:
    855                     if (newStrength != UCOL_TOK_UNSET) {
    856                         goto EndOfLoop;
    857                     }
    858
    859                     /* if we start with strength, we'll reset to top */
    860                     if(startOfRules == TRUE) {
    861                         src->parsedToken.indirectIndex = 5;
    862                         top = ucol_tok_doSetTop(src, status);
    863                         newStrength = UCOL_TOK_RESET;
    864                         goto EndOfLoop;
    865                     }
    866                     newStrength = UCOL_TERTIARY;
    867                     src->prevStrength = UCOL_TOK_UNSET;
    868                     break;
    869
    870                 case  0x003B/*';'*/:
    871                     if (newStrength != UCOL_TOK_UNSET) {
    872                         goto EndOfLoop;
    873                     }
    874
    875                     /* if we start with strength, we'll reset to top */
    876                     if(startOfRules == TRUE) {
    877                         src->parsedToken.indirectIndex = 5;
    878                         top = ucol_tok_doSetTop(src, status);
    879                         newStrength = UCOL_TOK_RESET;
    880                         goto EndOfLoop;
    881                     }
    882                     newStrength = UCOL_SECONDARY;
    883                     src->prevStrength = UCOL_TOK_UNSET;
    884                     break;
    885
    886                 case 0x003C/*'<'*/:
    887                     if (newStrength != UCOL_TOK_UNSET) {
    888                         goto EndOfLoop;
    889                     }
    890
    891                     /* if we start with strength, we'll reset to top */
    892                     if(startOfRules == TRUE) {
    893                         src->parsedToken.indirectIndex = 5;
    894                         top = ucol_tok_doSetTop(src, status);
    895                         newStrength = UCOL_TOK_RESET;
    896                         goto EndOfLoop;
    897                     }
    898                     /* before this, do a scan to verify whether this is */
    899                     /* another strength */
    900                     if(*(src->current+1) == 0x003C) {
    901                         src->current++;
    902                         if(*(src->current+1) == 0x003C) {
    903                             src->current++; /* three in a row! */
    904                             newStrength = UCOL_TERTIARY;
    905                         } else { /* two in a row */
    906                             newStrength = UCOL_SECONDARY;
    907                         }
    908                     } else { /* just one */
    909                         newStrength = UCOL_PRIMARY;
    910                     }
                            // Optional '*' after '<', '<<' or '<<<': same handling as for "=*".
    911                     if(*(src->current+1) == 0x002A) {/*'*'*/
    912                         src->current++;
    913                         src->prevStrength = newStrength;
    914                     }else{
    915                         src->prevStrength = UCOL_TOK_UNSET;
    916                     }
    917                     break;
    918
    919                 case 0x0026/*'&'*/:
    920                     if (newStrength != UCOL_TOK_UNSET) {
    921                         /**/
    922                         goto EndOfLoop;
    923                     }
    924
    925                     newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
    926                     src->prevStrength = UCOL_TOK_UNSET;
    927                     break;
    928
    929                 case 0x005b/*'['*/:
    930                     /* options - read an option, analyze it */
                            // Only attempted when u_strchr finds a ']' from the current position
                            // on; otherwise the '[' is skipped like any ignored character.
                            // NOTE(review): the three inner "else" branches below record
                            // U_INVALID_FORMAT_ERROR but do not return, so scanning continues
                            // with the status already set - confirm this is intended.
    931                     if(u_strchr(src->current, 0x005d /*']'*/) != NULL) {
    932                         uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
    933                         if(U_SUCCESS(*status)) {
    934                             if(result & UCOL_TOK_TOP) {
                                        // Indirect reset position: only legal directly after '&'.
    935                                 if(newStrength == UCOL_TOK_RESET) {
    936                                     top = ucol_tok_doSetTop(src, status);
    937                                     if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
    938                                         src->parsedToken.charsLen+=2;
    939                                         buff[0] = 0x002d;
    940                                         buff[1] = before;
    941                                         ucol_tok_addToExtraCurrent(src, buff, 2, status);
    942                                     }
    943
    944                                     src->current++;
    945                                     goto EndOfLoop;
    946                                 } else {
    947                                     *status = U_INVALID_FORMAT_ERROR;
    948                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
    949                                 }
    950                             } else if(result & UCOL_TOK_VARIABLE_TOP) {
                                        // Legal only after a strength relation (not after '&' and not
                                        // before any relation); the token is the single marker 0xFFFF.
    951                                 if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
    952                                     variableTop = TRUE;
    953                                     src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
    954                                     src->parsedToken.charsLen = 1;
    955                                     buff[0] = 0xFFFF;
    956                                     ucol_tok_addToExtraCurrent(src, buff, 1, status);
    957                                     src->current++;
    958                                     goto EndOfLoop;
    959                                 } else {
    960                                     *status = U_INVALID_FORMAT_ERROR;
    961                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
    962                                 }
    963                             } else if (result & UCOL_TOK_BEFORE){
                                        // Remember the "before" bits; they end up in parsedToken.flags.
    964                                 if(newStrength == UCOL_TOK_RESET) {
    965                                     before = result & UCOL_TOK_BEFORE;
    966                                 } else {
    967                                     *status = U_INVALID_FORMAT_ERROR;
    968                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
    969
    970                                 }
    971                             }
    972                         } else {
    973                             *status = U_INVALID_FORMAT_ERROR;
    974                             syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
    975                             return NULL;
    976                         }
    977                     }
    978                     break;
    979                 case 0x0021/*! skip java thai modifier reordering*/:
    980                     break;
    981                 case 0x002F/*'/'*/:
    982                     wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
    983                     inChars = FALSE; /* we're now processing expansion */
    984                     break;
    985                 case 0x005C /* back slash for escaped chars */:
    986                     isEscaped = TRUE;
    987                     break;
    988                     /* found a quote, we're gonna start copying */
    989                 case 0x0027/*'\''*/:
    990                     if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
    991                         if(src->prevStrength == UCOL_TOK_UNSET){
    992                             *status = U_INVALID_FORMAT_ERROR;
    993                             syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
    994                             return NULL;
    995                             // enabling rules to start with a non-token character a < b
    996                             // newStrength = UCOL_TOK_RESET;
    997                         }else{
    998                             newStrength = src->prevStrength;
    999                         }
   1000                     }
   1001
   1002                     inQuote = TRUE;
   1003
                            // From here on the run lives in the extra buffer. Characters already
                            // counted sit directly before src->current in the rule text and are
                            // copied over first. The "++" on the length accounts for the
                            // character fetched after the quote further down.
   1004                     if(inChars) { /* we're doing characters */
   1005                         if(wasInQuote == FALSE) {
   1006                             src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
   1007                         }
   1008                         if (src->parsedToken.charsLen != 0) {
   1009                             ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
   1010                         }
   1011                         src->parsedToken.charsLen++;
   1012                     } else { /* we're doing an expansion */
   1013                         if(wasInQuote == FALSE) {
   1014                             extensionOffset = (uint32_t)(src->extraCurrent - src->source);
   1015                         }
   1016                         if (newExtensionLen != 0) {
   1017                             ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
   1018                         }
   1019                         newExtensionLen++;
   1020                     }
   1021
   1022                     wasInQuote = TRUE;
   1023
                            // Look at the character after the opening quote: a second apostrophe
                            // is a literal quote, copied here, and quoting ends immediately.
   1024                     ch = *(++(src->current));
   1025                     if(ch == 0x0027) { /* copy the double quote */
   1026                         ucol_tok_addToExtraCurrent(src, &ch, 1, status);
   1027                         inQuote = FALSE;
   1028                     }
   1029                     break;
   1030
   1031                     /* '@' is french only if the strength is not currently set */
   1032                     /* if it is, it's just a regular character in collation rules */
   1033                 case 0x0040/*'@'*/:
   1034                     if (newStrength == UCOL_TOK_UNSET) {
   1035                         src->opts->frenchCollation = UCOL_ON;
   1036                         break;
   1037                     }
   1038
                            // No break here: with a strength already set, '@' falls through into
                            // the '|' handling below.
                            // NOTE(review): the comment above calls '@' "a regular character" in
                            // that situation, which would be the default case - confirm the
                            // fall-through is intended.
   1039                 case 0x007C /*|*/: /* this means we have actually been reading prefix part */
   1040                     // we want to store read characters to the prefix part and continue reading
   1041                     // the characters (proper way would be to restart reading the chars, but in
   1042                     // that case we would have to complicate the token hasher, which I do not
   1043                     // intend to play with. Instead, we will do prefixes when prefixes are due
   1044                     // (before adding the elements).
   1045                     src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
   1046                     src->parsedToken.prefixLen = src->parsedToken.charsLen;
   1047
   1048                     if(inChars) { /* we're doing characters */
   1049                         if(wasInQuote == FALSE) {
   1050                             src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
   1051                         }
   1052                         if (src->parsedToken.charsLen != 0) {
   1053                             ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
   1054                         }
   1055                         src->parsedToken.charsLen++;
   1056                     }
   1057
   1058                     wasInQuote = TRUE;
   1059
                            // ch ends up as the first non-white-space character after the '|';
                            // the wasInQuote block at the bottom of the loop copies it.
   1060                     do {
   1061                         ch = *(++(src->current));
   1062                         // skip whitespace between '|' and the character
   1063                     } while (uprv_isRuleWhiteSpace(ch));
   1064                     break;
   1065
   1066                     //charsOffset = 0;
   1067                     //newCharsLen = 0;
   1068                     //break; // We want to store the whole prefix/character sequence. If we break
   1069                     // the '|' is going to get lost.
   1070                 case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
                            // NOTE(review): this loop stops only on a newline character and does
                            // not compare src->current against src->end.
   1071                     do {
   1072                         ch = *(++(src->current));
   1073                     } while (!isCharNewLine(ch));
   1074
   1075                     break;
   1076                 default:
                            // Ordinary character. Without a relation it is only legal when a
                            // starred relation is still in effect (prevStrength set), in which
                            // case that strength is reused.
   1077                     if (newStrength == UCOL_TOK_UNSET) {
   1078                         if(src->prevStrength == UCOL_TOK_UNSET){
   1079                             *status = U_INVALID_FORMAT_ERROR;
   1080                             syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1081                             return NULL;
   1082                         }else{
   1083                             newStrength = src->prevStrength;
   1084                         }
   1085                     }
   1086
   1087                     if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
   1088                         *status = U_INVALID_FORMAT_ERROR;
   1089                         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1090                         return NULL;
   1091                     }
   1092
                            // A NUL in the very last position is not counted as a character.
   1093                     if(ch == 0x0000 && src->current+1 == src->end) {
   1094                         break;
   1095                     }
   1096
   1097                     if (inChars) {
   1098                         if(src->parsedToken.charsLen == 0) {
   1099                             src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
   1100                         }
   1101                         src->parsedToken.charsLen++;
                                // Starred relation: the token is exactly one code point. U16_NEXT
                                // moves src->current past it, and the length is corrected when the
                                // code point takes two code units.
   1102                         if(src->prevStrength != UCOL_TOK_UNSET){
   1103                             U16_NEXT(0, src->current, src->end, codepoint);
   1104                             src->parsedToken.charsLen+= U16_LENGTH(codepoint) - 1;
   1105                             goto EndOfLoop;
   1106                         }
   1107                     } else {
   1108                         if(newExtensionLen == 0) {
   1109                             extensionOffset = (uint32_t)(src->current - src->source);
   1110                         }
   1111                         newExtensionLen++;
   1112                     }
   1113
   1114                     break;
   1115                 }
   1116             }
   1117         }
   1118
                // While copying (after a quote or a '|'): under a starred relation the token
                // ends as soon as we are outside quotes; otherwise the current character is
                // appended to the extra buffer, except apostrophes and unquoted white space.
   1119         if(wasInQuote) {
   1120             if(src->prevStrength != UCOL_TOK_UNSET && !inQuote){
   1121                 src->current++;
   1122                 goto EndOfLoop;
   1123             }
   1124             if(ch != 0x27) {
   1125                 if(inQuote || !uprv_isRuleWhiteSpace(ch)) {
   1126                     ucol_tok_addToExtraCurrent(src, &ch, 1, status);
   1127                 }
   1128             }
   1129         }
   1130
   1131         src->current++;
   1132     }
   1133
        // Reached by goto when a token is complete, or by falling out of the loop at
        // the end of the input.
   1134 EndOfLoop:
   1135     wasInQuote = FALSE;
            // No relation found at all: nothing (more) to parse. status is left untouched.
   1136     if (newStrength == UCOL_TOK_UNSET) {
   1137         return NULL;
   1138     }
   1139
            // A relation without any characters is an error unless ucol_tok_doSetTop
            // supplied the token.
   1140     if (src->parsedToken.charsLen == 0 && top == FALSE) {
   1141         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1142         *status = U_INVALID_FORMAT_ERROR;
   1143         return NULL;
   1144     }
   1145
            // Publish the locally tracked pieces in the parsed token.
   1146     src->parsedToken.strength = newStrength;
   1147     src->parsedToken.extensionOffset = extensionOffset;
   1148     src->parsedToken.extensionLen = newExtensionLen;
   1149     src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
   1150
   1151     return src->current;
   1152 }
   1153 
   1154 /*
   1155 Processing Description
   1156 1 Build a ListList. Each list has a header, which contains two lists (positive
   1157 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
   1158 reset may be null.
   1159 2 As you process, you keep a LAST pointer that points to the last token you
   1160 handled.
   1161 */
   1162 
   1163 static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext,
   1164                                       UParseError *parseError, UErrorCode *status)
   1165 {
   1166     if(src->resultLen == src->listCapacity) {
   1167         // Unfortunately, this won't work, as we store addresses of lhs in token
   1168         src->listCapacity *= 2;
   1169         src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
   1170         if(src->lh == NULL) {
   1171             *status = U_MEMORY_ALLOCATION_ERROR;
   1172             return NULL;
   1173         }
   1174     }
   1175     /* do the reset thing */
   1176     UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
   1177     /* test for NULL */
   1178     if (sourceToken == NULL) {
   1179         *status = U_MEMORY_ALLOCATION_ERROR;
   1180         return NULL;
   1181     }
   1182     sourceToken->rulesToParse = src->source;
   1183     sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
   1184     sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
   1185 
   1186     sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
   1187     sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
   1188 
   1189     // keep the flags around so that we know about before
   1190     sourceToken->flags = src->parsedToken.flags;
   1191 
   1192     if(src->parsedToken.prefixOffset != 0) {
   1193         // this is a syntax error
   1194         *status = U_INVALID_FORMAT_ERROR;
   1195         syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
   1196         uprv_free(sourceToken);
   1197         return 0;
   1198     } else {
   1199         sourceToken->prefix = 0;
   1200     }
   1201 
   1202     sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
   1203     sourceToken->strength = UCOL_TOK_RESET;
   1204     sourceToken->next = NULL;
   1205     sourceToken->previous = NULL;
   1206     sourceToken->noOfCEs = 0;
   1207     sourceToken->noOfExpCEs = 0;
   1208     sourceToken->listHeader = &src->lh[src->resultLen];
   1209 
   1210     src->lh[src->resultLen].first = NULL;
   1211     src->lh[src->resultLen].last = NULL;
   1212     src->lh[src->resultLen].first = NULL;
   1213     src->lh[src->resultLen].last = NULL;
   1214 
   1215     src->lh[src->resultLen].reset = sourceToken;
   1216 
   1217     /*
   1218     3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
   1219     First convert all expansions into normal form. Examples:
   1220     If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
   1221     d * ... into &x * c/y * d * ...
   1222     Note: reset values can never have expansions, although they can cause the
   1223     very next item to have one. They may be contractions, if they are found
   1224     earlier in the list.
   1225     */
   1226     *expandNext = 0;
   1227     if(expand != NULL) {
   1228         /* check to see if there is an expansion */
   1229         if(src->parsedToken.charsLen > 1) {
   1230             uint32_t resetCharsOffset;
   1231             resetCharsOffset = (uint32_t)(expand - src->source);
   1232             sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
   1233             *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
   1234         }
   1235     }
   1236 
   1237     src->resultLen++;
   1238 
   1239     uhash_put(src->tailored, sourceToken, sourceToken, status);
   1240 
   1241     return sourceToken;
   1242 }
   1243 
   1244 static
   1245 inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
   1246     if(U_FAILURE(*status)) {
   1247         return NULL;
   1248     }
   1249     /* this is a virgin before - we need to fish the anchor from the UCA */
   1250     collIterate s;
   1251     uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
   1252     uint32_t CE, SecondCE;
   1253     uint32_t invPos;
   1254     if(sourceToken != NULL) {
   1255         uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status);
   1256     } else {
   1257         uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status);
   1258     }
   1259     if(U_FAILURE(*status)) {
   1260         return NULL;
   1261     }
   1262 
   1263     baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
   1264     baseContCE = ucol_getNextCE(src->UCA, &s, status);
   1265     if(baseContCE == UCOL_NO_MORE_CES) {
   1266         baseContCE = 0;
   1267     }
   1268 
   1269 
   1270     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
   1271     uint32_t ch = 0;
   1272     uint32_t expandNext = 0;
   1273     UColToken key;
   1274 
   1275     if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
   1276         uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
   1277         uint32_t raw = uprv_uca_getRawFromImplicit(primary);
   1278         ch = uprv_uca_getCodePointFromRaw(raw-1);
   1279         uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
   1280         CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
   1281         SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
   1282 
   1283         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
   1284         *src->extraCurrent++ = 0xFFFE;
   1285         *src->extraCurrent++ = (UChar)ch;
   1286         src->parsedToken.charsLen++;
   1287 
   1288         key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
   1289         key.rulesToParse = src->source;
   1290 
   1291         //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
   1292         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
   1293 
   1294         if(sourceToken == NULL) {
   1295             src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
   1296             if(isContinuation(SecondCE)) {
   1297                 src->lh[src->resultLen].baseContCE = SecondCE;
   1298             } else {
   1299                 src->lh[src->resultLen].baseContCE = 0;
   1300             }
   1301             src->lh[src->resultLen].nextCE = 0;
   1302             src->lh[src->resultLen].nextContCE = 0;
   1303             src->lh[src->resultLen].previousCE = 0;
   1304             src->lh[src->resultLen].previousContCE = 0;
   1305 
   1306             src->lh[src->resultLen].indirect = FALSE;
   1307 
   1308             sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
   1309         }
   1310 
   1311     } else {
   1312         invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
   1313 
   1314         // we got the previous CE. Now we need to see if the difference between
   1315         // the two CEs is really of the requested strength.
   1316         // if it's a bigger difference (we asked for secondary and got primary), we
   1317         // need to modify the CE.
   1318         if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
   1319             // adjust the strength
   1320             // now we are in the situation where our baseCE should actually be modified in
   1321             // order to get the CE in the right position.
   1322             if(strength == UCOL_SECONDARY) {
   1323                 CE = baseCE - 0x0200;
   1324             } else { // strength == UCOL_TERTIARY
   1325                 CE = baseCE - 0x02;
   1326             }
   1327             if(baseContCE) {
   1328                 if(strength == UCOL_SECONDARY) {
   1329                     SecondCE = baseContCE - 0x0200;
   1330                 } else { // strength == UCOL_TERTIARY
   1331                     SecondCE = baseContCE - 0x02;
   1332                 }
   1333             }
   1334         }
   1335 
   1336 #if 0
   1337         // the code below relies on getting a code point from the inverse table, in order to be
   1338         // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
   1339         // 1. There are many code points that have the same CE
   1340         // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
   1341         // Also, in case when there is no equivalent strength before an element, we have to actually
   1342         // construct one. For example, &[before 2]a << x won't result in x << a, because the element
   1343         // before a is a primary difference.
   1344 
   1345         //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
   1346 
   1347 
   1348         ch = CETable[3*invPos+2];
   1349 
   1350         if((ch &  UCOL_INV_SIZEMASK) != 0) {
   1351             uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
   1352             uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
   1353             ch = conts[offset];
   1354         }
   1355 
   1356         *src->extraCurrent++ = (UChar)ch;
   1357         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
   1358         src->parsedToken.charsLen = 1;
   1359 
   1360         // We got an UCA before. However, this might have been tailored.
   1361         // example:
   1362         // &\u30ca = \u306a
   1363         // &[before 3]\u306a<<<\u306a|\u309d
   1364 
   1365 
   1366         // uint32_t key = (*newCharsLen << 24) | *charsOffset;
   1367         key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
   1368         key.rulesToParse = src->source;
   1369 
   1370         //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
   1371         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
   1372 #endif
   1373 
   1374         // here is how it should be. The situation such as &[before 1]a < x, should be
   1375         // resolved exactly as if we wrote &a > x.
   1376         // therefore, I don't really care if the UCA value before a has been changed.
   1377         // However, I do care if the strength between my element and the previous element
   1378         // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
   1379         // have to construct the base CE.
   1380 
   1381 
   1382 
   1383         // if we found a tailored thing, we have to use the UCA value and construct
   1384         // a new reset token with constructed name
   1385         //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
   1386         // character to which we want to anchor is already tailored.
   1387         // We need to construct a new token which will be the anchor
   1388         // point
   1389         //*(src->extraCurrent-1) = 0xFFFE;
   1390         //*src->extraCurrent++ = (UChar)ch;
   1391         // grab before
   1392         src->parsedToken.charsOffset -= 10;
   1393         src->parsedToken.charsLen += 10;
   1394         src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
   1395         if(isContinuation(SecondCE)) {
   1396             src->lh[src->resultLen].baseContCE = SecondCE;
   1397         } else {
   1398             src->lh[src->resultLen].baseContCE = 0;
   1399         }
   1400         src->lh[src->resultLen].nextCE = 0;
   1401         src->lh[src->resultLen].nextContCE = 0;
   1402         src->lh[src->resultLen].previousCE = 0;
   1403         src->lh[src->resultLen].previousContCE = 0;
   1404 
   1405         src->lh[src->resultLen].indirect = FALSE;
   1406 
   1407         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
   1408         //}
   1409     }
   1410 
   1411     return sourceToken;
   1412 
   1413 }
   1414 
   1415 uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
            /*
             * Parses the rules held by src (src->current up to src->end) one token at
             * a time with ucol_tok_parseNextToken() and links every token into the
             * token lists whose headers live in src->lh.
             *
             *  - A relation token (anything but a reset) is looked up in src->tailored;
             *    it is allocated and registered there when missing, or unlinked from
             *    its current list when present, and is then inserted into the list of
             *    the previous token according to its strength.
             *  - A reset token selects the anchor for what follows: an already
             *    tailored token (possibly a shorter prefix, the remainder becoming an
             *    implicit expansion), a position derived for [before n], an indirect
             *    boundary, or a freshly initialized list header.
             *
             * src         parser state; updated in place.
             * parseError  handed to syntaxError() and to the parsing helpers.
             * status      in/out error code; no rule is parsed if it already holds a
             *             failure on entry.
             *
             * Returns src->resultLen once all rules are consumed (a trailing list
             * header without tokens is dropped first), or 0 as soon as an error is
             * detected.
             */
   1416     UColToken *lastToken = NULL;
   1417     const UChar *parseEnd = NULL;
   1418     uint32_t expandNext = 0;
   1419     UBool variableTop = FALSE;
   1420     UBool top = FALSE;
   1421     uint16_t specs = 0;
   1423 
   1424     src->parsedToken.strength = UCOL_TOK_UNSET;
   1425 
            /* The list headers are always addressed through src->lh (never through a */
            /* local copy of the pointer), exactly as the helpers that receive src do, */
            /* so this function cannot end up looking at a different array than they do. */
   1427 
   1428     if(U_FAILURE(*status)) {
   1429         return 0;
   1430     }
   1431 
   1432     while(src->current < src->end) {
   1433         src->parsedToken.prefixOffset = 0;
   1434 
   1435         parseEnd = ucol_tok_parseNextToken(src,
   1436             (UBool)(lastToken == NULL),
   1437             parseError,
   1438             status);
   1439 
   1440         specs = src->parsedToken.flags;
   1441 
   1442 
   1443         variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
   1444         top = ((specs & UCOL_TOK_TOP) != 0);
   1445 
   1446         if(U_SUCCESS(*status) && parseEnd != NULL) {
   1447             UColToken *sourceToken = NULL;
   1448             //uint32_t key = 0;
   1449             uint32_t lastStrength = UCOL_TOK_UNSET;
   1450 
   1451             if(lastToken != NULL ) {
   1452                 lastStrength = lastToken->strength;
   1453             }
   1454 
   1455             //key = newCharsLen << 24 | charsOffset;
                    /* lookup key: length in the top 8 bits, offset into src->source below */
   1456             UColToken key;
   1457             key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
   1458             key.rulesToParse = src->source;
   1459 
   1460             /*  4 Lookup each source in the CharsToToken map, and find a sourceToken */
   1461             sourceToken = (UColToken *)uhash_get(src->tailored, &key);
   1462 
   1463             if(src->parsedToken.strength != UCOL_TOK_RESET) {
   1464                 if(lastToken == NULL) { /* this means that rules haven't started properly */
   1465                     *status = U_INVALID_FORMAT_ERROR;
   1466                     syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
   1467                     return 0;
   1468                 }
   1469                 /*  6 Otherwise (when relation != reset) */
   1470                 if(sourceToken == NULL) {
   1471                     /* If sourceToken is null, create new one, */
   1472                     sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
   1473                     /* test for NULL */
   1474                     if (sourceToken == NULL) {
   1475                         *status = U_MEMORY_ALLOCATION_ERROR;
   1476                         return 0;
   1477                     }
   1478                     sourceToken->rulesToParse = src->source;
   1479                     sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
   1480 
   1481                     sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
   1482 
   1483                     sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
   1484                     sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
   1485 
   1486                     sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
   1487                     sourceToken->next = NULL;
   1488                     sourceToken->previous = NULL;
   1489                     sourceToken->noOfCEs = 0;
   1490                     sourceToken->noOfExpCEs = 0;
   1491                     // keep the flags around so that we know about before
   1492                     sourceToken->flags = src->parsedToken.flags;
                            /* the token serves as both key and value of its hash entry */
   1493                     uhash_put(src->tailored, sourceToken, sourceToken, status);
   1494                     if(U_FAILURE(*status)) {
   1495                         return 0;
   1496                     }
   1497                 } else {
   1498                     /* we could have fished out a reset here */
   1499                     if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
   1500                         /* otherwise remove sourceToken from where it was. */
   1501                         if(sourceToken->next != NULL) {
                                    /* the successor takes over the stronger of the two relations */
   1502                             if(sourceToken->next->strength > sourceToken->strength) {
   1503                                 sourceToken->next->strength = sourceToken->strength;
   1504                             }
   1505                             sourceToken->next->previous = sourceToken->previous;
   1506                         } else {
   1507                             sourceToken->listHeader->last = sourceToken->previous;
   1508                         }
   1509 
   1510                         if(sourceToken->previous != NULL) {
   1511                             sourceToken->previous->next = sourceToken->next;
   1512                         } else {
   1513                             sourceToken->listHeader->first = sourceToken->next;
   1514                         }
   1515                         sourceToken->next = NULL;
   1516                         sourceToken->previous = NULL;
   1517                     }
   1518                 }
   1519 
   1520                 sourceToken->strength = src->parsedToken.strength;
   1521                 sourceToken->listHeader = lastToken->listHeader;
   1522 
   1523                 /*
   1524                 1.  Find the strongest strength in each list, and set strongestP and strongestN
   1525                 accordingly in the headers.
   1526                 */
   1527                 if(lastStrength == UCOL_TOK_RESET
   1528                     || sourceToken->listHeader->first == 0) {
   1529                         /* If LAST is a reset
   1530                         insert sourceToken in the list. */
   1531                         if(sourceToken->listHeader->first == 0) {
   1532                             sourceToken->listHeader->first = sourceToken;
   1533                             sourceToken->listHeader->last = sourceToken;
   1534                         } else { /* we need to find a place for us */
   1535                             /* and we'll get in front of the same strength */
   1536                             if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
   1537                                 sourceToken->next = sourceToken->listHeader->first;
   1538                                 sourceToken->next->previous = sourceToken;
   1539                                 sourceToken->listHeader->first = sourceToken;
   1540                                 sourceToken->previous = NULL;
   1541                             } else {
                                        /* from here on lastToken is reused as the insertion cursor */
   1542                                 lastToken = sourceToken->listHeader->first;
   1543                                 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
   1544                                     lastToken = lastToken->next;
   1545                                 }
   1546                                 if(lastToken->next != NULL) {
   1547                                     lastToken->next->previous = sourceToken;
   1548                                 } else {
   1549                                     sourceToken->listHeader->last = sourceToken;
   1550                                 }
   1551                                 sourceToken->previous = lastToken;
   1552                                 sourceToken->next = lastToken->next;
   1553                                 lastToken->next = sourceToken;
   1554                             }
   1555                         }
   1556                     } else {
   1557                         /* Otherwise (when LAST is not a reset)
   1558                         if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
   1559                         otherwise insert before.
   1560                         when inserting after or before, search to the next position with the same
   1561                         strength in that direction. (This is called postpone insertion).         */
   1562                         if(sourceToken != lastToken) {
   1563                             if(lastToken->polarity == sourceToken->polarity) {
   1564                                 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
   1565                                     lastToken = lastToken->next;
   1566                                 }
   1567                                 sourceToken->previous = lastToken;
   1568                                 if(lastToken->next != NULL) {
   1569                                     lastToken->next->previous = sourceToken;
   1570                                 } else {
   1571                                     sourceToken->listHeader->last = sourceToken;
   1572                                 }
   1573 
   1574                                 sourceToken->next = lastToken->next;
   1575                                 lastToken->next = sourceToken;
   1576                             } else {
   1577                                 while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
   1578                                     lastToken = lastToken->previous;
   1579                                 }
   1580                                 sourceToken->next = lastToken;
   1581                                 if(lastToken->previous != NULL) {
   1582                                     lastToken->previous->next = sourceToken;
   1583                                 } else {
   1584                                     sourceToken->listHeader->first = sourceToken;
   1585                                 }
   1586                                 sourceToken->previous = lastToken->previous;
   1587                                 lastToken->previous = sourceToken;
   1588                             }
   1589                         } else { /* repeated one thing twice in rules, stay with the stronger strength */
   1590                             if(lastStrength < sourceToken->strength) {
   1591                                 sourceToken->strength = lastStrength;
   1592                             }
   1593                         }
   1594                     }
   1595 
   1596                     /* if the token was a variable top, we're gonna put it in */
   1597                     if(variableTop == TRUE && src->varTop == NULL) {
   1598                         variableTop = FALSE;
   1599                         src->varTop = sourceToken;
   1600                     }
   1601 
   1602                     // Treat the expansions.
   1603                     // There are two types of expansions: explicit (x / y) and reset based propagating expansions
   1604                     // (&abc * d * e <=> &ab * d / c * e / c)
   1605                     // if both of them are in effect for a token, they are combined.
   1606 
   1607                     sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
   1608 
   1609                     if(expandNext != 0) {
   1610                         if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
   1611                             expandNext = 0;
   1612                         } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
   1613                             sourceToken->expansion = expandNext;
   1614                         } else { /* there is both explicit and implicit expansion. We need to make a combination */
                                    /* The combined text (implicit part first, then the explicit part) is */
                                    /* appended at src->extraCurrent and the expansion is pointed at it. */
                                    /* NOTE(review): nothing here compares the write against src->extraEnd; */
                                    /* confirm that the extra space is always large enough at this point. */
   1615                             uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
   1616                             uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
   1617                             sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
   1618                             src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
   1619                         }
   1620                     }
   1621 
   1622                     // This is just for debugging purposes
   1623                     if(sourceToken->expansion != 0) {
   1624                         sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
   1625                     } else {
   1626                         sourceToken->debugExpansion = 0;
   1627                     }
   1628                     // if the previous token was a reset before, the strength of this
   1629                     // token must match the strength of before. Otherwise we have an
   1630                     // undefined situation.
   1631                     // In other words, we currently have a kludge which we use to
   1632                     // represent &a >> x. This is written as &[before 2]a << x.
                            // NOTE(review): the insertion loops above may have moved lastToken, so
                            // this can inspect a token other than the one that preceded sourceToken
                            // in the rules - confirm that this is intended.
   1633                     if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
   1634                         uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
   1635                         if(beforeStrength != sourceToken->strength) {
   1636                             *status = U_INVALID_FORMAT_ERROR;
   1637                             syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
   1638                             return 0;
   1639                         }
   1640                     }
   1641             } else {
   1642                 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
   1643                     /* if the previous token was also a reset, */
   1644                     /*this means that we have two consecutive resets */
   1645                     /* and we want to remove the previous one if empty*/
   1646                     if(src->resultLen > 0 && src->lh[src->resultLen-1].first == NULL) {
   1647                         src->resultLen--;
   1648                     }
   1649                 }
   1650 
   1651                 if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
   1652                     uint32_t searchCharsLen = src->parsedToken.charsLen;
   1653                     while(searchCharsLen > 1 && sourceToken == NULL) {
   1654                         searchCharsLen--;
   1655                         //key = searchCharsLen << 24 | charsOffset;
   1656                         UColToken key;
   1657                         key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
   1658                         key.rulesToParse = src->source;
   1659                         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
   1660                     }
   1661                     if(sourceToken != NULL) {
                                /* the characters after the matched prefix become the implicit expansion */
   1662                         expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
   1663                     }
   1664                 }
   1665 
   1666                 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
   1667                     if(top == FALSE) { /* there is no indirection */
   1668                         uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
   1669                         if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
   1670                             /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
   1671                             while(sourceToken->strength > strength && sourceToken->previous != NULL) {
   1672                                 sourceToken = sourceToken->previous;
   1673                             }
   1674                             /* here, either we hit the strength or NULL */
   1675                             if(sourceToken->strength == strength) {
   1676                                 if(sourceToken->previous != NULL) {
   1677                                     sourceToken = sourceToken->previous;
   1678                                 } else { /* start of list */
   1679                                     sourceToken = sourceToken->listHeader->reset;
   1680                                 }
   1681                             } else { /* we hit NULL */
   1682                                 /* we should be doing the else part */
   1683                                 sourceToken = sourceToken->listHeader->reset;
   1684                                 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
   1685                             }
   1686                         } else {
   1687                             sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
   1688                         }
   1689                     } else { /* this is both before and indirection */
   1690                         top = FALSE;
   1691                         src->lh[src->resultLen].previousCE = 0;
   1692                         src->lh[src->resultLen].previousContCE = 0;
   1693                         src->lh[src->resultLen].indirect = TRUE;
   1694                         /* we need to do slightly more work. we need to get the baseCE using the */
   1695                         /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
   1696                         /* in ucol_bld */
   1697                         uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
   1698                         uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
   1699                         uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
   1700                         uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
   1701 
   1702                         UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
   1703                         if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
                                    /* join the primary bits of both CEs, step back one raw implicit */
                                    /* value, and split the result over a CE and its continuation */
   1704                             uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
   1705                             uint32_t raw = uprv_uca_getRawFromImplicit(primary);
   1706                             uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
   1707                             CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
   1708                             SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
   1709                         } else {
   1710                             /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
   1711                             ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
   1712                         }
   1713 
   1714                         src->lh[src->resultLen].baseCE = CE;
   1715                         src->lh[src->resultLen].baseContCE = SecondCE;
   1716                         src->lh[src->resultLen].nextCE = 0;
   1717                         src->lh[src->resultLen].nextContCE = 0;
   1718 
   1719                         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
   1720                     }
   1721                 }
   1722 
                        /* The [before] handling above goes through helpers that report errors in */
                        /* *status. Stop here on failure so that a missing result is not mistaken */
                        /* for "no token yet" and silently turned into an ordinary reset below. */
                        if(U_FAILURE(*status)) {
                            return 0;
                        }
   1723 
   1724                 /*  5 If the relation is a reset:
   1725                 If sourceToken is null
   1726                 Create new list, create new sourceToken, make the baseCE from source, put
   1727                 the sourceToken in ListHeader of the new list */
   1728                 if(sourceToken == NULL) {
   1729                     /*
   1730                     3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
   1731                     First convert all expansions into normal form. Examples:
   1732                     If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
   1733                     d * ... into &x * c/y * d * ...
   1734                     Note: reset values can never have expansions, although they can cause the
   1735                     very next item to have one. They may be contractions, if they are found
   1736                     earlier in the list.
   1737                     */
   1738                     if(top == FALSE) {
   1739                         collIterate s;
   1740                         uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
   1741 
   1742                         uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status);
   1743 
   1744                         CE = ucol_getNextCE(src->UCA, &s, status);
                                /* iterator position after the first CE; passed on to ucol_tok_initAReset */
   1745                         const UChar *expand = s.pos;
   1746                         SecondCE = ucol_getNextCE(src->UCA, &s, status);
   1747 
   1748                         src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
   1749                         if(isContinuation(SecondCE)) {
   1750                             src->lh[src->resultLen].baseContCE = SecondCE;
   1751                         } else {
   1752                             src->lh[src->resultLen].baseContCE = 0;
   1753                         }
   1754                         src->lh[src->resultLen].nextCE = 0;
   1755                         src->lh[src->resultLen].nextContCE = 0;
   1756                         src->lh[src->resultLen].previousCE = 0;
   1757                         src->lh[src->resultLen].previousContCE = 0;
   1758                         src->lh[src->resultLen].indirect = FALSE;
   1759                         sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
   1760                     } else { /* top == TRUE */
   1761                         /* just use the supplied values */
   1762                         top = FALSE;
   1763                         src->lh[src->resultLen].previousCE = 0;
   1764                         src->lh[src->resultLen].previousContCE = 0;
   1765                         src->lh[src->resultLen].indirect = TRUE;
   1766                         src->lh[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
   1767                         src->lh[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
   1768                         src->lh[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
   1769                         src->lh[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
   1770 
   1771                         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
   1772 
   1773                     }
   1774                 } else { /* reset to something already in rules */
   1775                     top = FALSE;
   1776                 }
   1777             }
                    /* Report a failure raised while handling this token right away, instead */
                    /* of carrying on (or leaving the loop with a non-zero count) after it. */
                    if(U_FAILURE(*status)) {
                        return 0;
                    }
   1778             /*  7 After all this, set LAST to point to sourceToken, and goto step 3. */
   1779             lastToken = sourceToken;
   1780         } else {
   1781             if(U_FAILURE(*status)) {
   1782                 return 0;
   1783             }
   1784         }
   1785     }
   1786 
            /* drop a trailing list header that never received a token */
   1787     if(src->resultLen > 0 && src->lh[src->resultLen-1].first == NULL) {
   1788         src->resultLen--;
   1789     }
   1790     return src->resultLen;
   1791 }
   1792 
   1793 void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint32_t rulesLength, const UCollator *UCA, UErrorCode *status) {
   1794     U_NAMESPACE_USE
   1795 
   1796     uint32_t nSize = 0;
   1797     uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
   1798     if(U_FAILURE(*status)) {
   1799         return;
   1800     }
   1801 
   1802     // set everything to zero, so that we can clean up gracefully
   1803     uprv_memset(src, 0, sizeof(UColTokenParser));
   1804 
   1805     // first we need to find options that don't like to be normalized,
   1806     // like copy and remove...
   1807     //const UChar *openBrace = rules;
   1808     int32_t optionNumber = -1;
   1809     const UChar *setStart = NULL;
   1810     uint32_t i = 0;
   1811     while(i < rulesLength) {
   1812         if(rules[i] == 0x005B) {
   1813             // while((openBrace = u_strchr(openBrace, 0x005B)) != NULL) { // find open braces
   1814             //optionNumber = ucol_uprv_tok_readOption(openBrace+1, rules+rulesLength, &setStart);
   1815             optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
   1816             if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
   1817                 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
   1818                 if(U_SUCCESS(*status)) {
   1819                     if(src->copySet == NULL) {
   1820                         src->copySet = newSet;
   1821                     } else {
   1822                         uset_addAll(src->copySet, newSet);
   1823                         uset_close(newSet);
   1824                     }
   1825                 } else {
   1826                     return;
   1827                 }
   1828             } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
   1829                 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
   1830                 if(U_SUCCESS(*status)) {
   1831                     if(src->removeSet == NULL) {
   1832                         src->removeSet = newSet;
   1833                     } else {
   1834                         uset_addAll(src->removeSet, newSet);
   1835                         uset_close(newSet);
   1836                     }
   1837                 } else {
   1838                     return;
   1839                 }
   1840             }
   1841         }
   1842         //openBrace++;
   1843         i++;
   1844     }
   1845 
   1846     src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
   1847     /* test for NULL */
   1848     if (src->source == NULL) {
   1849         *status = U_MEMORY_ALLOCATION_ERROR;
   1850         return;
   1851     }
   1852     uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
   1853     nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
   1854     if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
   1855         *status = U_ZERO_ERROR;
   1856         src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
   1857         /* test for NULL */
   1858         if (src->source == NULL) {
   1859             *status = U_MEMORY_ALLOCATION_ERROR;
   1860             return;
   1861         }
   1862         nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
   1863     }
   1864     src->current = src->source;
   1865     src->end = src->source+nSize;
   1866     src->sourceCurrent = src->source;
   1867     src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
   1868     src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
   1869     src->varTop = NULL;
   1870     src->UCA = UCA;
   1871     src->invUCA = ucol_initInverseUCA(status);
   1872     src->parsedToken.charsLen = 0;
   1873     src->parsedToken.charsOffset = 0;
   1874     src->parsedToken.extensionLen = 0;
   1875     src->parsedToken.extensionOffset = 0;
   1876     src->parsedToken.prefixLen = 0;
   1877     src->parsedToken.prefixOffset = 0;
   1878     src->parsedToken.flags = 0;
   1879     src->parsedToken.strength = UCOL_TOK_UNSET;
   1880     src->buildCCTabFlag = FALSE;
   1881     src->prevStrength = UCOL_TOK_UNSET;
   1882 
   1883     if(U_FAILURE(*status)) {
   1884         return;
   1885     }
   1886     src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
   1887     if(U_FAILURE(*status)) {
   1888         return;
   1889     }
   1890     uhash_setValueDeleter(src->tailored, uhash_freeBlock);
   1891 
   1892     src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
   1893     /* test for NULL */
   1894     if (src->opts == NULL) {
   1895         *status = U_MEMORY_ALLOCATION_ERROR;
   1896         return;
   1897     }
   1898 
   1899     uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
   1900 
   1901     // rulesToParse = src->source;
   1902     src->lh = 0;
   1903     src->listCapacity = 1024;
   1904     src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
   1905     //Test for NULL
   1906     if (src->lh == NULL) {
   1907         *status = U_MEMORY_ALLOCATION_ERROR;
   1908         return;
   1909     }
   1910     uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
   1911     src->resultLen = 0;
   1912 
   1913     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
   1914 
   1915     // UCOL_RESET_TOP_VALUE
   1916     setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
   1917     // UCOL_FIRST_PRIMARY_IGNORABLE
   1918     setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
   1919     // UCOL_LAST_PRIMARY_IGNORABLE
   1920     setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
   1921     // UCOL_FIRST_SECONDARY_IGNORABLE
   1922     setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
   1923     // UCOL_LAST_SECONDARY_IGNORABLE
   1924     setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
   1925     // UCOL_FIRST_TERTIARY_IGNORABLE
   1926     setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
   1927     // UCOL_LAST_TERTIARY_IGNORABLE
   1928     setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
   1929     // UCOL_FIRST_VARIABLE
   1930     setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
   1931     // UCOL_LAST_VARIABLE
   1932     setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
   1933     // UCOL_FIRST_NON_VARIABLE
   1934     setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
   1935     // UCOL_LAST_NON_VARIABLE
   1936     setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
   1937     // UCOL_FIRST_IMPLICIT
   1938     setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
   1939     // UCOL_LAST_IMPLICIT
   1940     setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
   1941     // UCOL_FIRST_TRAILING
   1942     setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
   1943     // UCOL_LAST_TRAILING
   1944     setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
   1945     ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
   1946 }
   1947 
   1948 
   1949 void ucol_tok_closeTokenList(UColTokenParser *src) {
   1950     if(src->copySet != NULL) {
   1951         uset_close(src->copySet);
   1952     }
   1953     if(src->removeSet != NULL) {
   1954         uset_close(src->removeSet);
   1955     }
   1956     if(src->tailored != NULL) {
   1957         uhash_close(src->tailored);
   1958     }
   1959     if(src->lh != NULL) {
   1960         uprv_free(src->lh);
   1961     }
   1962     if(src->source != NULL) {
   1963         uprv_free(src->source);
   1964     }
   1965     if(src->opts != NULL) {
   1966         uprv_free(src->opts);
   1967     }
   1968 }
   1969 
   1970 #endif /* #if !UCONFIG_NO_COLLATION */
   1971