Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2001-2012, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  ucol_tok.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created 02/22/2001
     14 *   created by: Vladimir Weinstein
     15 *
     16 * This module reads a tailoring rule string and produces a list of
     17 * tokens that will be turned into collation elements
     18 *
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 
     23 #if !UCONFIG_NO_COLLATION
     24 
     25 #include "unicode/uscript.h"
     26 #include "unicode/ustring.h"
     27 #include "unicode/uchar.h"
     28 #include "unicode/uniset.h"
     29 
     30 #include "cmemory.h"
     31 #include "cstring.h"
     32 #include "patternprops.h"
     33 #include "ucol_bld.h"
     34 #include "ucol_tok.h"
     35 #include "ulocimp.h"
     36 #include "uresimp.h"
     37 
     38 // Define this only for debugging.
     39 // #define DEBUG_FOR_COLL_RULES 1
     40 
     41 #ifdef DEBUG_FOR_COLL_RULES
     42 #include <iostream>
     43 #endif
     44 
     45 U_NAMESPACE_USE
     46 
     47 U_CDECL_BEGIN
     48 static int32_t U_CALLCONV
     49 uhash_hashTokens(const UHashTok k)
     50 {
     51     int32_t hash = 0;
     52     //uint32_t key = (uint32_t)k.integer;
     53     UColToken *key = (UColToken *)k.pointer;
     54     if (key != 0) {
     55         int32_t len = (key->source & 0xFF000000)>>24;
     56         int32_t inc = ((len - 32) / 32) + 1;
     57 
     58         const UChar *p = (key->source & 0x00FFFFFF) + *(key->rulesToParseHdl);
     59         const UChar *limit = p + len;
     60 
     61         while (p<limit) {
     62             hash = (hash * 37) + *p;
     63             p += inc;
     64         }
     65     }
     66     return hash;
     67 }
     68 
     69 static UBool U_CALLCONV
     70 uhash_compareTokens(const UHashTok key1, const UHashTok key2)
     71 {
     72     //uint32_t p1 = (uint32_t) key1.integer;
     73     //uint32_t p2 = (uint32_t) key2.integer;
     74     UColToken *p1 = (UColToken *)key1.pointer;
     75     UColToken *p2 = (UColToken *)key2.pointer;
     76     const UChar *s1 = (p1->source & 0x00FFFFFF) + *(p1->rulesToParseHdl);
     77     const UChar *s2 = (p2->source & 0x00FFFFFF) + *(p2->rulesToParseHdl);
     78     uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
     79     uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
     80     const UChar *end = s1+s1L-1;
     81 
     82     if (p1 == p2) {
     83         return TRUE;
     84     }
     85     if (p1->source == 0 || p2->source == 0) {
     86         return FALSE;
     87     }
     88     if(s1L != s2L) {
     89         return FALSE;
     90     }
     91     if(p1->source == p2->source) {
     92         return TRUE;
     93     }
     94     while((s1 < end) && *s1 == *s2) {
     95         ++s1;
     96         ++s2;
     97     }
     98     if(*s1 == *s2) {
     99         return TRUE;
    100     } else {
    101         return FALSE;
    102     }
    103 }
    104 U_CDECL_END
    105 
    106 /*
    107  * Debug messages used to pinpoint where a format error occurred.
    108  * A better way is to include context-sensitive information in syntaxError() function.
    109  *
 * To turn this debugging on, either uncomment the following line or pass
 * -DDEBUG_FOR_FORMAT_ERROR on the compile line.
    112  */
    113 /* #define DEBUG_FOR_FORMAT_ERROR 1 */
    114 
    115 #ifdef DEBUG_FOR_FORMAT_ERROR
    116 #define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);}
    117 #else
    118 #define DBG_FORMAT_ERROR
    119 #endif
    120 
    121 
    122 /*
    123  * Controls debug messages so that the output can be compared before and after a
    124  * big change.  Prints the information of every code point that comes out of the
    125  * collation parser and its strength into a file.  When a big change in format
    126  * happens, the files before and after the change should be identical.
    127  *
 * To turn this debugging on, either uncomment the following line or pass
 * -DDEBUG_FOR_CODE_POINTS on the compile line.
    130  */
    131 // #define DEBUG_FOR_CODE_POINTS 1
    132 
    133 #ifdef DEBUG_FOR_CODE_POINTS
    134     FILE* dfcp_fp = NULL;
    135 #endif
    136 
    137 
/*
 * One entry of the indirect-positioning table (ucolIndirectBoundaries).
 * Entries are filled in by setIndirectBoundaries(): the start pair is copied
 * from start[0]/start[1], and the limit pair from end[0]/end[1], or zeroed
 * when no end is supplied.
 */
typedef struct {
    uint32_t startCE;      /* first word of the start value (start[0]) */
    uint32_t startContCE;  /* second word of the start value (start[1]); presumably a continuation CE */
    uint32_t limitCE;      /* first word of the limit value (end[0]), 0 if there is no limit */
    uint32_t limitContCE;  /* second word of the limit value (end[1]), 0 if there is no limit */
} indirectBoundaries;
    144 
    145 /* these values are used for finding CE values for indirect positioning. */
    146 /* Indirect positioning is a mechanism for allowing resets on symbolic   */
    147 /* values. It only works for resets and you cannot tailor indirect names */
    148 /* An indirect name can define either an anchor point or a range. An     */
    149 /* anchor point behaves in exactly the same way as a code point in reset */
/* would, except that it cannot be tailored. A range (currently only the */
/* [top] range is known) will explicitly set the upper bound for         */
    152 /* generated CEs, thus allowing for better control over how many CEs can */
    153 /* be squeezed between in the range without performance penalty.         */
    154 /* In that respect, we use [top] for tailoring of locales that use CJK   */
    155 /* characters. Other indirect values are currently a pure convenience,   */
    156 /* they can be used to assure that the CEs will be always positioned in  */
    157 /* the same place relative to a point with known properties (e.g. first  */
    158 /* primary ignorable). */
    159 static indirectBoundaries ucolIndirectBoundaries[15];
    160 /*
    161 static indirectBoundaries ucolIndirectBoundaries[11] = {
    162 { UCOL_RESET_TOP_VALUE,               0,
    163 UCOL_NEXT_TOP_VALUE,                0 },
    164 { UCOL_FIRST_PRIMARY_IGNORABLE,       0,
    165 0,                                  0 },
    166 { UCOL_LAST_PRIMARY_IGNORABLE,        UCOL_LAST_PRIMARY_IGNORABLE_CONT,
    167 0,                                  0 },
    168 { UCOL_FIRST_SECONDARY_IGNORABLE,     0,
    169 0,                                  0 },
    170 { UCOL_LAST_SECONDARY_IGNORABLE,      0,
    171 0,                                  0 },
    172 { UCOL_FIRST_TERTIARY_IGNORABLE,      0,
    173 0,                                  0 },
    174 { UCOL_LAST_TERTIARY_IGNORABLE,       0,
    175 0,                                  0 },
    176 { UCOL_FIRST_VARIABLE,                0,
    177 0,                                  0 },
    178 { UCOL_LAST_VARIABLE,                 0,
    179 0,                                  0 },
    180 { UCOL_FIRST_NON_VARIABLE,            0,
    181 0,                                  0 },
    182 { UCOL_LAST_NON_VARIABLE,             0,
    183 0,                                  0 },
    184 };
    185 */
    186 
    187 static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
    188 
    189     // Set values for the top - TODO: once we have values for all the indirects, we are going
    190     // to initalize here.
    191     ucolIndirectBoundaries[indexR].startCE = start[0];
    192     ucolIndirectBoundaries[indexR].startContCE = start[1];
    193     if(end) {
    194         ucolIndirectBoundaries[indexR].limitCE = end[0];
    195         ucolIndirectBoundaries[indexR].limitContCE = end[1];
    196     } else {
    197         ucolIndirectBoundaries[indexR].limitCE = 0;
    198         ucolIndirectBoundaries[indexR].limitContCE = 0;
    199     }
    200 }
    201 
    202 
    203 static inline
    204 void syntaxError(const UChar* rules,
    205                  int32_t pos,
    206                  int32_t rulesLen,
    207                  UParseError* parseError)
    208 {
    209     parseError->offset = pos;
    210     parseError->line = 0 ; /* we are not using line numbers */
    211 
    212     // for pre-context
    213     int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
    214     int32_t stop  = pos;
    215 
    216     u_memcpy(parseError->preContext,rules+start,stop-start);
    217     //null terminate the buffer
    218     parseError->preContext[stop-start] = 0;
    219 
    220     //for post-context
    221     start = pos+1;
    222     stop  = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
    223     rulesLen;
    224 
    225     if(start < stop) {
    226         u_memcpy(parseError->postContext,rules+start,stop-start);
    227         //null terminate the buffer
    228         parseError->postContext[stop-start]= 0;
    229     } else {
    230         parseError->postContext[0] = 0;
    231     }
    232 }
    233 
    234 static
    235 void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
    236     switch(attrib) {
    237     case UCOL_HIRAGANA_QUATERNARY_MODE:
    238         opts->hiraganaQ = value;
    239         break;
    240     case UCOL_FRENCH_COLLATION:
    241         opts->frenchCollation = value;
    242         break;
    243     case UCOL_ALTERNATE_HANDLING:
    244         opts->alternateHandling = value;
    245         break;
    246     case UCOL_CASE_FIRST:
    247         opts->caseFirst = value;
    248         break;
    249     case UCOL_CASE_LEVEL:
    250         opts->caseLevel = value;
    251         break;
    252     case UCOL_NORMALIZATION_MODE:
    253         opts->normalizationMode = value;
    254         break;
    255     case UCOL_STRENGTH:
    256         opts->strength = value;
    257         break;
    258     case UCOL_NUMERIC_COLLATION:
    259         opts->numericCollation = value;
    260         break;
    261     case UCOL_ATTRIBUTE_COUNT:
    262     default:
    263         break;
    264     }
    265 }
    266 
    267 #define UTOK_OPTION_COUNT 22
    268 
    269 static UBool didInit = FALSE;
    270 /* we can be strict, or we can be lenient */
    271 /* I'd surely be lenient with the option arguments */
    272 /* maybe even with options */
    273 U_STRING_DECL(suboption_00, "non-ignorable", 13);
    274 U_STRING_DECL(suboption_01, "shifted",        7);
    275 
    276 U_STRING_DECL(suboption_02, "lower",          5);
    277 U_STRING_DECL(suboption_03, "upper",          5);
    278 U_STRING_DECL(suboption_04, "off",            3);
    279 U_STRING_DECL(suboption_05, "on",             2);
    280 U_STRING_DECL(suboption_06, "1",              1);
    281 U_STRING_DECL(suboption_07, "2",              1);
    282 U_STRING_DECL(suboption_08, "3",              1);
    283 U_STRING_DECL(suboption_09, "4",              1);
    284 U_STRING_DECL(suboption_10, "I",              1);
    285 
    286 U_STRING_DECL(suboption_11, "primary",        7);
    287 U_STRING_DECL(suboption_12, "secondary",      9);
    288 U_STRING_DECL(suboption_13, "tertiary",       8);
    289 U_STRING_DECL(suboption_14, "variable",       8);
    290 U_STRING_DECL(suboption_15, "regular",        7);
    291 U_STRING_DECL(suboption_16, "implicit",       8);
    292 U_STRING_DECL(suboption_17, "trailing",       8);
    293 
    294 
    295 U_STRING_DECL(option_00,    "undefined",      9);
    296 U_STRING_DECL(option_01,    "rearrange",      9);
    297 U_STRING_DECL(option_02,    "alternate",      9);
    298 U_STRING_DECL(option_03,    "backwards",      9);
    299 U_STRING_DECL(option_04,    "variable top",  12);
    300 U_STRING_DECL(option_05,    "top",            3);
    301 U_STRING_DECL(option_06,    "normalization", 13);
    302 U_STRING_DECL(option_07,    "caseLevel",      9);
    303 U_STRING_DECL(option_08,    "caseFirst",      9);
    304 U_STRING_DECL(option_09,    "scriptOrder",   11);
    305 U_STRING_DECL(option_10,    "charsetname",   11);
    306 U_STRING_DECL(option_11,    "charset",        7);
    307 U_STRING_DECL(option_12,    "before",         6);
    308 U_STRING_DECL(option_13,    "hiraganaQ",      9);
    309 U_STRING_DECL(option_14,    "strength",       8);
    310 U_STRING_DECL(option_15,    "first",          5);
    311 U_STRING_DECL(option_16,    "last",           4);
    312 U_STRING_DECL(option_17,    "optimize",       8);
    313 U_STRING_DECL(option_18,    "suppressContractions",         20);
    314 U_STRING_DECL(option_19,    "numericOrdering",              15);
    315 U_STRING_DECL(option_20,    "import",         6);
    316 U_STRING_DECL(option_21,    "reorder",         7);
    317 
    318 /*
    319 [last variable] last variable value
    320 [last primary ignorable] largest CE for primary ignorable
    321 [last secondary ignorable] largest CE for secondary ignorable
    322 [last tertiary ignorable] largest CE for tertiary ignorable
    323 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
    324 */
    325 
    326 
/*
 * Sub-option tables referenced from rulesOptions.  Each entry is
 * { name, length of name, attribute value }.
 */

/* Arguments of "alternate": "non-ignorable" and "shifted". */
static const ucolTokSuboption alternateSub[2] = {
    {suboption_00, 13, UCOL_NON_IGNORABLE},
    {suboption_01,  7, UCOL_SHIFTED}
};

/* Arguments of "caseFirst": "lower", "upper" and "off". */
static const ucolTokSuboption caseFirstSub[3] = {
    {suboption_02, 5, UCOL_LOWER_FIRST},
    {suboption_03,  5, UCOL_UPPER_FIRST},
    {suboption_04,  3, UCOL_OFF},
};

/* Generic "off" / "on" arguments shared by several options. */
static const ucolTokSuboption onOffSub[2] = {
    {suboption_04, 3, UCOL_OFF},
    {suboption_05, 2, UCOL_ON}
};

/* Argument of "backwards": only "2" is accepted and it maps to UCOL_ON. */
static const ucolTokSuboption frenchSub[1] = {
    {suboption_07, 1, UCOL_ON}
};

/* Arguments of "before": "1", "2", "3". */
static const ucolTokSuboption beforeSub[3] = {
    {suboption_06, 1, UCOL_PRIMARY},
    {suboption_07, 1, UCOL_SECONDARY},
    {suboption_08, 1, UCOL_TERTIARY}
};

/* Arguments of "strength": "1", "2", "3", "4" and "I". */
static const ucolTokSuboption strengthSub[5] = {
    {suboption_06, 1, UCOL_PRIMARY},
    {suboption_07, 1, UCOL_SECONDARY},
    {suboption_08, 1, UCOL_TERTIARY},
    {suboption_09, 1, UCOL_QUATERNARY},
    {suboption_10, 1, UCOL_IDENTICAL},
};

/*
 * Arguments of "first" / "last": "primary", "secondary", "tertiary",
 * "variable", "regular", "implicit", "trailing".
 * Every entry carries UCOL_PRIMARY; ucol_uprv_tok_readAndSetOption uses the
 * position of the matching entry (not its value) to compute the indirect index.
 */
static const ucolTokSuboption firstLastSub[7] = {
    {suboption_11, 7, UCOL_PRIMARY},
    {suboption_12, 9, UCOL_PRIMARY},
    {suboption_13, 8, UCOL_PRIMARY},
    {suboption_14, 8, UCOL_PRIMARY},
    {suboption_15, 7, UCOL_PRIMARY},
    {suboption_16, 8, UCOL_PRIMARY},
    {suboption_17, 8, UCOL_PRIMARY},
};
    370 
/*
 * Symbolic names for the positions in rulesOptions.  The enumerators must stay
 * in the same order as the rows of that table: ucol_uprv_tok_readOption()
 * returns a row index and ucol_uprv_tok_readAndSetOption() switches on it
 * using these names.
 */
enum OptionNumber {
    OPTION_ALTERNATE_HANDLING = 0,
    OPTION_FRENCH_COLLATION,
    OPTION_CASE_LEVEL,
    OPTION_CASE_FIRST,
    OPTION_NORMALIZATION_MODE,
    OPTION_HIRAGANA_QUATERNARY,
    OPTION_STRENGTH,
    OPTION_NUMERIC_COLLATION,
    /* alias for the last option that maps directly onto a UColAttribute */
    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,
    OPTION_VARIABLE_TOP,
    OPTION_REARRANGE,
    OPTION_BEFORE,
    OPTION_TOP,
    OPTION_FIRST,
    OPTION_LAST,
    OPTION_OPTIMIZE,
    OPTION_SUPPRESS_CONTRACTIONS,
    OPTION_UNDEFINED,
    OPTION_SCRIPT_ORDER,
    OPTION_CHARSET_NAME,
    OPTION_CHARSET,
    OPTION_IMPORT,
    OPTION_SCRIPTREORDER
} ;
    396 
/*
 * Table of all bracketed rule options.  Each row is
 * { name, length of name, sub-option table, number of sub-options, attribute }.
 * Rows that do not map onto a collator attribute use UCOL_ATTRIBUTE_COUNT.
 *
 * The option readers scan this table from the top and stop at the first row
 * whose name is a case-insensitive prefix of the input, so a longer name must
 * come before any shorter name that is a prefix of it
 * ("charsetname" before "charset").
 */
static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
    /*00*/ {option_02,  9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
    /*01*/ {option_03,  9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards"      */
    /*02*/ {option_07,  9, onOffSub, 2, UCOL_CASE_LEVEL},  /*"caseLevel"      */
    /*03*/ {option_08,  9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst"   */
    /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
    /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
    /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
    /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION},  /*"numericOrdering"*/
    /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top"   */
    /*09*/ {option_01,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange"      */
    /*10*/ {option_12,  6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before"    */
    /*11*/ {option_05,  3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top"            */
    /*12*/ {option_15,  5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
    /*13*/ {option_16,  4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
    /*14*/ {option_17,  8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize"      */
    /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions"      */
    /*16*/ {option_00,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined"      */
    /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder"    */
    /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname"    */
    /*19*/ {option_11,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT},  /*"charset"        */
    /*20*/ {option_20,  6, NULL, 0, UCOL_ATTRIBUTE_COUNT},  /*"import"        */
    /*21*/ {option_21,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT}  /*"reorder"        */
};
    421 
    422 static
    423 int32_t u_strncmpNoCase(const UChar     *s1,
    424                         const UChar     *s2,
    425                         int32_t     n)
    426 {
    427     if(n > 0) {
    428         int32_t rc;
    429         for(;;) {
    430             rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
    431             if(rc != 0 || *s1 == 0 || --n == 0) {
    432                 return rc;
    433             }
    434             ++s1;
    435             ++s2;
    436         }
    437     }
    438     return 0;
    439 }
    440 
    441 static
    442 void ucol_uprv_tok_initData() {
    443     if(!didInit) {
    444         U_STRING_INIT(suboption_00, "non-ignorable", 13);
    445         U_STRING_INIT(suboption_01, "shifted",        7);
    446 
    447         U_STRING_INIT(suboption_02, "lower",          5);
    448         U_STRING_INIT(suboption_03, "upper",          5);
    449         U_STRING_INIT(suboption_04, "off",            3);
    450         U_STRING_INIT(suboption_05, "on",             2);
    451 
    452         U_STRING_INIT(suboption_06, "1",              1);
    453         U_STRING_INIT(suboption_07, "2",              1);
    454         U_STRING_INIT(suboption_08, "3",              1);
    455         U_STRING_INIT(suboption_09, "4",              1);
    456         U_STRING_INIT(suboption_10, "I",              1);
    457 
    458         U_STRING_INIT(suboption_11, "primary",        7);
    459         U_STRING_INIT(suboption_12, "secondary",      9);
    460         U_STRING_INIT(suboption_13, "tertiary",       8);
    461         U_STRING_INIT(suboption_14, "variable",       8);
    462         U_STRING_INIT(suboption_15, "regular",        7);
    463         U_STRING_INIT(suboption_16, "implicit",       8);
    464         U_STRING_INIT(suboption_17, "trailing",       8);
    465 
    466 
    467         U_STRING_INIT(option_00, "undefined",      9);
    468         U_STRING_INIT(option_01, "rearrange",      9);
    469         U_STRING_INIT(option_02, "alternate",      9);
    470         U_STRING_INIT(option_03, "backwards",      9);
    471         U_STRING_INIT(option_04, "variable top",  12);
    472         U_STRING_INIT(option_05, "top",            3);
    473         U_STRING_INIT(option_06, "normalization", 13);
    474         U_STRING_INIT(option_07, "caseLevel",      9);
    475         U_STRING_INIT(option_08, "caseFirst",      9);
    476         U_STRING_INIT(option_09, "scriptOrder",   11);
    477         U_STRING_INIT(option_10, "charsetname",   11);
    478         U_STRING_INIT(option_11, "charset",        7);
    479         U_STRING_INIT(option_12, "before",         6);
    480         U_STRING_INIT(option_13, "hiraganaQ",      9);
    481         U_STRING_INIT(option_14, "strength",       8);
    482         U_STRING_INIT(option_15, "first",          5);
    483         U_STRING_INIT(option_16, "last",           4);
    484         U_STRING_INIT(option_17, "optimize",       8);
    485         U_STRING_INIT(option_18, "suppressContractions",         20);
    486         U_STRING_INIT(option_19, "numericOrdering",      15);
    487         U_STRING_INIT(option_20, "import ",        6);
    488         U_STRING_INIT(option_21, "reorder",        7);
    489         didInit = TRUE;
    490     }
    491 }
    492 
    493 
    494 // This function reads basic options to set in the runtime collator
    495 // used by data driven tests. Should not support build time options
    496 U_CAPI const UChar * U_EXPORT2
    497 ucol_tok_getNextArgument(const UChar *start, const UChar *end,
    498                          UColAttribute *attrib, UColAttributeValue *value,
    499                          UErrorCode *status)
    500 {
    501     uint32_t i = 0;
    502     int32_t j=0;
    503     UBool foundOption = FALSE;
    504     const UChar *optionArg = NULL;
    505 
    506     ucol_uprv_tok_initData();
    507 
    508     while(start < end && PatternProps::isWhiteSpace(*start)) { /* eat whitespace */
    509         start++;
    510     }
    511     if(start >= end) {
    512         return NULL;
    513     }
    514     /* skip opening '[' */
    515     if(*start == 0x005b) {
    516         start++;
    517     } else {
    518         *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
    519         return NULL;
    520     }
    521 
    522     while(i < UTOK_OPTION_COUNT) {
    523         if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
    524             foundOption = TRUE;
    525             if(end - start > rulesOptions[i].optionLen) {
    526                 optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
    527                 while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */
    528                     optionArg++;
    529                 }
    530             }
    531             break;
    532         }
    533         i++;
    534     }
    535 
    536     if(!foundOption) {
    537         *status = U_ILLEGAL_ARGUMENT_ERROR;
    538         return NULL;
    539     }
    540 
    541     if(optionArg) {
    542         for(j = 0; j<rulesOptions[i].subSize; j++) {
    543             if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
    544                 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
    545                 *attrib = rulesOptions[i].attr;
    546                 *value = rulesOptions[i].subopts[j].attrVal;
    547                 optionArg += rulesOptions[i].subopts[j].subLen;
    548                 while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */
    549                     optionArg++;
    550                 }
    551                 if(*optionArg == 0x005d) {
    552                     optionArg++;
    553                     return optionArg;
    554                 } else {
    555                     *status = U_ILLEGAL_ARGUMENT_ERROR;
    556                     return NULL;
    557                 }
    558             }
    559         }
    560     }
    561     *status = U_ILLEGAL_ARGUMENT_ERROR;
    562     return NULL;
    563 }
    564 
    565 static
    566 USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
    567     while(*start != 0x005b) { /* advance while we find the first '[' */
    568         start++;
    569     }
    570     // now we need to get a balanced set of '[]'. The problem is that a set can have
    571     // many, and *end point to the first closing '['
    572     int32_t noOpenBraces = 1;
    573     int32_t current = 1; // skip the opening brace
    574     while(start+current < end && noOpenBraces != 0) {
    575         if(start[current] == 0x005b) {
    576             noOpenBraces++;
    577         } else if(start[current] == 0x005D) { // closing brace
    578             noOpenBraces--;
    579         }
    580         current++;
    581     }
    582 
    583     if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
    584         *status = U_ILLEGAL_ARGUMENT_ERROR;
    585         return NULL;
    586     }
    587     return uset_openPattern(start, current, status);
    588 }
    589 
    590 /**
    591  * Reads an option and matches the option name with the predefined options. (Case-insensitive.)
    592  * @param start Pointer to the start UChar.
    593  * @param end Pointer to the last valid pointer beyond which the option will not extend.
    594  * @param optionArg Address of the pointer at which the options start (after the option name)
    595  * @return The index of the option, or -1 if the option is not valid.
    596  */
    597 static
    598 int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
    599     int32_t i = 0;
    600     ucol_uprv_tok_initData();
    601 
    602     while(PatternProps::isWhiteSpace(*start)) { /* eat whitespace */
    603         start++;
    604     }
    605     while(i < UTOK_OPTION_COUNT) {
    606         if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
    607             if(end - start > rulesOptions[i].optionLen) {
    608                 *optionArg = start+rulesOptions[i].optionLen; /* End of option name; start of the options */
    609                 while(PatternProps::isWhiteSpace(**optionArg)) { /* eat whitespace */
    610                     (*optionArg)++;
    611                 }
    612             }
    613             break;
    614         }
    615         i++;
    616     }
    617     if(i == UTOK_OPTION_COUNT) {
    618         i = -1; // didn't find an option
    619     }
    620     return i;
    621 }
    622 
    623 
    624 static
    625 void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status) {
    626     int32_t codeCount = 0;
    627     int32_t codeIndex = 0;
    628     char conversion[64];
    629     int32_t tokenLength = 0;
    630     const UChar* space;
    631 
    632     const UChar* current = src->current;
    633     const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current);
    634 
    635     // eat leading whitespace
    636     while(current < end && u_isWhitespace(*current)) {
    637         current++;
    638     }
    639 
    640     while(current < end) {
    641         space = u_memchr(current, 0x0020, end - current);
    642         space = space == 0 ? end : space;
    643         tokenLength = space - current;
    644         if (tokenLength < 4) {
    645             *status = U_INVALID_FORMAT_ERROR;
    646             return;
    647         }
    648         codeCount++;
    649         current += tokenLength;
    650         while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
    651             ++current;
    652         }
    653     }
    654 
    655     if (codeCount == 0) {
    656         *status = U_INVALID_FORMAT_ERROR;
    657     }
    658 
    659     src->reorderCodesLength = codeCount;
    660     src->reorderCodes = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t));
    661     current = src->current;
    662 
    663     // eat leading whitespace
    664     while(current < end && u_isWhitespace(*current)) {
    665         current++;
    666     }
    667 
    668     while(current < end) {
    669         space = u_memchr(current, 0x0020, end - current);
    670         space = space == 0 ? end : space;
    671         tokenLength = space - current;
    672         if (tokenLength < 4) {
    673             *status = U_ILLEGAL_ARGUMENT_ERROR;
    674             return;
    675         } else {
    676             u_UCharsToChars(current, conversion, tokenLength);
    677             conversion[tokenLength] = '\0';
    678             src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion);
    679             if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
    680                 src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion);
    681             }
    682             if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
    683                 *status = U_ILLEGAL_ARGUMENT_ERROR;
    684             }
    685         }
    686         codeIndex++;
    687         current += tokenLength;
    688         while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
    689             ++current;
    690         }
    691     }
    692 }
    693 
    694 // reads and conforms to various options in rules
    695 // end is the position of the first closing ']'
    696 // However, some of the options take an UnicodeSet definition
    697 // which needs to duplicate the closing ']'
    698 // for example: '[copy [\uAC00-\uD7FF]]'
    699 // These options will move end to the second ']' and the
    700 // caller will set the current to it.
    701 static
    702 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
    703     const UChar* start = src->current;
    704     int32_t i = 0;
    705     int32_t j=0;
    706     const UChar *optionArg = NULL;
    707 
    708     uint8_t result = 0;
    709 
    710     start++; /*skip opening '['*/
    711     i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
    712     if(optionArg) {
    713         src->current = optionArg;
    714     }
    715 
    716     if(i < 0) {
    717         *status = U_ILLEGAL_ARGUMENT_ERROR;
    718     } else {
    719         int32_t noOpenBraces = 1;
    720         switch(i) {
    721     case OPTION_ALTERNATE_HANDLING:
    722     case OPTION_FRENCH_COLLATION:
    723     case OPTION_CASE_LEVEL:
    724     case OPTION_CASE_FIRST:
    725     case OPTION_NORMALIZATION_MODE:
    726     case OPTION_HIRAGANA_QUATERNARY:
    727     case OPTION_STRENGTH:
    728     case OPTION_NUMERIC_COLLATION:
    729         if(optionArg) {
    730             for(j = 0; j<rulesOptions[i].subSize; j++) {
    731                 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
    732                     ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
    733                     result =  UCOL_TOK_SUCCESS;
    734                 }
    735             }
    736         }
    737         if(result == 0) {
    738             *status = U_ILLEGAL_ARGUMENT_ERROR;
    739         }
    740         break;
    741     case OPTION_VARIABLE_TOP:
    742         result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
    743         break;
    744     case OPTION_REARRANGE:
    745         result = UCOL_TOK_SUCCESS;
    746         break;
    747     case OPTION_BEFORE:
    748         if(optionArg) {
    749             for(j = 0; j<rulesOptions[i].subSize; j++) {
    750                 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
    751                     result = UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attrVal + 1);
    752                 }
    753             }
    754         }
    755         if(result == 0) {
    756             *status = U_ILLEGAL_ARGUMENT_ERROR;
    757         }
    758         break;
    759     case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
    760         /* index to this array will be src->parsedToken.indirectIndex*/
    761         src->parsedToken.indirectIndex = 0;
    762         result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
    763         break;
    764     case OPTION_FIRST:
    765     case OPTION_LAST: /* first, last */
    766         for(j = 0; j<rulesOptions[i].subSize; j++) {
    767             if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
    768                 // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
    769                 // element of indirect boundaries is reserved for top.
    770                 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
    771                 result =  UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
    772             }
    773         }
    774         if(result == 0) {
    775             *status = U_ILLEGAL_ARGUMENT_ERROR;
    776         }
    777         break;
    778     case OPTION_OPTIMIZE:
    779     case OPTION_SUPPRESS_CONTRACTIONS:  // copy and remove are handled before normalization
    780         // we need to move end here
    781         src->current++; // skip opening brace
    782         while(src->current < src->end && noOpenBraces != 0) {
    783             if(*src->current == 0x005b) {
    784                 noOpenBraces++;
    785             } else if(*src->current == 0x005D) { // closing brace
    786                 noOpenBraces--;
    787             }
    788             src->current++;
    789         }
    790         result = UCOL_TOK_SUCCESS;
    791         break;
    792     case OPTION_SCRIPTREORDER:
    793         ucol_tok_parseScriptReorder(src, status);
    794         break;
    795     default:
    796         *status = U_UNSUPPORTED_ERROR;
    797         break;
    798         }
    799     }
    800     src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current));
    801     return result;
    802 }
    803 
    804 
    805 inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
    806     if (stuff == NULL || len <= 0) {
    807         return;
    808     }
    809     UnicodeString tempStuff(FALSE, stuff, len);
    810     if(src->extraCurrent+len >= src->extraEnd) {
    811         /* reallocate */
    812         if (stuff >= src->source && stuff <= src->end) {
    813             // Copy the "stuff" contents into tempStuff's own buffer.
    814             // UnicodeString is copy-on-write.
    815             if (len > 0) {
    816                 tempStuff.setCharAt(0, tempStuff[0]);
    817             } else {
    818                 tempStuff.remove();
    819             }
    820         }
    821         UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));
    822         if(newSrc != NULL) {
    823             src->current = newSrc + (src->current - src->source);
    824             src->extraCurrent = newSrc + (src->extraCurrent - src->source);
    825             src->end = newSrc + (src->end - src->source);
    826             src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
    827             src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
    828             src->source = newSrc;
    829         } else {
    830             *status = U_MEMORY_ALLOCATION_ERROR;
    831             return;
    832         }
    833     }
    834     if(len == 1) {
    835         *src->extraCurrent++ = tempStuff[0];
    836     } else {
    837         u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len);
    838         src->extraCurrent += len;
    839     }
    840 }
    841 
    842 inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
    843     /*
    844     top = TRUE;
    845     */
    846     UChar buff[5];
    847     src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
    848     buff[0] = 0xFFFE;
    849     buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
    850     buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
    851     if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
    852         src->parsedToken.charsLen = 3;
    853         ucol_tok_addToExtraCurrent(src, buff, 3, status);
    854     } else {
    855         buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
    856         buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
    857         src->parsedToken.charsLen = 5;
    858         ucol_tok_addToExtraCurrent(src, buff, 5, status);
    859     }
    860     return TRUE;
    861 }
    862 
    863 static UBool isCharNewLine(UChar c){
    864     switch(c){
    865     case 0x000A: /* LF  */
    866     case 0x000D: /* CR  */
    867     case 0x000C: /* FF  */
    868     case 0x0085: /* NEL */
    869     case 0x2028: /* LS  */
    870     case 0x2029: /* PS  */
    871         return TRUE;
    872     default:
    873         return FALSE;
    874     }
    875 }
    876 
    877 /*
    878  * This function is called several times when a range is processed.  Each time, the next code point
    879  * is processed.
    880  * The following variables must be set before calling this function:
    881  *   src->currentRangeCp:  The current code point to process.
    882  *   src->lastRangeCp: The last code point in the range.
    883  * Pre-requisite: src->currentRangeCp <= src->lastRangeCp.
    884  */
    885 static const UChar*
    886 ucol_tok_processNextCodePointInRange(UColTokenParser *src,
    887                                      UErrorCode *status)
    888 {
    889   // Append current code point to source
    890   UChar buff[U16_MAX_LENGTH];
    891   uint32_t i = 0;
    892 
    893   uint32_t nChars = U16_LENGTH(src->currentRangeCp);
    894   src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
    895   src->parsedToken.charsLen = nChars;
    896 
    897   U16_APPEND_UNSAFE(buff, i, src->currentRangeCp);
    898   ucol_tok_addToExtraCurrent(src, buff, nChars, status);
    899 
    900   ++src->currentRangeCp;
    901   if (src->currentRangeCp > src->lastRangeCp) {
    902     src->inRange = FALSE;
    903 
    904     if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
    905       src->isStarred = FALSE;
    906     }
    907   } else {
    908     src->previousCp = src->currentRangeCp;
    909   }
    910   return src->current;
    911 }
    912 
    913 /*
    914  * This function is called several times when a starred list is processed.  Each time, the next code point
    915  * in the list is processed.
    916  * The following variables must be set before calling this function:
    917  *   src->currentStarredCharIndex:  Index (in src->source) of the first char of the current code point.
    918  *   src->lastStarredCharIndex: Index to the last character in the list.
    919  * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex.
    920  */
    921 static const UChar*
    922 ucol_tok_processNextTokenInStarredList(UColTokenParser *src)
    923 {
    924   // Extract the characters corresponding to the next code point.
    925   UChar32 cp;
    926   src->parsedToken.charsOffset = src->currentStarredCharIndex;
    927   int32_t prev = src->currentStarredCharIndex;
    928   U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src->source), cp);
    929   src->parsedToken.charsLen = src->currentStarredCharIndex - prev;
    930 
    931   // When we are done parsing the starred string, turn the flag off so that
    932   // the normal processing is restored.
    933   if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
    934     src->isStarred = FALSE;
    935   }
    936   src->previousCp = cp;
    937   return src->current;
    938 }
    939 
    940 /*
    941  * Partially parses the next token, keeps the indices in src->parsedToken, and updates the counters.
    942  *
    943  * This routine parses and separates almost all tokens. The following are the syntax characters recognized.
    944  *  # : Comment character
    945  *  & : Reset operator
    946  *  = : Equality
    947  *  < : Primary collation
    948  *  << : Secondary collation
    949  *  <<< : Tertiary collation
    950  *  ; : Secondary collation
    951  *  , : Tertiary collation
    952  *  / : Expansions
    953  *  | : Prefix
    954  *  - : Range
    955 
    956  *  ! : Java Thai modifier, ignored
    957  *  @ : French only
    958 
    959  * [] : Options
    960  * '' : Quotes
    961  *
    962  *  Along with operators =, <, <<, <<<, the operator * is supported to indicate a list.  For example, &a<*bcdexyz
    963  *  is equivalent to &a<b<c<d<e<x<y<z.  In lists, ranges also can be given, so &a*b-ex-z is equivalent to the above.
    964  *  This function do not separate the tokens in a list.  Instead, &a<*b-ex-z is parsed as three tokens - "&a",
    965  *  "<*b", "-ex", "-z".  The strength (< in this case), whether in a list, whether in a range and the previous
    966  *  character returned as cached so that the calling program can do further splitting.
    967  */
    968 static const UChar*
    969 ucol_tok_parseNextTokenInternal(UColTokenParser *src,
    970                                 UBool startOfRules,
    971                                 UParseError *parseError,
    972                                 UErrorCode *status)
    973 {
    974     UBool variableTop = FALSE;
    975     UBool top = FALSE;
    976     UBool inChars = TRUE;
    977     UBool inQuote = FALSE;
    978     UBool wasInQuote = FALSE;
    979     uint8_t before = 0;
    980     UBool isEscaped = FALSE;
    981 
    982     // TODO: replace these variables with src->parsedToken counterparts
    983     // no need to use them anymore since we have src->parsedToken.
    984     // Ideally, token parser would be a nice class... Once, when I have
    985     // more time (around 2020 probably).
    986     uint32_t newExtensionLen = 0;
    987     uint32_t extensionOffset = 0;
    988     uint32_t newStrength = UCOL_TOK_UNSET;
    989     UChar buff[10];
    990 
    991     src->parsedToken.charsOffset = 0;  src->parsedToken.charsLen = 0;
    992     src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
    993     src->parsedToken.indirectIndex = 0;
    994 
    995     while (src->current < src->end) {
    996         UChar ch = *(src->current);
    997 
    998         if (inQuote) {
    999             if (ch == 0x0027/*'\''*/) {
   1000                 inQuote = FALSE;
   1001             } else {
   1002                 if ((src->parsedToken.charsLen == 0) || inChars) {
   1003                     if(src->parsedToken.charsLen == 0) {
   1004                         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
   1005                     }
   1006                     src->parsedToken.charsLen++;
   1007                 } else {
   1008                     if(newExtensionLen == 0) {
   1009                         extensionOffset = (uint32_t)(src->extraCurrent - src->source);
   1010                     }
   1011                     newExtensionLen++;
   1012                 }
   1013             }
   1014         }else if(isEscaped){
   1015             isEscaped =FALSE;
   1016             if (newStrength == UCOL_TOK_UNSET) {
   1017                 *status = U_INVALID_FORMAT_ERROR;
   1018                 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1019                 DBG_FORMAT_ERROR
   1020                 return NULL;
   1021                 // enabling rules to start with non-tokens a < b
   1022                 // newStrength = UCOL_TOK_RESET;
   1023             }
   1024             if(ch != 0x0000  && src->current != src->end) {
   1025                 if (inChars) {
   1026                     if(src->parsedToken.charsLen == 0) {
   1027                         src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
   1028                     }
   1029                     src->parsedToken.charsLen++;
   1030                 } else {
   1031                     if(newExtensionLen == 0) {
   1032                         extensionOffset = (uint32_t)(src->current - src->source);
   1033                     }
   1034                     newExtensionLen++;
   1035                 }
   1036             }
   1037         }else {
   1038             if(!PatternProps::isWhiteSpace(ch)) {
   1039                 /* Sets the strength for this entry */
   1040                 switch (ch) {
   1041                 case 0x003D/*'='*/ :
   1042                     if (newStrength != UCOL_TOK_UNSET) {
   1043                         goto EndOfLoop;
   1044                     }
   1045 
   1046                     /* if we start with strength, we'll reset to top */
   1047                     if(startOfRules == TRUE) {
   1048                         src->parsedToken.indirectIndex = 5;
   1049                         top = ucol_tok_doSetTop(src, status);
   1050                         newStrength = UCOL_TOK_RESET;
   1051                         goto EndOfLoop;
   1052                     }
   1053                     newStrength = UCOL_IDENTICAL;
   1054                     if(*(src->current+1) == 0x002A) {/*'*'*/
   1055                         src->current++;
   1056                         src->isStarred = TRUE;
   1057                     }
   1058                     break;
   1059 
   1060                 case 0x002C/*','*/:
   1061                     if (newStrength != UCOL_TOK_UNSET) {
   1062                         goto EndOfLoop;
   1063                     }
   1064 
   1065                     /* if we start with strength, we'll reset to top */
   1066                     if(startOfRules == TRUE) {
   1067                         src->parsedToken.indirectIndex = 5;
   1068                         top = ucol_tok_doSetTop(src, status);
   1069                         newStrength = UCOL_TOK_RESET;
   1070                         goto EndOfLoop;
   1071                     }
   1072                     newStrength = UCOL_TERTIARY;
   1073                     break;
   1074 
   1075                 case  0x003B/*';'*/:
   1076                     if (newStrength != UCOL_TOK_UNSET) {
   1077                         goto EndOfLoop;
   1078                     }
   1079 
   1080                     /* if we start with strength, we'll reset to top */
   1081                     if(startOfRules == TRUE) {
   1082                         src->parsedToken.indirectIndex = 5;
   1083                         top = ucol_tok_doSetTop(src, status);
   1084                         newStrength = UCOL_TOK_RESET;
   1085                         goto EndOfLoop;
   1086                     }
   1087                     newStrength = UCOL_SECONDARY;
   1088                     break;
   1089 
   1090                 case 0x003C/*'<'*/:
   1091                     if (newStrength != UCOL_TOK_UNSET) {
   1092                         goto EndOfLoop;
   1093                     }
   1094 
   1095                     /* if we start with strength, we'll reset to top */
   1096                     if(startOfRules == TRUE) {
   1097                         src->parsedToken.indirectIndex = 5;
   1098                         top = ucol_tok_doSetTop(src, status);
   1099                         newStrength = UCOL_TOK_RESET;
   1100                         goto EndOfLoop;
   1101                     }
   1102                     /* before this, do a scan to verify whether this is */
   1103                     /* another strength */
   1104                     if(*(src->current+1) == 0x003C) {
   1105                         src->current++;
   1106                         if(*(src->current+1) == 0x003C) {
   1107                             src->current++; /* three in a row! */
   1108                             newStrength = UCOL_TERTIARY;
   1109                         } else { /* two in a row */
   1110                             newStrength = UCOL_SECONDARY;
   1111                         }
   1112                     } else { /* just one */
   1113                         newStrength = UCOL_PRIMARY;
   1114                     }
   1115                     if(*(src->current+1) == 0x002A) {/*'*'*/
   1116                         src->current++;
   1117                         src->isStarred = TRUE;
   1118                     }
   1119                     break;
   1120 
   1121                 case 0x0026/*'&'*/:
   1122                     if (newStrength != UCOL_TOK_UNSET) {
   1123                         /**/
   1124                         goto EndOfLoop;
   1125                     }
   1126 
   1127                     newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
   1128                     break;
   1129 
   1130                 case 0x005b/*'['*/:
   1131                     /* options - read an option, analyze it */
   1132                     if(u_strchr(src->current, 0x005d /*']'*/) != NULL) {
   1133                         uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
   1134                         if(U_SUCCESS(*status)) {
   1135                             if(result & UCOL_TOK_TOP) {
   1136                                 if(newStrength == UCOL_TOK_RESET) {
   1137                                     top = ucol_tok_doSetTop(src, status);
   1138                                     if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
   1139                                         src->parsedToken.charsLen+=2;
   1140                                         buff[0] = 0x002d;
   1141                                         buff[1] = before;
   1142                                         ucol_tok_addToExtraCurrent(src, buff, 2, status);
   1143                                     }
   1144 
   1145                                     src->current++;
   1146                                     goto EndOfLoop;
   1147                                 } else {
   1148                                     *status = U_INVALID_FORMAT_ERROR;
   1149                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1150                                     DBG_FORMAT_ERROR
   1151                                 }
   1152                             } else if(result & UCOL_TOK_VARIABLE_TOP) {
   1153                                 if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
   1154                                     variableTop = TRUE;
   1155                                     src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
   1156                                     src->parsedToken.charsLen = 1;
   1157                                     buff[0] = 0xFFFF;
   1158                                     ucol_tok_addToExtraCurrent(src, buff, 1, status);
   1159                                     src->current++;
   1160                                     goto EndOfLoop;
   1161                                 } else {
   1162                                     *status = U_INVALID_FORMAT_ERROR;
   1163                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1164                                     DBG_FORMAT_ERROR
   1165                                 }
   1166                             } else if (result & UCOL_TOK_BEFORE){
   1167                                 if(newStrength == UCOL_TOK_RESET) {
   1168                                     before = result & UCOL_TOK_BEFORE;
   1169                                 } else {
   1170                                     *status = U_INVALID_FORMAT_ERROR;
   1171                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1172                                     DBG_FORMAT_ERROR
   1173                                 }
   1174                             }
   1175                         } else {
   1176                             *status = U_INVALID_FORMAT_ERROR;
   1177                             syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1178                             DBG_FORMAT_ERROR
   1179                             return NULL;
   1180                         }
   1181                     }
   1182                     break;
   1183                 case 0x0021/*! skip java thai modifier reordering*/:
   1184                     break;
   1185                 case 0x002F/*'/'*/:
   1186                     wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
   1187                     inChars = FALSE; /* we're now processing expansion */
   1188                     break;
   1189                 case 0x005C /* back slash for escaped chars */:
   1190                     isEscaped = TRUE;
   1191                     break;
   1192                     /* found a quote, we're gonna start copying */
   1193                 case 0x0027/*'\''*/:
   1194                     if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
   1195                       *status = U_INVALID_FORMAT_ERROR;
   1196                       syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1197                       DBG_FORMAT_ERROR
   1198                       return NULL;
   1199                       // enabling rules to start with a non-token character a < b
   1200                       // newStrength = UCOL_TOK_RESET;
   1201                     }
   1202 
   1203                     inQuote = TRUE;
   1204 
   1205                     if(inChars) { /* we're doing characters */
   1206                         if(wasInQuote == FALSE) {
   1207                             src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
   1208                         }
   1209                         if (src->parsedToken.charsLen != 0) {
   1210                             ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
   1211                         }
   1212                         src->parsedToken.charsLen++;
   1213                     } else { /* we're doing an expansion */
   1214                         if(wasInQuote == FALSE) {
   1215                             extensionOffset = (uint32_t)(src->extraCurrent - src->source);
   1216                         }
   1217                         if (newExtensionLen != 0) {
   1218                             ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
   1219                         }
   1220                         newExtensionLen++;
   1221                     }
   1222 
   1223                     wasInQuote = TRUE;
   1224 
   1225                     ch = *(++(src->current));
   1226                     if(ch == 0x0027) { /* copy the double quote */
   1227                         ucol_tok_addToExtraCurrent(src, &ch, 1, status);
   1228                         inQuote = FALSE;
   1229                     }
   1230                     break;
   1231 
   1232                     /* '@' is french only if the strength is not currently set */
   1233                     /* if it is, it's just a regular character in collation rules */
   1234                 case 0x0040/*'@'*/:
   1235                     if (newStrength == UCOL_TOK_UNSET) {
   1236                         src->opts->frenchCollation = UCOL_ON;
   1237                         break;
   1238                     }
   1239 
   1240                 case 0x007C /*|*/: /* this means we have actually been reading prefix part */
   1241                     // we want to store read characters to the prefix part and continue reading
   1242                     // the characters (proper way would be to restart reading the chars, but in
   1243                     // that case we would have to complicate the token hasher, which I do not
   1244                     // intend to play with. Instead, we will do prefixes when prefixes are due
   1245                     // (before adding the elements).
   1246                     src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
   1247                     src->parsedToken.prefixLen = src->parsedToken.charsLen;
   1248 
   1249                     if(inChars) { /* we're doing characters */
   1250                         if(wasInQuote == FALSE) {
   1251                             src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
   1252                         }
   1253                         if (src->parsedToken.charsLen != 0) {
   1254                             ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
   1255                         }
   1256                         src->parsedToken.charsLen++;
   1257                     }
   1258 
   1259                     wasInQuote = TRUE;
   1260 
   1261                     do {
   1262                         ch = *(++(src->current));
   1263                         // skip whitespace between '|' and the character
   1264                     } while (PatternProps::isWhiteSpace(ch));
   1265                     break;
   1266 
   1267                     //charsOffset = 0;
   1268                     //newCharsLen = 0;
   1269                     //break; // We want to store the whole prefix/character sequence. If we break
   1270                     // the '|' is going to get lost.
   1271 
   1272                 case 0x002D /*-*/: /* A range. */
   1273                     if (newStrength != UCOL_TOK_UNSET) {
   1274                       // While processing the pending token, the isStarred field
   1275                       // is reset, so it needs to be saved for the next
   1276                       // invocation.
   1277                       src->savedIsStarred = src->isStarred;
   1278                       goto EndOfLoop;
   1279                    }
   1280                    src->isStarred = src->savedIsStarred;
   1281 
   1282                    // Ranges are valid only in starred tokens.
   1283                    if (!src->isStarred) {
   1284                      *status = U_INVALID_FORMAT_ERROR;
   1285                      syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1286                      DBG_FORMAT_ERROR
   1287                      return NULL;
   1288                    }
   1289                    newStrength = src->parsedToken.strength;
   1290                    src->inRange = TRUE;
   1291                    break;
   1292 
   1293                 case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
   1294                     do {
   1295                         ch = *(++(src->current));
   1296                     } while (!isCharNewLine(ch));
   1297 
   1298                     break;
   1299                 default:
   1300                     if (newStrength == UCOL_TOK_UNSET) {
   1301                       *status = U_INVALID_FORMAT_ERROR;
   1302                       syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1303                       DBG_FORMAT_ERROR
   1304                       return NULL;
   1305                     }
   1306 
   1307                     if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
   1308                         *status = U_INVALID_FORMAT_ERROR;
   1309                         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1310                         DBG_FORMAT_ERROR
   1311                         return NULL;
   1312                     }
   1313 
   1314                     if(ch == 0x0000 && src->current+1 == src->end) {
   1315                         break;
   1316                     }
   1317 
   1318                     if (inChars) {
   1319                         if(src->parsedToken.charsLen == 0) {
   1320                             src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
   1321                         }
   1322                         src->parsedToken.charsLen++;
   1323                     } else {
   1324                         if(newExtensionLen == 0) {
   1325                             extensionOffset = (uint32_t)(src->current - src->source);
   1326                         }
   1327                         newExtensionLen++;
   1328                     }
   1329 
   1330                     break;
   1331                 }
   1332             }
   1333         }
   1334 
   1335         if(wasInQuote) {
   1336             if(ch != 0x27) {
   1337                 if(inQuote || !PatternProps::isWhiteSpace(ch)) {
   1338                     ucol_tok_addToExtraCurrent(src, &ch, 1, status);
   1339                 }
   1340             }
   1341         }
   1342 
   1343         src->current++;
   1344     }
   1345 
   1346 EndOfLoop:
   1347     wasInQuote = FALSE;
   1348     if (newStrength == UCOL_TOK_UNSET) {
   1349         return NULL;
   1350     }
   1351 
   1352     if (src->parsedToken.charsLen == 0 && top == FALSE) {
   1353         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1354         *status = U_INVALID_FORMAT_ERROR;
   1355         DBG_FORMAT_ERROR
   1356         return NULL;
   1357     }
   1358 
   1359     src->parsedToken.strength = newStrength;
   1360     src->parsedToken.extensionOffset = extensionOffset;
   1361     src->parsedToken.extensionLen = newExtensionLen;
   1362     src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
   1363 
   1364     return src->current;
   1365 }
   1366 
   1367 /*
   1368  * Parses the next token, keeps the indices in src->parsedToken, and updates the counters.
   1369  * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported.
   1370  *
   1371  * In addition to what ucol_tok_parseNextTokenInternal() does, this function does the following:
   1372  *  1) ucol_tok_parseNextTokenInternal() returns a range as a single token.  This function separates
   1373  *     it to separate tokens and returns one by one.  In order to do that, the necessary states are
   1374  *     cached as member variables of the token parser.
   1375  *  2) When encountering a range, ucol_tok_parseNextTokenInternal() processes characters up to the
   1376  *     starting character as a single list token (which is separated into individual characters here)
   1377  *     and as another list token starting with the last character in the range.  Before expanding it
   1378  *     as a list of tokens, this function expands the range by filling the intermediate characters and
   1379  *     returns them one by one as separate tokens.
   1380  * Necessary checks are done for invalid combinations.
   1381  */
   1382 U_CAPI const UChar* U_EXPORT2
   1383 ucol_tok_parseNextToken(UColTokenParser *src,
   1384                         UBool startOfRules,
   1385                         UParseError *parseError,
   1386                         UErrorCode *status)
   1387 {
   1388   const UChar *nextToken;
   1389 
   1390   if (src->inRange) {
   1391     // We are not done processing a range.  Continue it.
   1392     return ucol_tok_processNextCodePointInRange(src, status);
   1393   } else if (src->isStarred) {
   1394     // We are not done processing a starred token.  Continue it.
   1395     return ucol_tok_processNextTokenInStarredList(src);
   1396   }
   1397 
   1398   // Get the next token.
   1399   nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, status);
   1400 
   1401   if (nextToken == NULL) {
   1402     return NULL;
   1403   }
   1404 
   1405   if (src->inRange) {
   1406     // A new range has started.
   1407     // Check whether it is a chain of ranges with more than one hyphen.
   1408     if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) {
   1409         *status = U_INVALID_FORMAT_ERROR;
   1410         syntaxError(src->source,src->parsedToken.charsOffset-1,
   1411                     src->parsedToken.charsOffset+src->parsedToken.charsLen, parseError);
   1412         DBG_FORMAT_ERROR
   1413         return NULL;
   1414     }
   1415 
   1416     // The current token indicates the second code point of the range.
   1417     // Process just that, and then proceed with the star.
   1418     src->currentStarredCharIndex = src->parsedToken.charsOffset;
   1419     U16_NEXT(src->source, src->currentStarredCharIndex,
   1420              (uint32_t)(src->end - src->source), src->lastRangeCp);
   1421     if (src->lastRangeCp <= src->previousCp) {
   1422         *status = U_INVALID_FORMAT_ERROR;
   1423         syntaxError(src->source,src->parsedToken.charsOffset-1,
   1424                     src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
   1425         DBG_FORMAT_ERROR
   1426         return NULL;
   1427     }
   1428 
   1429     // Set current range code point to process the range loop
   1430     src->currentRangeCp = src->previousCp + 1;
   1431 
   1432     src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
   1433 
   1434     return ucol_tok_processNextCodePointInRange(src, status);
   1435  } else if (src->isStarred) {
   1436     // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that
   1437     // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be
   1438     // separated into several tokens and returned.
   1439     src->currentStarredCharIndex = src->parsedToken.charsOffset;
   1440     src->lastStarredCharIndex =  src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
   1441 
   1442     return ucol_tok_processNextTokenInStarredList(src);
   1443   } else {
   1444     // Set previous codepoint
   1445     U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end - src->source), src->previousCp);
   1446   }
   1447   return nextToken;
   1448 }
   1449 
   1450 
   1451 /*
   1452 Processing Description
   1453 1 Build a ListList. Each list has a header, which contains two lists (positive
   1454 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
   1455 reset may be null.
   1456 2 As you process, you keep a LAST pointer that points to the last token you
   1457 handled.
   1458 
   1459 */
   1460 
   1461 static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext,
   1462                                       UParseError *parseError, UErrorCode *status)
   1463 {
   1464     if(src->resultLen == src->listCapacity) {
   1465         // Unfortunately, this won't work, as we store addresses of lhs in token
   1466         src->listCapacity *= 2;
   1467         src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
   1468         if(src->lh == NULL) {
   1469             *status = U_MEMORY_ALLOCATION_ERROR;
   1470             return NULL;
   1471         }
   1472     }
   1473     /* do the reset thing */
   1474     UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
   1475     /* test for NULL */
   1476     if (sourceToken == NULL) {
   1477         *status = U_MEMORY_ALLOCATION_ERROR;
   1478         return NULL;
   1479     }
   1480     sourceToken->rulesToParseHdl = &(src->source);
   1481     sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
   1482     sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
   1483 
   1484     sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
   1485     sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
   1486 
   1487     // keep the flags around so that we know about before
   1488     sourceToken->flags = src->parsedToken.flags;
   1489 
   1490     if(src->parsedToken.prefixOffset != 0) {
   1491         // this is a syntax error
   1492         *status = U_INVALID_FORMAT_ERROR;
   1493         syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
   1494         DBG_FORMAT_ERROR
   1495         uprv_free(sourceToken);
   1496         return 0;
   1497     } else {
   1498         sourceToken->prefix = 0;
   1499     }
   1500 
   1501     sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
   1502     sourceToken->strength = UCOL_TOK_RESET;
   1503     sourceToken->next = NULL;
   1504     sourceToken->previous = NULL;
   1505     sourceToken->noOfCEs = 0;
   1506     sourceToken->noOfExpCEs = 0;
   1507     sourceToken->listHeader = &src->lh[src->resultLen];
   1508 
   1509     src->lh[src->resultLen].first = NULL;
   1510     src->lh[src->resultLen].last = NULL;
   1511     src->lh[src->resultLen].first = NULL;
   1512     src->lh[src->resultLen].last = NULL;
   1513 
   1514     src->lh[src->resultLen].reset = sourceToken;
   1515 
   1516     /*
   1517     3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
   1518     First convert all expansions into normal form. Examples:
   1519     If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
   1520     d * ... into &x * c/y * d * ...
   1521     Note: reset values can never have expansions, although they can cause the
   1522     very next item to have one. They may be contractions, if they are found
   1523     earlier in the list.
   1524     */
   1525     *expandNext = 0;
   1526     if(expand != NULL) {
   1527         /* check to see if there is an expansion */
   1528         if(src->parsedToken.charsLen > 1) {
   1529             uint32_t resetCharsOffset;
   1530             resetCharsOffset = (uint32_t)(expand - src->source);
   1531             sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
   1532             *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
   1533         }
   1534     }
   1535 
   1536     src->resultLen++;
   1537 
   1538     uhash_put(src->tailored, sourceToken, sourceToken, status);
   1539 
   1540     return sourceToken;
   1541 }
   1542 
   1543 static
   1544 inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
   1545     if(U_FAILURE(*status)) {
   1546         return NULL;
   1547     }
   1548     /* this is a virgin before - we need to fish the anchor from the UCA */
   1549     collIterate s;
   1550     uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
   1551     uint32_t CE, SecondCE;
   1552     // uint32_t invPos;
   1553     if(sourceToken != NULL) {
   1554         uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status);
   1555     } else {
   1556         uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status);
   1557     }
   1558     if(U_FAILURE(*status)) {
   1559         return NULL;
   1560     }
   1561 
   1562     baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
   1563     baseContCE = ucol_getNextCE(src->UCA, &s, status);
   1564     if(baseContCE == UCOL_NO_MORE_CES) {
   1565         baseContCE = 0;
   1566     }
   1567 
   1568 
   1569     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
   1570     uint32_t ch = 0;
   1571     uint32_t expandNext = 0;
   1572     UColToken key;
   1573 
   1574     if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
   1575         uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
   1576         uint32_t raw = uprv_uca_getRawFromImplicit(primary);
   1577         ch = uprv_uca_getCodePointFromRaw(raw-1);
   1578         uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
   1579         CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
   1580         SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
   1581 
   1582         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
   1583         *src->extraCurrent++ = 0xFFFE;
   1584         *src->extraCurrent++ = (UChar)ch;
   1585         src->parsedToken.charsLen++;
   1586 
   1587         key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
   1588         key.rulesToParseHdl = &(src->source);
   1589 
   1590         //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
   1591         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
   1592 
   1593         if(sourceToken == NULL) {
   1594             src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
   1595             if(isContinuation(SecondCE)) {
   1596                 src->lh[src->resultLen].baseContCE = SecondCE;
   1597             } else {
   1598                 src->lh[src->resultLen].baseContCE = 0;
   1599             }
   1600             src->lh[src->resultLen].nextCE = 0;
   1601             src->lh[src->resultLen].nextContCE = 0;
   1602             src->lh[src->resultLen].previousCE = 0;
   1603             src->lh[src->resultLen].previousContCE = 0;
   1604 
   1605             src->lh[src->resultLen].indirect = FALSE;
   1606 
   1607             sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
   1608         }
   1609 
   1610     } else {
   1611         /* invPos = */ ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
   1612 
   1613         // we got the previous CE. Now we need to see if the difference between
   1614         // the two CEs is really of the requested strength.
   1615         // if it's a bigger difference (we asked for secondary and got primary), we
   1616         // need to modify the CE.
   1617         if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
   1618             // adjust the strength
   1619             // now we are in the situation where our baseCE should actually be modified in
   1620             // order to get the CE in the right position.
   1621             if(strength == UCOL_SECONDARY) {
   1622                 CE = baseCE - 0x0200;
   1623             } else { // strength == UCOL_TERTIARY
   1624                 CE = baseCE - 0x02;
   1625             }
   1626             if(baseContCE) {
   1627                 if(strength == UCOL_SECONDARY) {
   1628                     SecondCE = baseContCE - 0x0200;
   1629                 } else { // strength == UCOL_TERTIARY
   1630                     SecondCE = baseContCE - 0x02;
   1631                 }
   1632             }
   1633         }
   1634 
   1635 #if 0
   1636         // the code below relies on getting a code point from the inverse table, in order to be
   1637         // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
   1638         // 1. There are many code points that have the same CE
   1639         // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
   1640         // Also, in case when there is no equivalent strength before an element, we have to actually
   1641         // construct one. For example, &[before 2]a << x won't result in x << a, because the element
   1642         // before a is a primary difference.
   1643 
   1644         //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
   1645 
   1646 
   1647         ch = CETable[3*invPos+2];
   1648 
   1649         if((ch &  UCOL_INV_SIZEMASK) != 0) {
   1650             uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
   1651             uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
   1652             ch = conts[offset];
   1653         }
   1654 
   1655         *src->extraCurrent++ = (UChar)ch;
   1656         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
   1657         src->parsedToken.charsLen = 1;
   1658 
   1659         // We got an UCA before. However, this might have been tailored.
   1660         // example:
   1661         // &\u30ca = \u306a
   1662         // &[before 3]\u306a<<<\u306a|\u309d
   1663 
   1664 
   1665         // uint32_t key = (*newCharsLen << 24) | *charsOffset;
   1666         key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
   1667         key.rulesToParseHdl = &(src->source);
   1668 
   1669         //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
   1670         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
   1671 #endif
   1672 
   1673         // here is how it should be. The situation such as &[before 1]a < x, should be
   1674         // resolved exactly as if we wrote &a > x.
   1675         // therefore, I don't really care if the UCA value before a has been changed.
   1676         // However, I do care if the strength between my element and the previous element
   1677         // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
   1678         // have to construct the base CE.
   1679 
   1680 
   1681 
   1682         // if we found a tailored thing, we have to use the UCA value and construct
   1683         // a new reset token with constructed name
   1684         //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
   1685         // character to which we want to anchor is already tailored.
   1686         // We need to construct a new token which will be the anchor
   1687         // point
   1688         //*(src->extraCurrent-1) = 0xFFFE;
   1689         //*src->extraCurrent++ = (UChar)ch;
   1690         // grab before
   1691         src->parsedToken.charsOffset -= 10;
   1692         src->parsedToken.charsLen += 10;
   1693         src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
   1694         if(isContinuation(SecondCE)) {
   1695             src->lh[src->resultLen].baseContCE = SecondCE;
   1696         } else {
   1697             src->lh[src->resultLen].baseContCE = 0;
   1698         }
   1699         src->lh[src->resultLen].nextCE = 0;
   1700         src->lh[src->resultLen].nextContCE = 0;
   1701         src->lh[src->resultLen].previousCE = 0;
   1702         src->lh[src->resultLen].previousContCE = 0;
   1703 
   1704         src->lh[src->resultLen].indirect = FALSE;
   1705 
   1706         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
   1707         //}
   1708     }
   1709 
   1710     return sourceToken;
   1711 
   1712 }
   1713 
   1714 uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
   1715     UColToken *lastToken = NULL;
   1716     const UChar *parseEnd = NULL;
   1717     uint32_t expandNext = 0;
   1718     UBool variableTop = FALSE;
   1719     UBool top = FALSE;
   1720     uint16_t specs = 0;
   1721     UColTokListHeader *ListList = NULL;
   1722 
   1723     src->parsedToken.strength = UCOL_TOK_UNSET;
   1724 
   1725     ListList = src->lh;
   1726 
   1727     if(U_FAILURE(*status)) {
   1728         return 0;
   1729     }
   1730 #ifdef DEBUG_FOR_CODE_POINTS
   1731     char filename[35];
   1732     sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid());
   1733     dfcp_fp = fopen(filename, "a");
   1734     fprintf(stdout, "Output is in the file %s.\n", filename);
   1735 #endif
   1736 
   1737 #ifdef DEBUG_FOR_COLL_RULES
   1738     std::string s3;
   1739     UnicodeString(src->source).toUTF8String(s3);
   1740     std::cout << "src->source = " << s3 << std::endl;
   1741 #endif
   1742 
   1743     while(src->current < src->end || src->isStarred) {
   1744         src->parsedToken.prefixOffset = 0;
   1745 
   1746         parseEnd = ucol_tok_parseNextToken(src,
   1747             (UBool)(lastToken == NULL),
   1748             parseError,
   1749             status);
   1750 
   1751         specs = src->parsedToken.flags;
   1752 
   1753 
   1754         variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
   1755         top = ((specs & UCOL_TOK_TOP) != 0);
   1756 
   1757         if(U_SUCCESS(*status) && parseEnd != NULL) {
   1758             UColToken *sourceToken = NULL;
   1759             //uint32_t key = 0;
   1760             uint32_t lastStrength = UCOL_TOK_UNSET;
   1761 
   1762             if(lastToken != NULL ) {
   1763                 lastStrength = lastToken->strength;
   1764             }
   1765 
   1766 #ifdef DEBUG_FOR_CODE_POINTS
   1767             UChar32 cp;
   1768             U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->extraEnd - src->source), cp);
   1769             fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsedToken.strength);
   1770 #endif
   1771             //key = newCharsLen << 24 | charsOffset;
   1772             UColToken key;
   1773             key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
   1774             key.rulesToParseHdl = &(src->source);
   1775 
   1776             /*  4 Lookup each source in the CharsToToken map, and find a sourceToken */
   1777             sourceToken = (UColToken *)uhash_get(src->tailored, &key);
   1778 
   1779             if(src->parsedToken.strength != UCOL_TOK_RESET) {
   1780                 if(lastToken == NULL) { /* this means that rules haven't started properly */
   1781                     *status = U_INVALID_FORMAT_ERROR;
   1782                     syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
   1783                     DBG_FORMAT_ERROR
   1784                     return 0;
   1785                 }
   1786                 /*  6 Otherwise (when relation != reset) */
   1787                 if(sourceToken == NULL) {
   1788                     /* If sourceToken is null, create new one, */
   1789                     sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
   1790                     /* test for NULL */
   1791                     if (sourceToken == NULL) {
   1792                         *status = U_MEMORY_ALLOCATION_ERROR;
   1793                         return 0;
   1794                     }
   1795                     sourceToken->rulesToParseHdl = &(src->source);
   1796                     sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
   1797 
   1798                     sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
   1799 
   1800                     sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
   1801                     sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
   1802 
   1803                     sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
   1804                     sourceToken->next = NULL;
   1805                     sourceToken->previous = NULL;
   1806                     sourceToken->noOfCEs = 0;
   1807                     sourceToken->noOfExpCEs = 0;
   1808                     // keep the flags around so that we know about before
   1809                     sourceToken->flags = src->parsedToken.flags;
   1810                     uhash_put(src->tailored, sourceToken, sourceToken, status);
   1811                     if(U_FAILURE(*status)) {
   1812                         return 0;
   1813                     }
   1814                 } else {
   1815                     /* we could have fished out a reset here */
   1816                     if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
   1817                         /* otherwise remove sourceToken from where it was. */
   1818                         if(sourceToken->next != NULL) {
   1819                             if(sourceToken->next->strength > sourceToken->strength) {
   1820                                 sourceToken->next->strength = sourceToken->strength;
   1821                             }
   1822                             sourceToken->next->previous = sourceToken->previous;
   1823                         } else {
   1824                             sourceToken->listHeader->last = sourceToken->previous;
   1825                         }
   1826 
   1827                         if(sourceToken->previous != NULL) {
   1828                             sourceToken->previous->next = sourceToken->next;
   1829                         } else {
   1830                             sourceToken->listHeader->first = sourceToken->next;
   1831                         }
   1832                         sourceToken->next = NULL;
   1833                         sourceToken->previous = NULL;
   1834                     }
   1835                 }
   1836 
   1837                 sourceToken->strength = src->parsedToken.strength;
   1838                 sourceToken->listHeader = lastToken->listHeader;
   1839 
   1840                 /*
   1841                 1.  Find the strongest strength in each list, and set strongestP and strongestN
   1842                 accordingly in the headers.
   1843                 */
   1844                 if(lastStrength == UCOL_TOK_RESET
   1845                     || sourceToken->listHeader->first == 0) {
   1846                         /* If LAST is a reset
   1847                         insert sourceToken in the list. */
   1848                         if(sourceToken->listHeader->first == 0) {
   1849                             sourceToken->listHeader->first = sourceToken;
   1850                             sourceToken->listHeader->last = sourceToken;
   1851                         } else { /* we need to find a place for us */
   1852                             /* and we'll get in front of the same strength */
   1853                             if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
   1854                                 sourceToken->next = sourceToken->listHeader->first;
   1855                                 sourceToken->next->previous = sourceToken;
   1856                                 sourceToken->listHeader->first = sourceToken;
   1857                                 sourceToken->previous = NULL;
   1858                             } else {
   1859                                 lastToken = sourceToken->listHeader->first;
   1860                                 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
   1861                                     lastToken = lastToken->next;
   1862                                 }
   1863                                 if(lastToken->next != NULL) {
   1864                                     lastToken->next->previous = sourceToken;
   1865                                 } else {
   1866                                     sourceToken->listHeader->last = sourceToken;
   1867                                 }
   1868                                 sourceToken->previous = lastToken;
   1869                                 sourceToken->next = lastToken->next;
   1870                                 lastToken->next = sourceToken;
   1871                             }
   1872                         }
   1873                     } else {
   1874                         /* Otherwise (when LAST is not a reset)
   1875                         if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
   1876                         otherwise insert before.
   1877                         when inserting after or before, search to the next position with the same
   1878                         strength in that direction. (This is called postpone insertion).         */
   1879                         if(sourceToken != lastToken) {
   1880                             if(lastToken->polarity == sourceToken->polarity) {
   1881                                 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
   1882                                     lastToken = lastToken->next;
   1883                                 }
   1884                                 sourceToken->previous = lastToken;
   1885                                 if(lastToken->next != NULL) {
   1886                                     lastToken->next->previous = sourceToken;
   1887                                 } else {
   1888                                     sourceToken->listHeader->last = sourceToken;
   1889                                 }
   1890 
   1891                                 sourceToken->next = lastToken->next;
   1892                                 lastToken->next = sourceToken;
   1893                             } else {
   1894                                 while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
   1895                                     lastToken = lastToken->previous;
   1896                                 }
   1897                                 sourceToken->next = lastToken;
   1898                                 if(lastToken->previous != NULL) {
   1899                                     lastToken->previous->next = sourceToken;
   1900                                 } else {
   1901                                     sourceToken->listHeader->first = sourceToken;
   1902                                 }
   1903                                 sourceToken->previous = lastToken->previous;
   1904                                 lastToken->previous = sourceToken;
   1905                             }
   1906                         } else { /* repeated one thing twice in rules, stay with the stronger strength */
   1907                             if(lastStrength < sourceToken->strength) {
   1908                                 sourceToken->strength = lastStrength;
   1909                             }
   1910                         }
   1911                     }
   1912 
   1913                     /* if the token was a variable top, we're gonna put it in */
   1914                     if(variableTop == TRUE && src->varTop == NULL) {
   1915                         variableTop = FALSE;
   1916                         src->varTop = sourceToken;
   1917                     }
   1918 
   1919                     // Treat the expansions.
   1920                     // There are two types of expansions: explicit (x / y) and reset based propagating expansions
   1921                     // (&abc * d * e <=> &ab * d / c * e / c)
   1922                     // if both of them are in effect for a token, they are combined.
   1923 
   1924                     sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
   1925 
   1926                     if(expandNext != 0) {
   1927                         if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
   1928                             expandNext = 0;
   1929                         } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
   1930                             sourceToken->expansion = expandNext;
   1931                         } else { /* there is both explicit and implicit expansion. We need to make a combination */
   1932                             uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
   1933                             uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
   1934                             sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
   1935                             src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
   1936                         }
   1937                     }
   1938 
   1939                     // This is just for debugging purposes
   1940                     if(sourceToken->expansion != 0) {
   1941                         sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
   1942                     } else {
   1943                         sourceToken->debugExpansion = 0;
   1944                     }
   1945                     // if the previous token was a reset before, the strength of this
   1946                     // token must match the strength of before. Otherwise we have an
   1947                     // undefined situation.
   1948                     // In other words, we currently have a cludge which we use to
   1949                     // represent &a >> x. This is written as &[before 2]a << x.
   1950                     if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
   1951                         uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
   1952                         if(beforeStrength != sourceToken->strength) {
   1953                             *status = U_INVALID_FORMAT_ERROR;
   1954                             syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
   1955                             DBG_FORMAT_ERROR
   1956                             return 0;
   1957                         }
   1958                     }
   1959             } else {
   1960                 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
   1961                     /* if the previous token was also a reset, */
   1962                     /*this means that we have two consecutive resets */
   1963                     /* and we want to remove the previous one if empty*/
   1964                     if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
   1965                         src->resultLen--;
   1966                     }
   1967                 }
   1968 
   1969                 if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
   1970                     uint32_t searchCharsLen = src->parsedToken.charsLen;
   1971                     while(searchCharsLen > 1 && sourceToken == NULL) {
   1972                         searchCharsLen--;
   1973                         //key = searchCharsLen << 24 | charsOffset;
   1974                         UColToken key;
   1975                         key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
   1976                         key.rulesToParseHdl = &(src->source);
   1977                         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
   1978                     }
   1979                     if(sourceToken != NULL) {
   1980                         expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
   1981                     }
   1982                 }
   1983 
   1984                 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
   1985                     if(top == FALSE) { /* there is no indirection */
   1986                         uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
   1987                         if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
   1988                             /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
   1989                             while(sourceToken->strength > strength && sourceToken->previous != NULL) {
   1990                                 sourceToken = sourceToken->previous;
   1991                             }
   1992                             /* here, either we hit the strength or NULL */
   1993                             if(sourceToken->strength == strength) {
   1994                                 if(sourceToken->previous != NULL) {
   1995                                     sourceToken = sourceToken->previous;
   1996                                 } else { /* start of list */
   1997                                     sourceToken = sourceToken->listHeader->reset;
   1998                                 }
   1999                             } else { /* we hit NULL */
   2000                                 /* we should be doing the else part */
   2001                                 sourceToken = sourceToken->listHeader->reset;
   2002                                 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
   2003                             }
   2004                         } else {
   2005                             sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
   2006                         }
   2007                     } else { /* this is both before and indirection */
   2008                         top = FALSE;
   2009                         ListList[src->resultLen].previousCE = 0;
   2010                         ListList[src->resultLen].previousContCE = 0;
   2011                         ListList[src->resultLen].indirect = TRUE;
   2012                         /* we need to do slightly more work. we need to get the baseCE using the */
   2013                         /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
   2014                         /* in ucol_bld */
   2015                         uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
   2016                         uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
   2017                         uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
   2018                         uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
   2019 
   2020                         UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
   2021                         if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) &&
   2022                            (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
   2023                             uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
   2024                             uint32_t raw = uprv_uca_getRawFromImplicit(primary);
   2025                             uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
   2026                             CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
   2027                             SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
   2028                         } else {
   2029                             /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
   2030                             ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
   2031                         }
   2032 
   2033                         ListList[src->resultLen].baseCE = CE;
   2034                         ListList[src->resultLen].baseContCE = SecondCE;
   2035                         ListList[src->resultLen].nextCE = 0;
   2036                         ListList[src->resultLen].nextContCE = 0;
   2037 
   2038                         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
   2039                     }
   2040                 }
   2041 
   2042 
   2043                 /*  5 If the relation is a reset:
   2044                 If sourceToken is null
   2045                 Create new list, create new sourceToken, make the baseCE from source, put
   2046                 the sourceToken in ListHeader of the new list */
   2047                 if(sourceToken == NULL) {
   2048                     /*
   2049                     3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
   2050                     First convert all expansions into normal form. Examples:
   2051                     If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
   2052                     d * ... into &x * c/y * d * ...
   2053                     Note: reset values can never have expansions, although they can cause the
   2054                     very next item to have one. They may be contractions, if they are found
   2055                     earlier in the list.
   2056                     */
   2057                     if(top == FALSE) {
   2058                         collIterate s;
   2059                         uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
   2060 
   2061                         uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status);
   2062 
   2063                         CE = ucol_getNextCE(src->UCA, &s, status);
   2064                         const UChar *expand = s.pos;
   2065                         SecondCE = ucol_getNextCE(src->UCA, &s, status);
   2066 
   2067                         ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
   2068                         if(isContinuation(SecondCE)) {
   2069                             ListList[src->resultLen].baseContCE = SecondCE;
   2070                         } else {
   2071                             ListList[src->resultLen].baseContCE = 0;
   2072                         }
   2073                         ListList[src->resultLen].nextCE = 0;
   2074                         ListList[src->resultLen].nextContCE = 0;
   2075                         ListList[src->resultLen].previousCE = 0;
   2076                         ListList[src->resultLen].previousContCE = 0;
   2077                         ListList[src->resultLen].indirect = FALSE;
   2078                         sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
   2079                     } else { /* top == TRUE */
   2080                         /* just use the supplied values */
   2081                         top = FALSE;
   2082                         ListList[src->resultLen].previousCE = 0;
   2083                         ListList[src->resultLen].previousContCE = 0;
   2084                         ListList[src->resultLen].indirect = TRUE;
   2085                         ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
   2086                         ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
   2087                         ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
   2088                         ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
   2089 
   2090                         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
   2091 
   2092                     }
   2093                 } else { /* reset to something already in rules */
   2094                     top = FALSE;
   2095                 }
   2096             }
   2097             /*  7 After all this, set LAST to point to sourceToken, and goto step 3. */
   2098             lastToken = sourceToken;
   2099         } else {
   2100             if(U_FAILURE(*status)) {
   2101                 return 0;
   2102             }
   2103         }
   2104     }
   2105 #ifdef DEBUG_FOR_CODE_POINTS
   2106     fclose(dfcp_fp);
   2107 #endif
   2108 
   2109 
   2110     if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
   2111         src->resultLen--;
   2112     }
   2113     return src->resultLen;
   2114 }
   2115 
   2116 const UChar* ucol_tok_getRulesFromBundle(
   2117     void* /*context*/,
   2118     const char* locale,
   2119     const char* type,
   2120     int32_t* pLength,
   2121     UErrorCode* status)
   2122 {
   2123     const UChar* rules = NULL;
   2124     UResourceBundle* bundle;
   2125     UResourceBundle* collations;
   2126     UResourceBundle* collation;
   2127 
   2128     *pLength = 0;
   2129 
   2130     bundle = ures_open(U_ICUDATA_COLL, locale, status);
   2131     if(U_SUCCESS(*status)){
   2132         collations = ures_getByKey(bundle, "collations", NULL, status);
   2133         if(U_SUCCESS(*status)){
   2134             collation = ures_getByKey(collations, type, NULL, status);
   2135             if(U_SUCCESS(*status)){
   2136                 rules = ures_getStringByKey(collation, "Sequence", pLength, status);
   2137                 if(U_FAILURE(*status)){
   2138                     *pLength = 0;
   2139                     rules = NULL;
   2140                 }
   2141                 ures_close(collation);
   2142             }
   2143             ures_close(collations);
   2144         }
   2145     }
   2146 
   2147     ures_close(bundle);
   2148 
   2149     return rules;
   2150 }
   2151 
   2152 void ucol_tok_initTokenList(
   2153     UColTokenParser *src,
   2154     const UChar *rules,
   2155     uint32_t rulesLength,
   2156     const UCollator *UCA,
   2157     GetCollationRulesFunction importFunc,
   2158     void* context,
   2159     UErrorCode *status) {
   2160     U_NAMESPACE_USE
   2161 
   2162     uint32_t nSize = 0;
   2163     uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
   2164 
   2165     bool needToDeallocRules = false;
   2166 
   2167     if(U_FAILURE(*status)) {
   2168         return;
   2169     }
   2170 
   2171     // set everything to zero, so that we can clean up gracefully
   2172     uprv_memset(src, 0, sizeof(UColTokenParser));
   2173 
   2174     // first we need to find options that don't like to be normalized,
   2175     // like copy and remove...
   2176     //const UChar *openBrace = rules;
   2177     int32_t optionNumber = -1;
   2178     const UChar *setStart = NULL;
   2179     uint32_t i = 0;
   2180     while(i < rulesLength) {
   2181         if(rules[i] == 0x005B) {    // '[': start of an option
   2182             /* Gets the following:
   2183                optionNumber: The index of the option.
   2184                setStart: The pointer at which the option arguments start.
   2185              */
   2186             optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
   2187 
   2188             if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
   2189                 // [optimize]
   2190                 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
   2191                 if(U_SUCCESS(*status)) {
   2192                     if(src->copySet == NULL) {
   2193                         src->copySet = newSet;
   2194                     } else {
   2195                         uset_addAll(src->copySet, newSet);
   2196                         uset_close(newSet);
   2197                     }
   2198                 } else {
   2199                     return;
   2200                 }
   2201             } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
   2202                 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
   2203                 if(U_SUCCESS(*status)) {
   2204                     if(src->removeSet == NULL) {
   2205                         src->removeSet = newSet;
   2206                     } else {
   2207                         uset_addAll(src->removeSet, newSet);
   2208                         uset_close(newSet);
   2209                     }
   2210                 } else {
   2211                     return;
   2212                 }
   2213             } else if(optionNumber == OPTION_IMPORT){
   2214                 // [import <collation-name>]
   2215 
   2216                 // Find the address of the closing ].
   2217                 UChar* import_end = u_strchr(setStart, 0x005D);
   2218                 int32_t optionEndOffset = (int32_t)(import_end + 1 - rules);
   2219                 // Ignore trailing whitespace.
   2220                 while(PatternProps::isWhiteSpace(*(import_end-1))) {
   2221                     --import_end;
   2222                 }
   2223 
   2224                 int32_t optionLength = (int32_t)(import_end - setStart);
   2225                 char option[50];
   2226                 if(optionLength >= (int32_t)sizeof(option)) {
   2227                     *status = U_ILLEGAL_ARGUMENT_ERROR;
   2228                     return;
   2229                 }
   2230                 u_UCharsToChars(setStart, option, optionLength);
   2231                 option[optionLength] = 0;
   2232 
   2233                 *status = U_ZERO_ERROR;
   2234                 char locale[50];
   2235                 int32_t templ;
   2236                 uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &templ, status);
   2237                 if(U_FAILURE(*status)) {
   2238                     *status = U_ILLEGAL_ARGUMENT_ERROR;
   2239                     return;
   2240                 }
   2241 
   2242                 char type[50];
   2243                 if (uloc_getKeywordValue(locale, "collation", type, (int32_t)sizeof(type), status) <= 0 ||
   2244                     U_FAILURE(*status)
   2245                 ) {
   2246                     *status = U_ZERO_ERROR;
   2247                     uprv_strcpy(type, "standard");
   2248                 }
   2249 
   2250                 // TODO: Use public functions when available, see ticket #8134.
   2251                 char *keywords = (char *)locale_getKeywordsStart(locale);
   2252                 if(keywords != NULL) {
   2253                     *keywords = 0;
   2254                 }
   2255 
   2256                 int32_t importRulesLength = 0;
   2257                 const UChar* importRules = importFunc(context, locale, type, &importRulesLength, status);
   2258 
   2259 #ifdef DEBUG_FOR_COLL_RULES
   2260                 std::string s;
   2261                 UnicodeString(importRules).toUTF8String(s);
   2262                 std::cout << "Import rules = " << s << std::endl;
   2263 #endif
   2264 
   2265                 // Add the length of the imported rules to length of the original rules,
   2266                 // and subtract the length of the import option.
   2267                 uint32_t newRulesLength = rulesLength + importRulesLength - (optionEndOffset - i);
   2268 
   2269                 UChar* newRules = (UChar*)uprv_malloc(newRulesLength*sizeof(UChar));
   2270 
   2271 #ifdef DEBUG_FOR_COLL_RULES
   2272                 std::string s1;
   2273                 UnicodeString(rules).toUTF8String(s1);
   2274                 std::cout << "Original rules = " << s1 << std::endl;
   2275 #endif
   2276 
   2277 
   2278                 // Copy the section of the original rules leading up to the import
   2279                 uprv_memcpy(newRules, rules, i*sizeof(UChar));
   2280                 // Copy the imported rules
   2281                 uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UChar));
   2282                 // Copy the rest of the original rules (minus the import option itself)
   2283                 uprv_memcpy(newRules+i+importRulesLength,
   2284                             rules+optionEndOffset,
   2285                             (rulesLength-optionEndOffset)*sizeof(UChar));
   2286 
   2287 #ifdef DEBUG_FOR_COLL_RULES
   2288                 std::string s2;
   2289                 UnicodeString(newRules).toUTF8String(s2);
   2290                 std::cout << "Resulting rules = " << s2 << std::endl;
   2291 #endif
   2292 
   2293                 if(needToDeallocRules){
   2294                     // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
   2295                     uprv_free((void*)rules);
   2296                 }
   2297                 needToDeallocRules = true;
   2298                 rules = newRules;
   2299                 rulesLength = newRulesLength;
   2300 
   2301                 estimatedSize += importRulesLength*2;
   2302 
   2303                 // First character of the new rules needs to be processed
   2304                 i--;
   2305             }
   2306         }
   2307         //openBrace++;
   2308         i++;
   2309     }
   2310 
   2311     src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
   2312     /* test for NULL */
   2313     if (src->source == NULL) {
   2314         *status = U_MEMORY_ALLOCATION_ERROR;
   2315         return;
   2316     }
   2317     uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
   2318     nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
   2319     if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
   2320         *status = U_ZERO_ERROR;
   2321         src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
   2322         /* test for NULL */
   2323         if (src->source == NULL) {
   2324             *status = U_MEMORY_ALLOCATION_ERROR;
   2325             return;
   2326         }
   2327         nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
   2328     }
   2329     if(needToDeallocRules){
   2330         // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
   2331         uprv_free((void*)rules);
   2332     }
   2333 
   2334 
   2335     src->current = src->source;
   2336     src->end = src->source+nSize;
   2337     src->sourceCurrent = src->source;
   2338     src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
   2339     src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
   2340     src->varTop = NULL;
   2341     src->UCA = UCA;
   2342     src->invUCA = ucol_initInverseUCA(status);
   2343     src->parsedToken.charsLen = 0;
   2344     src->parsedToken.charsOffset = 0;
   2345     src->parsedToken.extensionLen = 0;
   2346     src->parsedToken.extensionOffset = 0;
   2347     src->parsedToken.prefixLen = 0;
   2348     src->parsedToken.prefixOffset = 0;
   2349     src->parsedToken.flags = 0;
   2350     src->parsedToken.strength = UCOL_TOK_UNSET;
   2351     src->buildCCTabFlag = FALSE;
   2352     src->isStarred = FALSE;
   2353     src->inRange = FALSE;
   2354     src->lastRangeCp = 0;
   2355     src->previousCp = 0;
   2356 
   2357     if(U_FAILURE(*status)) {
   2358         return;
   2359     }
   2360     src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
   2361     if(U_FAILURE(*status)) {
   2362         return;
   2363     }
   2364     uhash_setValueDeleter(src->tailored, uprv_free);
   2365 
   2366     src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
   2367     /* test for NULL */
   2368     if (src->opts == NULL) {
   2369         *status = U_MEMORY_ALLOCATION_ERROR;
   2370         return;
   2371     }
   2372 
   2373     uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
   2374 
   2375     src->lh = 0;
   2376     src->listCapacity = 1024;
   2377     src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
   2378     //Test for NULL
   2379     if (src->lh == NULL) {
   2380         *status = U_MEMORY_ALLOCATION_ERROR;
   2381         return;
   2382     }
   2383     uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
   2384     src->resultLen = 0;
   2385 
   2386     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
   2387 
   2388     // UCOL_RESET_TOP_VALUE
   2389     setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
   2390     // UCOL_FIRST_PRIMARY_IGNORABLE
   2391     setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
   2392     // UCOL_LAST_PRIMARY_IGNORABLE
   2393     setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
   2394     // UCOL_FIRST_SECONDARY_IGNORABLE
   2395     setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
   2396     // UCOL_LAST_SECONDARY_IGNORABLE
   2397     setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
   2398     // UCOL_FIRST_TERTIARY_IGNORABLE
   2399     setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
   2400     // UCOL_LAST_TERTIARY_IGNORABLE
   2401     setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
   2402     // UCOL_FIRST_VARIABLE
   2403     setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
   2404     // UCOL_LAST_VARIABLE
   2405     setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
   2406     // UCOL_FIRST_NON_VARIABLE
   2407     setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
   2408     // UCOL_LAST_NON_VARIABLE
   2409     setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
   2410     // UCOL_FIRST_IMPLICIT
   2411     setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
   2412     // UCOL_LAST_IMPLICIT
   2413     setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
   2414     // UCOL_FIRST_TRAILING
   2415     setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
   2416     // UCOL_LAST_TRAILING
   2417     setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
   2418     ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
   2419 }
   2420 
   2421 
   2422 void ucol_tok_closeTokenList(UColTokenParser *src) {
   2423     if(src->copySet != NULL) {
   2424         uset_close(src->copySet);
   2425     }
   2426     if(src->removeSet != NULL) {
   2427         uset_close(src->removeSet);
   2428     }
   2429     if(src->tailored != NULL) {
   2430         uhash_close(src->tailored);
   2431     }
   2432     if(src->lh != NULL) {
   2433         uprv_free(src->lh);
   2434     }
   2435     if(src->source != NULL) {
   2436         uprv_free(src->source);
   2437     }
   2438     if(src->opts != NULL) {
   2439         uprv_free(src->opts);
   2440     }
   2441     if (src->reorderCodes != NULL) {
   2442         uprv_free(src->reorderCodes);
   2443     }
   2444 }
   2445 
   2446 #endif /* #if !UCONFIG_NO_COLLATION */
   2447