Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2001-2010, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  ucol_tok.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created 02/22/2001
     14 *   created by: Vladimir Weinstein
     15 *
     16 * This module reads a tailoring rule string and produces a list of
     17 * tokens that will be turned into collation elements
     18 *
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 
     23 #if !UCONFIG_NO_COLLATION
     24 
     25 #include "unicode/uscript.h"
     26 #include "unicode/ustring.h"
     27 #include "unicode/uchar.h"
     28 #include "unicode/uniset.h"
     29 
     30 #include "cmemory.h"
     31 #include "cstring.h"
     32 #include "ucol_bld.h"
     33 #include "ucol_tok.h"
     34 #include "ulocimp.h"
     35 #include "uresimp.h"
     36 #include "util.h"
     37 
     38 // Define this only for debugging.
     39 // #define DEBUG_FOR_COLL_RULES 1
     40 
     41 #ifdef DEBUG_FOR_COLL_RULES
     42 #include <iostream>
     43 #endif
     44 
     45 U_NAMESPACE_USE
     46 
     47 U_CDECL_BEGIN
     48 static int32_t U_CALLCONV
     49 uhash_hashTokens(const UHashTok k)
     50 {
     51     int32_t hash = 0;
     52     //uint32_t key = (uint32_t)k.integer;
     53     UColToken *key = (UColToken *)k.pointer;
     54     if (key != 0) {
     55         int32_t len = (key->source & 0xFF000000)>>24;
     56         int32_t inc = ((len - 32) / 32) + 1;
     57 
     58         const UChar *p = (key->source & 0x00FFFFFF) + *(key->rulesToParseHdl);
     59         const UChar *limit = p + len;
     60 
     61         while (p<limit) {
     62             hash = (hash * 37) + *p;
     63             p += inc;
     64         }
     65     }
     66     return hash;
     67 }
     68 
     69 static UBool U_CALLCONV
     70 uhash_compareTokens(const UHashTok key1, const UHashTok key2)
     71 {
     72     //uint32_t p1 = (uint32_t) key1.integer;
     73     //uint32_t p2 = (uint32_t) key2.integer;
     74     UColToken *p1 = (UColToken *)key1.pointer;
     75     UColToken *p2 = (UColToken *)key2.pointer;
     76     const UChar *s1 = (p1->source & 0x00FFFFFF) + *(p1->rulesToParseHdl);
     77     const UChar *s2 = (p2->source & 0x00FFFFFF) + *(p2->rulesToParseHdl);
     78     uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
     79     uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
     80     const UChar *end = s1+s1L-1;
     81 
     82     if (p1 == p2) {
     83         return TRUE;
     84     }
     85     if (p1->source == 0 || p2->source == 0) {
     86         return FALSE;
     87     }
     88     if(s1L != s2L) {
     89         return FALSE;
     90     }
     91     if(p1->source == p2->source) {
     92         return TRUE;
     93     }
     94     while((s1 < end) && *s1 == *s2) {
     95         ++s1;
     96         ++s2;
     97     }
     98     if(*s1 == *s2) {
     99         return TRUE;
    100     } else {
    101         return FALSE;
    102     }
    103 }
    104 U_CDECL_END
    105 
    106 /*
    107  * Debug messages used to pinpoint where a format error occurred.
    108  * A better way is to include context-sensitive information in syntaxError() function.
    109  *
 * To turn this debugging on, either uncomment the following line, or pass -DDEBUG_FOR_FORMAT_ERROR
 * on the compile line.
    112  */
    113 /* #define DEBUG_FOR_FORMAT_ERROR 1 */
    114 
    115 #ifdef DEBUG_FOR_FORMAT_ERROR
    116 #define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);}
    117 #else
    118 #define DBG_FORMAT_ERROR
    119 #endif
    120 
    121 
    122 /*
    123  * Controls debug messages so that the output can be compared before and after a
    124  * big change.  Prints the information of every code point that comes out of the
    125  * collation parser and its strength into a file.  When a big change in format
    126  * happens, the files before and after the change should be identical.
    127  *
 * To turn this debugging on, either uncomment the following line, or pass -DDEBUG_FOR_CODE_POINTS
 * on the compile line.
    130  */
    131 // #define DEBUG_FOR_CODE_POINTS 1
    132 
    133 #ifdef DEBUG_FOR_CODE_POINTS
    134     FILE* dfcp_fp = NULL;
    135 #endif
    136 
    137 
    138 /*static inline void U_CALLCONV
    139 uhash_freeBlockWrapper(void *obj) {
    140     uhash_freeBlock(obj);
    141 }*/
    142 
    143 
    144 typedef struct {
    145     uint32_t startCE;
    146     uint32_t startContCE;
    147     uint32_t limitCE;
    148     uint32_t limitContCE;
    149 } indirectBoundaries;
    150 
    151 /* these values are used for finding CE values for indirect positioning. */
    152 /* Indirect positioning is a mechanism for allowing resets on symbolic   */
    153 /* values. It only works for resets and you cannot tailor indirect names */
    154 /* An indirect name can define either an anchor point or a range. An     */
    155 /* anchor point behaves in exactly the same way as a code point in reset */
    156 /* would, except that it cannot be tailored. A range (we currently only  */
    157 /* know for the [top] range will explicitly set the upper bound for      */
    158 /* generated CEs, thus allowing for better control over how many CEs can */
    159 /* be squeezed between in the range without performance penalty.         */
    160 /* In that respect, we use [top] for tailoring of locales that use CJK   */
    161 /* characters. Other indirect values are currently a pure convenience,   */
    162 /* they can be used to assure that the CEs will be always positioned in  */
    163 /* the same place relative to a point with known properties (e.g. first  */
    164 /* primary ignorable). */
    165 static indirectBoundaries ucolIndirectBoundaries[15];
    166 /*
    167 static indirectBoundaries ucolIndirectBoundaries[11] = {
    168 { UCOL_RESET_TOP_VALUE,               0,
    169 UCOL_NEXT_TOP_VALUE,                0 },
    170 { UCOL_FIRST_PRIMARY_IGNORABLE,       0,
    171 0,                                  0 },
    172 { UCOL_LAST_PRIMARY_IGNORABLE,        UCOL_LAST_PRIMARY_IGNORABLE_CONT,
    173 0,                                  0 },
    174 { UCOL_FIRST_SECONDARY_IGNORABLE,     0,
    175 0,                                  0 },
    176 { UCOL_LAST_SECONDARY_IGNORABLE,      0,
    177 0,                                  0 },
    178 { UCOL_FIRST_TERTIARY_IGNORABLE,      0,
    179 0,                                  0 },
    180 { UCOL_LAST_TERTIARY_IGNORABLE,       0,
    181 0,                                  0 },
    182 { UCOL_FIRST_VARIABLE,                0,
    183 0,                                  0 },
    184 { UCOL_LAST_VARIABLE,                 0,
    185 0,                                  0 },
    186 { UCOL_FIRST_NON_VARIABLE,            0,
    187 0,                                  0 },
    188 { UCOL_LAST_NON_VARIABLE,             0,
    189 0,                                  0 },
    190 };
    191 */
    192 
    193 static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
    194 
    195     // Set values for the top - TODO: once we have values for all the indirects, we are going
    196     // to initalize here.
    197     ucolIndirectBoundaries[indexR].startCE = start[0];
    198     ucolIndirectBoundaries[indexR].startContCE = start[1];
    199     if(end) {
    200         ucolIndirectBoundaries[indexR].limitCE = end[0];
    201         ucolIndirectBoundaries[indexR].limitContCE = end[1];
    202     } else {
    203         ucolIndirectBoundaries[indexR].limitCE = 0;
    204         ucolIndirectBoundaries[indexR].limitContCE = 0;
    205     }
    206 }
    207 
    208 
    209 static inline
    210 void syntaxError(const UChar* rules,
    211                  int32_t pos,
    212                  int32_t rulesLen,
    213                  UParseError* parseError)
    214 {
    215     parseError->offset = pos;
    216     parseError->line = 0 ; /* we are not using line numbers */
    217 
    218     // for pre-context
    219     int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
    220     int32_t stop  = pos;
    221 
    222     u_memcpy(parseError->preContext,rules+start,stop-start);
    223     //null terminate the buffer
    224     parseError->preContext[stop-start] = 0;
    225 
    226     //for post-context
    227     start = pos+1;
    228     stop  = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
    229     rulesLen;
    230 
    231     if(start < stop) {
    232         u_memcpy(parseError->postContext,rules+start,stop-start);
    233         //null terminate the buffer
    234         parseError->postContext[stop-start]= 0;
    235     } else {
    236         parseError->postContext[0] = 0;
    237     }
    238 }
    239 
    240 static
    241 void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
    242     switch(attrib) {
    243     case UCOL_HIRAGANA_QUATERNARY_MODE:
    244         opts->hiraganaQ = value;
    245         break;
    246     case UCOL_FRENCH_COLLATION:
    247         opts->frenchCollation = value;
    248         break;
    249     case UCOL_ALTERNATE_HANDLING:
    250         opts->alternateHandling = value;
    251         break;
    252     case UCOL_CASE_FIRST:
    253         opts->caseFirst = value;
    254         break;
    255     case UCOL_CASE_LEVEL:
    256         opts->caseLevel = value;
    257         break;
    258     case UCOL_NORMALIZATION_MODE:
    259         opts->normalizationMode = value;
    260         break;
    261     case UCOL_STRENGTH:
    262         opts->strength = value;
    263         break;
    264     case UCOL_NUMERIC_COLLATION:
    265         opts->numericCollation = value;
    266         break;
    267     case UCOL_ATTRIBUTE_COUNT:
    268     default:
    269         break;
    270     }
    271 }
    272 
    273 #define UTOK_OPTION_COUNT 22
    274 
    275 static UBool didInit = FALSE;
    276 /* we can be strict, or we can be lenient */
    277 /* I'd surely be lenient with the option arguments */
    278 /* maybe even with options */
    279 U_STRING_DECL(suboption_00, "non-ignorable", 13);
    280 U_STRING_DECL(suboption_01, "shifted",        7);
    281 
    282 U_STRING_DECL(suboption_02, "lower",          5);
    283 U_STRING_DECL(suboption_03, "upper",          5);
    284 U_STRING_DECL(suboption_04, "off",            3);
    285 U_STRING_DECL(suboption_05, "on",             2);
    286 U_STRING_DECL(suboption_06, "1",              1);
    287 U_STRING_DECL(suboption_07, "2",              1);
    288 U_STRING_DECL(suboption_08, "3",              1);
    289 U_STRING_DECL(suboption_09, "4",              1);
    290 U_STRING_DECL(suboption_10, "I",              1);
    291 
    292 U_STRING_DECL(suboption_11, "primary",        7);
    293 U_STRING_DECL(suboption_12, "secondary",      9);
    294 U_STRING_DECL(suboption_13, "tertiary",       8);
    295 U_STRING_DECL(suboption_14, "variable",       8);
    296 U_STRING_DECL(suboption_15, "regular",        7);
    297 U_STRING_DECL(suboption_16, "implicit",       8);
    298 U_STRING_DECL(suboption_17, "trailing",       8);
    299 
    300 
    301 U_STRING_DECL(option_00,    "undefined",      9);
    302 U_STRING_DECL(option_01,    "rearrange",      9);
    303 U_STRING_DECL(option_02,    "alternate",      9);
    304 U_STRING_DECL(option_03,    "backwards",      9);
    305 U_STRING_DECL(option_04,    "variable top",  12);
    306 U_STRING_DECL(option_05,    "top",            3);
    307 U_STRING_DECL(option_06,    "normalization", 13);
    308 U_STRING_DECL(option_07,    "caseLevel",      9);
    309 U_STRING_DECL(option_08,    "caseFirst",      9);
    310 U_STRING_DECL(option_09,    "scriptOrder",   11);
    311 U_STRING_DECL(option_10,    "charsetname",   11);
    312 U_STRING_DECL(option_11,    "charset",        7);
    313 U_STRING_DECL(option_12,    "before",         6);
    314 U_STRING_DECL(option_13,    "hiraganaQ",      9);
    315 U_STRING_DECL(option_14,    "strength",       8);
    316 U_STRING_DECL(option_15,    "first",          5);
    317 U_STRING_DECL(option_16,    "last",           4);
    318 U_STRING_DECL(option_17,    "optimize",       8);
    319 U_STRING_DECL(option_18,    "suppressContractions",         20);
    320 U_STRING_DECL(option_19,    "numericOrdering",              15);
    321 U_STRING_DECL(option_20,    "import",         6);
    322 U_STRING_DECL(option_21,    "reorder",         7);
    323 
    324 /*
    325 [last variable] last variable value
    326 [last primary ignorable] largest CE for primary ignorable
    327 [last secondary ignorable] largest CE for secondary ignorable
    328 [last tertiary ignorable] largest CE for tertiary ignorable
    329 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
    330 */
    331 
    332 
    333 static const ucolTokSuboption alternateSub[2] = {
    334     {suboption_00, 13, UCOL_NON_IGNORABLE},
    335     {suboption_01,  7, UCOL_SHIFTED}
    336 };
    337 
    338 static const ucolTokSuboption caseFirstSub[3] = {
    339     {suboption_02, 5, UCOL_LOWER_FIRST},
    340     {suboption_03,  5, UCOL_UPPER_FIRST},
    341     {suboption_04,  3, UCOL_OFF},
    342 };
    343 
    344 static const ucolTokSuboption onOffSub[2] = {
    345     {suboption_04, 3, UCOL_OFF},
    346     {suboption_05, 2, UCOL_ON}
    347 };
    348 
    349 static const ucolTokSuboption frenchSub[1] = {
    350     {suboption_07, 1, UCOL_ON}
    351 };
    352 
    353 static const ucolTokSuboption beforeSub[3] = {
    354     {suboption_06, 1, UCOL_PRIMARY},
    355     {suboption_07, 1, UCOL_SECONDARY},
    356     {suboption_08, 1, UCOL_TERTIARY}
    357 };
    358 
    359 static const ucolTokSuboption strengthSub[5] = {
    360     {suboption_06, 1, UCOL_PRIMARY},
    361     {suboption_07, 1, UCOL_SECONDARY},
    362     {suboption_08, 1, UCOL_TERTIARY},
    363     {suboption_09, 1, UCOL_QUATERNARY},
    364     {suboption_10, 1, UCOL_IDENTICAL},
    365 };
    366 
    367 static const ucolTokSuboption firstLastSub[7] = {
    368     {suboption_11, 7, UCOL_PRIMARY},
    369     {suboption_12, 9, UCOL_PRIMARY},
    370     {suboption_13, 8, UCOL_PRIMARY},
    371     {suboption_14, 8, UCOL_PRIMARY},
    372     {suboption_15, 7, UCOL_PRIMARY},
    373     {suboption_16, 8, UCOL_PRIMARY},
    374     {suboption_17, 8, UCOL_PRIMARY},
    375 };
    376 
    377 enum OptionNumber {
    378     OPTION_ALTERNATE_HANDLING = 0,
    379     OPTION_FRENCH_COLLATION,
    380     OPTION_CASE_LEVEL,
    381     OPTION_CASE_FIRST,
    382     OPTION_NORMALIZATION_MODE,
    383     OPTION_HIRAGANA_QUATERNARY,
    384     OPTION_STRENGTH,
    385     OPTION_NUMERIC_COLLATION,
    386     OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,
    387     OPTION_VARIABLE_TOP,
    388     OPTION_REARRANGE,
    389     OPTION_BEFORE,
    390     OPTION_TOP,
    391     OPTION_FIRST,
    392     OPTION_LAST,
    393     OPTION_OPTIMIZE,
    394     OPTION_SUPPRESS_CONTRACTIONS,
    395     OPTION_UNDEFINED,
    396     OPTION_SCRIPT_ORDER,
    397     OPTION_CHARSET_NAME,
    398     OPTION_CHARSET,
    399     OPTION_IMPORT,
    400     OPTION_SCRIPTREORDER
    401 } ;
    402 
    403 static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
    404     /*00*/ {option_02,  9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
    405     /*01*/ {option_03,  9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards"      */
    406     /*02*/ {option_07,  9, onOffSub, 2, UCOL_CASE_LEVEL},  /*"caseLevel"      */
    407     /*03*/ {option_08,  9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst"   */
    408     /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
    409     /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
    410     /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
    411     /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION},  /*"numericOrdering"*/
    412     /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top"   */
    413     /*09*/ {option_01,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange"      */
    414     /*10*/ {option_12,  6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before"    */
    415     /*11*/ {option_05,  3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top"            */
    416     /*12*/ {option_15,  5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
    417     /*13*/ {option_16,  4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
    418     /*14*/ {option_17,  8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize"      */
    419     /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions"      */
    420     /*16*/ {option_00,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined"      */
    421     /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder"    */
    422     /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname"    */
    423     /*19*/ {option_11,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT},  /*"charset"        */
    424     /*20*/ {option_20,  6, NULL, 0, UCOL_ATTRIBUTE_COUNT},  /*"import"        */
    425     /*21*/ {option_21,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT}  /*"reorder"        */
    426 };
    427 
    428 static
    429 int32_t u_strncmpNoCase(const UChar     *s1,
    430                         const UChar     *s2,
    431                         int32_t     n)
    432 {
    433     if(n > 0) {
    434         int32_t rc;
    435         for(;;) {
    436             rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
    437             if(rc != 0 || *s1 == 0 || --n == 0) {
    438                 return rc;
    439             }
    440             ++s1;
    441             ++s2;
    442         }
    443     }
    444     return 0;
    445 }
    446 
    447 static
    448 void ucol_uprv_tok_initData() {
    449     if(!didInit) {
    450         U_STRING_INIT(suboption_00, "non-ignorable", 13);
    451         U_STRING_INIT(suboption_01, "shifted",        7);
    452 
    453         U_STRING_INIT(suboption_02, "lower",          5);
    454         U_STRING_INIT(suboption_03, "upper",          5);
    455         U_STRING_INIT(suboption_04, "off",            3);
    456         U_STRING_INIT(suboption_05, "on",             2);
    457 
    458         U_STRING_INIT(suboption_06, "1",              1);
    459         U_STRING_INIT(suboption_07, "2",              1);
    460         U_STRING_INIT(suboption_08, "3",              1);
    461         U_STRING_INIT(suboption_09, "4",              1);
    462         U_STRING_INIT(suboption_10, "I",              1);
    463 
    464         U_STRING_INIT(suboption_11, "primary",        7);
    465         U_STRING_INIT(suboption_12, "secondary",      9);
    466         U_STRING_INIT(suboption_13, "tertiary",       8);
    467         U_STRING_INIT(suboption_14, "variable",       8);
    468         U_STRING_INIT(suboption_15, "regular",        7);
    469         U_STRING_INIT(suboption_16, "implicit",       8);
    470         U_STRING_INIT(suboption_17, "trailing",       8);
    471 
    472 
    473         U_STRING_INIT(option_00, "undefined",      9);
    474         U_STRING_INIT(option_01, "rearrange",      9);
    475         U_STRING_INIT(option_02, "alternate",      9);
    476         U_STRING_INIT(option_03, "backwards",      9);
    477         U_STRING_INIT(option_04, "variable top",  12);
    478         U_STRING_INIT(option_05, "top",            3);
    479         U_STRING_INIT(option_06, "normalization", 13);
    480         U_STRING_INIT(option_07, "caseLevel",      9);
    481         U_STRING_INIT(option_08, "caseFirst",      9);
    482         U_STRING_INIT(option_09, "scriptOrder",   11);
    483         U_STRING_INIT(option_10, "charsetname",   11);
    484         U_STRING_INIT(option_11, "charset",        7);
    485         U_STRING_INIT(option_12, "before",         6);
    486         U_STRING_INIT(option_13, "hiraganaQ",      9);
    487         U_STRING_INIT(option_14, "strength",       8);
    488         U_STRING_INIT(option_15, "first",          5);
    489         U_STRING_INIT(option_16, "last",           4);
    490         U_STRING_INIT(option_17, "optimize",       8);
    491         U_STRING_INIT(option_18, "suppressContractions",         20);
    492         U_STRING_INIT(option_19, "numericOrdering",      15);
    493         U_STRING_INIT(option_20, "import ",        6);
    494         U_STRING_INIT(option_21, "reorder",        7);
    495         didInit = TRUE;
    496     }
    497 }
    498 
    499 
    500 // This function reads basic options to set in the runtime collator
    501 // used by data driven tests. Should not support build time options
    502 U_CAPI const UChar * U_EXPORT2
    503 ucol_tok_getNextArgument(const UChar *start, const UChar *end,
    504                          UColAttribute *attrib, UColAttributeValue *value,
    505                          UErrorCode *status)
    506 {
    507     uint32_t i = 0;
    508     int32_t j=0;
    509     UBool foundOption = FALSE;
    510     const UChar *optionArg = NULL;
    511 
    512     ucol_uprv_tok_initData();
    513 
    514     while(start < end && (u_isWhitespace(*start) || uprv_isRuleWhiteSpace(*start))) { /* eat whitespace */
    515         start++;
    516     }
    517     if(start >= end) {
    518         return NULL;
    519     }
    520     /* skip opening '[' */
    521     if(*start == 0x005b) {
    522         start++;
    523     } else {
    524         *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
    525         return NULL;
    526     }
    527 
    528     while(i < UTOK_OPTION_COUNT) {
    529         if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
    530             foundOption = TRUE;
    531             if(end - start > rulesOptions[i].optionLen) {
    532                 optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
    533                 while(u_isWhitespace(*optionArg) || uprv_isRuleWhiteSpace(*optionArg)) { /* eat whitespace */
    534                     optionArg++;
    535                 }
    536             }
    537             break;
    538         }
    539         i++;
    540     }
    541 
    542     if(!foundOption) {
    543         *status = U_ILLEGAL_ARGUMENT_ERROR;
    544         return NULL;
    545     }
    546 
    547     if(optionArg) {
    548         for(j = 0; j<rulesOptions[i].subSize; j++) {
    549             if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
    550                 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
    551                 *attrib = rulesOptions[i].attr;
    552                 *value = rulesOptions[i].subopts[j].attrVal;
    553                 optionArg += rulesOptions[i].subopts[j].subLen;
    554                 while(u_isWhitespace(*optionArg) || uprv_isRuleWhiteSpace(*optionArg)) { /* eat whitespace */
    555                     optionArg++;
    556                 }
    557                 if(*optionArg == 0x005d) {
    558                     optionArg++;
    559                     return optionArg;
    560                 } else {
    561                     *status = U_ILLEGAL_ARGUMENT_ERROR;
    562                     return NULL;
    563                 }
    564             }
    565         }
    566     }
    567     *status = U_ILLEGAL_ARGUMENT_ERROR;
    568     return NULL;
    569 }
    570 
    571 static
    572 USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
    573     while(*start != 0x005b) { /* advance while we find the first '[' */
    574         start++;
    575     }
    576     // now we need to get a balanced set of '[]'. The problem is that a set can have
    577     // many, and *end point to the first closing '['
    578     int32_t noOpenBraces = 1;
    579     int32_t current = 1; // skip the opening brace
    580     while(start+current < end && noOpenBraces != 0) {
    581         if(start[current] == 0x005b) {
    582             noOpenBraces++;
    583         } else if(start[current] == 0x005D) { // closing brace
    584             noOpenBraces--;
    585         }
    586         current++;
    587     }
    588 
    589     if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
    590         *status = U_ILLEGAL_ARGUMENT_ERROR;
    591         return NULL;
    592     }
    593     return uset_openPattern(start, current, status);
    594 }
    595 
    596 /**
    597  * Reads an option and matches the option name with the predefined options. (Case-insensitive.)
    598  * @param start Pointer to the start UChar.
    599  * @param end Pointer to the last valid pointer beyond which the option will not extend.
    600  * @param optionArg Address of the pointer at which the options start (after the option name)
    601  * @return The index of the option, or -1 if the option is not valid.
    602  */
    603 static
    604 int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
    605     int32_t i = 0;
    606     ucol_uprv_tok_initData();
    607 
    608     while(u_isWhitespace(*start) || uprv_isRuleWhiteSpace(*start)) { /* eat whitespace */
    609         start++;
    610     }
    611     while(i < UTOK_OPTION_COUNT) {
    612         if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
    613             if(end - start > rulesOptions[i].optionLen) {
    614                 *optionArg = start+rulesOptions[i].optionLen; /* End of option name; start of the options */
    615                 while(u_isWhitespace(**optionArg) || uprv_isRuleWhiteSpace(**optionArg)) { /* eat whitespace */
    616                     (*optionArg)++;
    617                 }
    618             }
    619             break;
    620         }
    621         i++;
    622     }
    623     if(i == UTOK_OPTION_COUNT) {
    624         i = -1; // didn't find an option
    625     }
    626     return i;
    627 }
    628 
    629 
    630 static
    631 void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status) {
    632     int32_t codeCount = 0;
    633     int32_t codeIndex = 0;
    634     char conversion[64];
    635     int32_t tokenLength = 0;
    636     const UChar* space;
    637 
    638     const UChar* current = src->current;
    639     const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current);
    640 
    641     // eat leading whitespace
    642     while(current < end && u_isWhitespace(*current)) {
    643         current++;
    644     }
    645 
    646     while(current < end) {
    647         space = u_memchr(current, 0x0020, end - current);
    648         space = space == 0 ? end : space;
    649         tokenLength = space - current;
    650         if (tokenLength < 4) {
    651             *status = U_INVALID_FORMAT_ERROR;
    652             return;
    653         }
    654         codeCount++;
    655         current += tokenLength;
    656         while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
    657             ++current;
    658         }
    659     }
    660 
    661     if (codeCount == 0) {
    662         *status = U_INVALID_FORMAT_ERROR;
    663     }
    664 
    665     src->reorderCodesLength = codeCount;
    666     src->reorderCodes = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t));
    667     current = src->current;
    668 
    669     // eat leading whitespace
    670     while(current < end && u_isWhitespace(*current)) {
    671         current++;
    672     }
    673 
    674     while(current < end) {
    675         space = u_memchr(current, 0x0020, end - current);
    676         space = space == 0 ? end : space;
    677         tokenLength = space - current;
    678         if (tokenLength < 4) {
    679             *status = U_ILLEGAL_ARGUMENT_ERROR;
    680             return;
    681         } else {
    682             u_UCharsToChars(current, conversion, tokenLength);
    683             conversion[tokenLength] = '\0';
    684             src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion);
    685             if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
    686                 src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion);
    687             }
    688             if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
    689                 *status = U_ILLEGAL_ARGUMENT_ERROR;
    690             }
    691         }
    692         codeIndex++;
    693         current += tokenLength;
    694         while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
    695             ++current;
    696         }
    697     }
    698 }
    699 
    700 // reads and conforms to various options in rules
    701 // end is the position of the first closing ']'
    702 // However, some of the options take an UnicodeSet definition
    703 // which needs to duplicate the closing ']'
    704 // for example: '[copy [\uAC00-\uD7FF]]'
    705 // These options will move end to the second ']' and the
    706 // caller will set the current to it.
    707 static
    708 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
    709     const UChar* start = src->current;
    710     int32_t i = 0;
    711     int32_t j=0;
    712     const UChar *optionArg = NULL;
    713 
    714     uint8_t result = 0;
    715 
    716     start++; /*skip opening '['*/
    717     i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
    718     if(optionArg) {
    719         src->current = optionArg;
    720     }
    721 
    722     if(i < 0) {
    723         *status = U_ILLEGAL_ARGUMENT_ERROR;
    724     } else {
    725         int32_t noOpenBraces = 1;
    726         switch(i) {
    727     case OPTION_ALTERNATE_HANDLING:
    728     case OPTION_FRENCH_COLLATION:
    729     case OPTION_CASE_LEVEL:
    730     case OPTION_CASE_FIRST:
    731     case OPTION_NORMALIZATION_MODE:
    732     case OPTION_HIRAGANA_QUATERNARY:
    733     case OPTION_STRENGTH:
    734     case OPTION_NUMERIC_COLLATION:
    735         if(optionArg) {
    736             for(j = 0; j<rulesOptions[i].subSize; j++) {
    737                 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
    738                     ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
    739                     result =  UCOL_TOK_SUCCESS;
    740                 }
    741             }
    742         }
    743         if(result == 0) {
    744             *status = U_ILLEGAL_ARGUMENT_ERROR;
    745         }
    746         break;
    747     case OPTION_VARIABLE_TOP:
    748         result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
    749         break;
    750     case OPTION_REARRANGE:
    751         result = UCOL_TOK_SUCCESS;
    752         break;
    753     case OPTION_BEFORE:
    754         if(optionArg) {
    755             for(j = 0; j<rulesOptions[i].subSize; j++) {
    756                 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
    757                     result = UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attrVal + 1);
    758                 }
    759             }
    760         }
    761         if(result == 0) {
    762             *status = U_ILLEGAL_ARGUMENT_ERROR;
    763         }
    764         break;
    765     case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
    766         /* index to this array will be src->parsedToken.indirectIndex*/
    767         src->parsedToken.indirectIndex = 0;
    768         result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
    769         break;
    770     case OPTION_FIRST:
    771     case OPTION_LAST: /* first, last */
    772         for(j = 0; j<rulesOptions[i].subSize; j++) {
    773             if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
    774                 // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
    775                 // element of indirect boundaries is reserved for top.
    776                 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
    777                 result =  UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
    778             }
    779         }
    780         if(result == 0) {
    781             *status = U_ILLEGAL_ARGUMENT_ERROR;
    782         }
    783         break;
    784     case OPTION_OPTIMIZE:
    785     case OPTION_SUPPRESS_CONTRACTIONS:  // copy and remove are handled before normalization
    786         // we need to move end here
    787         src->current++; // skip opening brace
    788         while(src->current < src->end && noOpenBraces != 0) {
    789             if(*src->current == 0x005b) {
    790                 noOpenBraces++;
    791             } else if(*src->current == 0x005D) { // closing brace
    792                 noOpenBraces--;
    793             }
    794             src->current++;
    795         }
    796         result = UCOL_TOK_SUCCESS;
    797         break;
    798     case OPTION_SCRIPTREORDER:
    799         ucol_tok_parseScriptReorder(src, status);
    800         break;
    801     default:
    802         *status = U_UNSUPPORTED_ERROR;
    803         break;
    804         }
    805     }
    806     src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current));
    807     return result;
    808 }
    809 
    810 
    811 inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
    812     if (stuff == NULL || len <= 0) {
    813         return;
    814     }
    815     UnicodeString tempStuff(FALSE, stuff, len);
    816     if(src->extraCurrent+len >= src->extraEnd) {
    817         /* reallocate */
    818         if (stuff >= src->source && stuff <= src->end) {
    819             // Copy the "stuff" contents into tempStuff's own buffer.
    820             // UnicodeString is copy-on-write.
    821             if (len > 0) {
    822                 tempStuff.setCharAt(0, tempStuff[0]);
    823             } else {
    824                 tempStuff.remove();
    825             }
    826         }
    827         UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));
    828         if(newSrc != NULL) {
    829             src->current = newSrc + (src->current - src->source);
    830             src->extraCurrent = newSrc + (src->extraCurrent - src->source);
    831             src->end = newSrc + (src->end - src->source);
    832             src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
    833             src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
    834             src->source = newSrc;
    835         } else {
    836             *status = U_MEMORY_ALLOCATION_ERROR;
    837             return;
    838         }
    839     }
    840     if(len == 1) {
    841         *src->extraCurrent++ = tempStuff[0];
    842     } else {
    843         u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len);
    844         src->extraCurrent += len;
    845     }
    846 }
    847 
    848 inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
    849     /*
    850     top = TRUE;
    851     */
    852     UChar buff[5];
    853     src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
    854     buff[0] = 0xFFFE;
    855     buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
    856     buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
    857     if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
    858         src->parsedToken.charsLen = 3;
    859         ucol_tok_addToExtraCurrent(src, buff, 3, status);
    860     } else {
    861         buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
    862         buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
    863         src->parsedToken.charsLen = 5;
    864         ucol_tok_addToExtraCurrent(src, buff, 5, status);
    865     }
    866     return TRUE;
    867 }
    868 
    869 static UBool isCharNewLine(UChar c){
    870     switch(c){
    871     case 0x000A: /* LF  */
    872     case 0x000D: /* CR  */
    873     case 0x000C: /* FF  */
    874     case 0x0085: /* NEL */
    875     case 0x2028: /* LS  */
    876     case 0x2029: /* PS  */
    877         return TRUE;
    878     default:
    879         return FALSE;
    880     }
    881 }
    882 
    883 /*
    884  * This function is called several times when a range is processed.  Each time, the next code point
    885  * is processed.
    886  * The following variables must be set before calling this function:
    887  *   src->currentRangeCp:  The current code point to process.
    888  *   src->lastRangeCp: The last code point in the range.
    889  * Pre-requisite: src->currentRangeCp <= src->lastRangeCp.
    890  */
    891 static const UChar*
    892 ucol_tok_processNextCodePointInRange(UColTokenParser *src,
    893                                      UErrorCode *status)
    894 {
    895   // Append current code point to source
    896   UChar buff[U16_MAX_LENGTH];
    897   uint32_t i = 0;
    898 
    899   uint32_t nChars = U16_LENGTH(src->currentRangeCp);
    900   src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
    901   src->parsedToken.charsLen = nChars;
    902 
    903   U16_APPEND_UNSAFE(buff, i, src->currentRangeCp);
    904   ucol_tok_addToExtraCurrent(src, buff, nChars, status);
    905 
    906   ++src->currentRangeCp;
    907   if (src->currentRangeCp > src->lastRangeCp) {
    908     src->inRange = FALSE;
    909 
    910     if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
    911       src->isStarred = FALSE;
    912     }
    913   } else {
    914     src->previousCp = src->currentRangeCp;
    915   }
    916   return src->current;
    917 }
    918 
    919 /*
    920  * This function is called several times when a starred list is processed.  Each time, the next code point
    921  * in the list is processed.
    922  * The following variables must be set before calling this function:
    923  *   src->currentStarredCharIndex:  Index (in src->source) of the first char of the current code point.
    924  *   src->lastStarredCharIndex: Index to the last character in the list.
    925  * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex.
    926  */
    927 static const UChar*
    928 ucol_tok_processNextTokenInStarredList(UColTokenParser *src)
    929 {
    930   // Extract the characters corresponding to the next code point.
    931   UChar32 cp;
    932   src->parsedToken.charsOffset = src->currentStarredCharIndex;
    933   int32_t prev = src->currentStarredCharIndex;
    934   U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src->source), cp);
    935   src->parsedToken.charsLen = src->currentStarredCharIndex - prev;
    936 
    937   // When we are done parsing the starred string, turn the flag off so that
    938   // the normal processing is restored.
    939   if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
    940     src->isStarred = FALSE;
    941   }
    942   src->previousCp = cp;
    943   return src->current;
    944 }
    945 
    946 /*
    947  * Partially parses the next token, keeps the indices in src->parsedToken, and updates the counters.
    948  *
    949  * This routine parses and separates almost all tokens. The following are the syntax characters recognized.
    950  *  # : Comment character
    951  *  & : Reset operator
    952  *  = : Equality
    953  *  < : Primary collation
    954  *  << : Secondary collation
    955  *  <<< : Tertiary collation
    956  *  ; : Secondary collation
    957  *  , : Tertiary collation
    958  *  / : Expansions
    959  *  | : Prefix
    960  *  - : Range
    961 
    962  *  ! : Java Thai modifier, ignored
    963  *  @ : French only
    964 
    965  * [] : Options
    966  * '' : Quotes
    967  *
    968  *  Along with operators =, <, <<, <<<, the operator * is supported to indicate a list.  For example, &a<*bcdexyz
    969  *  is equivalent to &a<b<c<d<e<x<y<z.  In lists, ranges also can be given, so &a*b-ex-z is equivalent to the above.
    970  *  This function do not separate the tokens in a list.  Instead, &a<*b-ex-z is parsed as three tokens - "&a",
    971  *  "<*b", "-ex", "-z".  The strength (< in this case), whether in a list, whether in a range and the previous
    972  *  character returned as cached so that the calling program can do further splitting.
    973  */
    974 static const UChar*
    975 ucol_tok_parseNextTokenInternal(UColTokenParser *src,
    976                                 UBool startOfRules,
    977                                 UParseError *parseError,
    978                                 UErrorCode *status)
    979 {
    980     UBool variableTop = FALSE;
    981     UBool top = FALSE;
    982     UBool inChars = TRUE;
    983     UBool inQuote = FALSE;
    984     UBool wasInQuote = FALSE;
    985     uint8_t before = 0;
    986     UBool isEscaped = FALSE;
    987 
    988     // TODO: replace these variables with src->parsedToken counterparts
    989     // no need to use them anymore since we have src->parsedToken.
    990     // Ideally, token parser would be a nice class... Once, when I have
    991     // more time (around 2020 probably).
    992     uint32_t newExtensionLen = 0;
    993     uint32_t extensionOffset = 0;
    994     uint32_t newStrength = UCOL_TOK_UNSET;
    995     UChar buff[10];
    996 
    997     src->parsedToken.charsOffset = 0;  src->parsedToken.charsLen = 0;
    998     src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
    999     src->parsedToken.indirectIndex = 0;
   1000 
   1001     while (src->current < src->end) {
   1002         UChar ch = *(src->current);
   1003 
   1004         if (inQuote) {
   1005             if (ch == 0x0027/*'\''*/) {
   1006                 inQuote = FALSE;
   1007             } else {
   1008                 if ((src->parsedToken.charsLen == 0) || inChars) {
   1009                     if(src->parsedToken.charsLen == 0) {
   1010                         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
   1011                     }
   1012                     src->parsedToken.charsLen++;
   1013                 } else {
   1014                     if(newExtensionLen == 0) {
   1015                         extensionOffset = (uint32_t)(src->extraCurrent - src->source);
   1016                     }
   1017                     newExtensionLen++;
   1018                 }
   1019             }
   1020         }else if(isEscaped){
   1021             isEscaped =FALSE;
   1022             if (newStrength == UCOL_TOK_UNSET) {
   1023                 *status = U_INVALID_FORMAT_ERROR;
   1024                 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1025                 DBG_FORMAT_ERROR
   1026                 return NULL;
   1027                 // enabling rules to start with non-tokens a < b
   1028                 // newStrength = UCOL_TOK_RESET;
   1029             }
   1030             if(ch != 0x0000  && src->current != src->end) {
   1031                 if (inChars) {
   1032                     if(src->parsedToken.charsLen == 0) {
   1033                         src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
   1034                     }
   1035                     src->parsedToken.charsLen++;
   1036                 } else {
   1037                     if(newExtensionLen == 0) {
   1038                         extensionOffset = (uint32_t)(src->current - src->source);
   1039                     }
   1040                     newExtensionLen++;
   1041                 }
   1042             }
   1043         }else {
   1044             if(!uprv_isRuleWhiteSpace(ch)) {
   1045                 /* Sets the strength for this entry */
   1046                 switch (ch) {
   1047                 case 0x003D/*'='*/ :
   1048                     if (newStrength != UCOL_TOK_UNSET) {
   1049                         goto EndOfLoop;
   1050                     }
   1051 
   1052                     /* if we start with strength, we'll reset to top */
   1053                     if(startOfRules == TRUE) {
   1054                         src->parsedToken.indirectIndex = 5;
   1055                         top = ucol_tok_doSetTop(src, status);
   1056                         newStrength = UCOL_TOK_RESET;
   1057                         goto EndOfLoop;
   1058                     }
   1059                     newStrength = UCOL_IDENTICAL;
   1060                     if(*(src->current+1) == 0x002A) {/*'*'*/
   1061                         src->current++;
   1062                         src->isStarred = TRUE;
   1063                     }
   1064                     break;
   1065 
   1066                 case 0x002C/*','*/:
   1067                     if (newStrength != UCOL_TOK_UNSET) {
   1068                         goto EndOfLoop;
   1069                     }
   1070 
   1071                     /* if we start with strength, we'll reset to top */
   1072                     if(startOfRules == TRUE) {
   1073                         src->parsedToken.indirectIndex = 5;
   1074                         top = ucol_tok_doSetTop(src, status);
   1075                         newStrength = UCOL_TOK_RESET;
   1076                         goto EndOfLoop;
   1077                     }
   1078                     newStrength = UCOL_TERTIARY;
   1079                     break;
   1080 
   1081                 case  0x003B/*';'*/:
   1082                     if (newStrength != UCOL_TOK_UNSET) {
   1083                         goto EndOfLoop;
   1084                     }
   1085 
   1086                     /* if we start with strength, we'll reset to top */
   1087                     if(startOfRules == TRUE) {
   1088                         src->parsedToken.indirectIndex = 5;
   1089                         top = ucol_tok_doSetTop(src, status);
   1090                         newStrength = UCOL_TOK_RESET;
   1091                         goto EndOfLoop;
   1092                     }
   1093                     newStrength = UCOL_SECONDARY;
   1094                     break;
   1095 
   1096                 case 0x003C/*'<'*/:
   1097                     if (newStrength != UCOL_TOK_UNSET) {
   1098                         goto EndOfLoop;
   1099                     }
   1100 
   1101                     /* if we start with strength, we'll reset to top */
   1102                     if(startOfRules == TRUE) {
   1103                         src->parsedToken.indirectIndex = 5;
   1104                         top = ucol_tok_doSetTop(src, status);
   1105                         newStrength = UCOL_TOK_RESET;
   1106                         goto EndOfLoop;
   1107                     }
   1108                     /* before this, do a scan to verify whether this is */
   1109                     /* another strength */
   1110                     if(*(src->current+1) == 0x003C) {
   1111                         src->current++;
   1112                         if(*(src->current+1) == 0x003C) {
   1113                             src->current++; /* three in a row! */
   1114                             newStrength = UCOL_TERTIARY;
   1115                         } else { /* two in a row */
   1116                             newStrength = UCOL_SECONDARY;
   1117                         }
   1118                     } else { /* just one */
   1119                         newStrength = UCOL_PRIMARY;
   1120                     }
   1121                     if(*(src->current+1) == 0x002A) {/*'*'*/
   1122                         src->current++;
   1123                         src->isStarred = TRUE;
   1124                     }
   1125                     break;
   1126 
   1127                 case 0x0026/*'&'*/:
   1128                     if (newStrength != UCOL_TOK_UNSET) {
   1129                         /**/
   1130                         goto EndOfLoop;
   1131                     }
   1132 
   1133                     newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
   1134                     break;
   1135 
   1136                 case 0x005b/*'['*/:
   1137                     /* options - read an option, analyze it */
   1138                     if(u_strchr(src->current, 0x005d /*']'*/) != NULL) {
   1139                         uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
   1140                         if(U_SUCCESS(*status)) {
   1141                             if(result & UCOL_TOK_TOP) {
   1142                                 if(newStrength == UCOL_TOK_RESET) {
   1143                                     top = ucol_tok_doSetTop(src, status);
   1144                                     if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
   1145                                         src->parsedToken.charsLen+=2;
   1146                                         buff[0] = 0x002d;
   1147                                         buff[1] = before;
   1148                                         ucol_tok_addToExtraCurrent(src, buff, 2, status);
   1149                                     }
   1150 
   1151                                     src->current++;
   1152                                     goto EndOfLoop;
   1153                                 } else {
   1154                                     *status = U_INVALID_FORMAT_ERROR;
   1155                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1156                                     DBG_FORMAT_ERROR
   1157                                 }
   1158                             } else if(result & UCOL_TOK_VARIABLE_TOP) {
   1159                                 if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
   1160                                     variableTop = TRUE;
   1161                                     src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
   1162                                     src->parsedToken.charsLen = 1;
   1163                                     buff[0] = 0xFFFF;
   1164                                     ucol_tok_addToExtraCurrent(src, buff, 1, status);
   1165                                     src->current++;
   1166                                     goto EndOfLoop;
   1167                                 } else {
   1168                                     *status = U_INVALID_FORMAT_ERROR;
   1169                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1170                                     DBG_FORMAT_ERROR
   1171                                 }
   1172                             } else if (result & UCOL_TOK_BEFORE){
   1173                                 if(newStrength == UCOL_TOK_RESET) {
   1174                                     before = result & UCOL_TOK_BEFORE;
   1175                                 } else {
   1176                                     *status = U_INVALID_FORMAT_ERROR;
   1177                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1178                                     DBG_FORMAT_ERROR
   1179                                 }
   1180                             }
   1181                         } else {
   1182                             *status = U_INVALID_FORMAT_ERROR;
   1183                             syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1184                             DBG_FORMAT_ERROR
   1185                             return NULL;
   1186                         }
   1187                     }
   1188                     break;
   1189                 case 0x0021/*! skip java thai modifier reordering*/:
   1190                     break;
   1191                 case 0x002F/*'/'*/:
   1192                     wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
   1193                     inChars = FALSE; /* we're now processing expansion */
   1194                     break;
   1195                 case 0x005C /* back slash for escaped chars */:
   1196                     isEscaped = TRUE;
   1197                     break;
   1198                     /* found a quote, we're gonna start copying */
   1199                 case 0x0027/*'\''*/:
   1200                     if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
   1201                       *status = U_INVALID_FORMAT_ERROR;
   1202                       syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1203                       DBG_FORMAT_ERROR
   1204                       return NULL;
   1205                       // enabling rules to start with a non-token character a < b
   1206                       // newStrength = UCOL_TOK_RESET;
   1207                     }
   1208 
   1209                     inQuote = TRUE;
   1210 
   1211                     if(inChars) { /* we're doing characters */
   1212                         if(wasInQuote == FALSE) {
   1213                             src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
   1214                         }
   1215                         if (src->parsedToken.charsLen != 0) {
   1216                             ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
   1217                         }
   1218                         src->parsedToken.charsLen++;
   1219                     } else { /* we're doing an expansion */
   1220                         if(wasInQuote == FALSE) {
   1221                             extensionOffset = (uint32_t)(src->extraCurrent - src->source);
   1222                         }
   1223                         if (newExtensionLen != 0) {
   1224                             ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
   1225                         }
   1226                         newExtensionLen++;
   1227                     }
   1228 
   1229                     wasInQuote = TRUE;
   1230 
   1231                     ch = *(++(src->current));
   1232                     if(ch == 0x0027) { /* copy the double quote */
   1233                         ucol_tok_addToExtraCurrent(src, &ch, 1, status);
   1234                         inQuote = FALSE;
   1235                     }
   1236                     break;
   1237 
   1238                     /* '@' is french only if the strength is not currently set */
   1239                     /* if it is, it's just a regular character in collation rules */
   1240                 case 0x0040/*'@'*/:
   1241                     if (newStrength == UCOL_TOK_UNSET) {
   1242                         src->opts->frenchCollation = UCOL_ON;
   1243                         break;
   1244                     }
   1245 
   1246                 case 0x007C /*|*/: /* this means we have actually been reading prefix part */
   1247                     // we want to store read characters to the prefix part and continue reading
   1248                     // the characters (proper way would be to restart reading the chars, but in
   1249                     // that case we would have to complicate the token hasher, which I do not
   1250                     // intend to play with. Instead, we will do prefixes when prefixes are due
   1251                     // (before adding the elements).
   1252                     src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
   1253                     src->parsedToken.prefixLen = src->parsedToken.charsLen;
   1254 
   1255                     if(inChars) { /* we're doing characters */
   1256                         if(wasInQuote == FALSE) {
   1257                             src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
   1258                         }
   1259                         if (src->parsedToken.charsLen != 0) {
   1260                             ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
   1261                         }
   1262                         src->parsedToken.charsLen++;
   1263                     }
   1264 
   1265                     wasInQuote = TRUE;
   1266 
   1267                     do {
   1268                         ch = *(++(src->current));
   1269                         // skip whitespace between '|' and the character
   1270                     } while (uprv_isRuleWhiteSpace(ch));
   1271                     break;
   1272 
   1273                     //charsOffset = 0;
   1274                     //newCharsLen = 0;
   1275                     //break; // We want to store the whole prefix/character sequence. If we break
   1276                     // the '|' is going to get lost.
   1277 
   1278                 case 0x002D /*-*/: /* A range. */
   1279                     if (newStrength != UCOL_TOK_UNSET) {
   1280                       // While processing the pending token, the isStarred field
   1281                       // is reset, so it needs to be saved for the next
   1282                       // invocation.
   1283                       src->savedIsStarred = src->isStarred;
   1284                       goto EndOfLoop;
   1285                    }
   1286                    src->isStarred = src->savedIsStarred;
   1287 
   1288                    // Ranges are valid only in starred tokens.
   1289                    if (!src->isStarred) {
   1290                      *status = U_INVALID_FORMAT_ERROR;
   1291                      syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1292                      DBG_FORMAT_ERROR
   1293                      return NULL;
   1294                    }
   1295                    newStrength = src->parsedToken.strength;
   1296                    src->inRange = TRUE;
   1297                    break;
   1298 
   1299                 case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
   1300                     do {
   1301                         ch = *(++(src->current));
   1302                     } while (!isCharNewLine(ch));
   1303 
   1304                     break;
   1305                 default:
   1306                     if (newStrength == UCOL_TOK_UNSET) {
   1307                       *status = U_INVALID_FORMAT_ERROR;
   1308                       syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1309                       DBG_FORMAT_ERROR
   1310                       return NULL;
   1311                     }
   1312 
   1313                     if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
   1314                         *status = U_INVALID_FORMAT_ERROR;
   1315                         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1316                         DBG_FORMAT_ERROR
   1317                         return NULL;
   1318                     }
   1319 
   1320                     if(ch == 0x0000 && src->current+1 == src->end) {
   1321                         break;
   1322                     }
   1323 
   1324                     if (inChars) {
   1325                         if(src->parsedToken.charsLen == 0) {
   1326                             src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
   1327                         }
   1328                         src->parsedToken.charsLen++;
   1329                     } else {
   1330                         if(newExtensionLen == 0) {
   1331                             extensionOffset = (uint32_t)(src->current - src->source);
   1332                         }
   1333                         newExtensionLen++;
   1334                     }
   1335 
   1336                     break;
   1337                 }
   1338             }
   1339         }
   1340 
   1341         if(wasInQuote) {
   1342             if(ch != 0x27) {
   1343                 if(inQuote || !uprv_isRuleWhiteSpace(ch)) {
   1344                     ucol_tok_addToExtraCurrent(src, &ch, 1, status);
   1345                 }
   1346             }
   1347         }
   1348 
   1349         src->current++;
   1350     }
   1351 
   1352 EndOfLoop:
   1353     wasInQuote = FALSE;
   1354     if (newStrength == UCOL_TOK_UNSET) {
   1355         return NULL;
   1356     }
   1357 
   1358     if (src->parsedToken.charsLen == 0 && top == FALSE) {
   1359         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1360         *status = U_INVALID_FORMAT_ERROR;
   1361         DBG_FORMAT_ERROR
   1362         return NULL;
   1363     }
   1364 
   1365     src->parsedToken.strength = newStrength;
   1366     src->parsedToken.extensionOffset = extensionOffset;
   1367     src->parsedToken.extensionLen = newExtensionLen;
   1368     src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
   1369 
   1370     return src->current;
   1371 }
   1372 
   1373 /*
   1374  * Parses the next token, keeps the indices in src->parsedToken, and updates the counters.
   1375  * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported.
   1376  *
   1377  * In addition to what ucol_tok_parseNextTokenInternal() does, this function does the following:
   1378  *  1) ucol_tok_parseNextTokenInternal() returns a range as a single token.  This function separates
   1379  *     it to separate tokens and returns one by one.  In order to do that, the necessary states are
   1380  *     cached as member variables of the token parser.
   1381  *  2) When encountering a range, ucol_tok_parseNextTokenInternal() processes characters up to the
   1382  *     starting character as a single list token (which is separated into individual characters here)
   1383  *     and as another list token starting with the last character in the range.  Before expanding it
   1384  *     as a list of tokens, this function expands the range by filling the intermediate characters and
   1385  *     returns them one by one as separate tokens.
   1386  * Necessary checks are done for invalid combinations.
   1387  */
   1388 U_CAPI const UChar* U_EXPORT2
   1389 ucol_tok_parseNextToken(UColTokenParser *src,
   1390                         UBool startOfRules,
   1391                         UParseError *parseError,
   1392                         UErrorCode *status)
   1393 {
   1394   const UChar *nextToken;
   1395 
   1396   if (src->inRange) {
   1397     // We are not done processing a range.  Continue it.
   1398     return ucol_tok_processNextCodePointInRange(src, status);
   1399   } else if (src->isStarred) {
   1400     // We are not done processing a starred token.  Continue it.
   1401     return ucol_tok_processNextTokenInStarredList(src);
   1402   }
   1403 
   1404   // Get the next token.
   1405   nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, status);
   1406 
   1407   if (nextToken == NULL) {
   1408     return NULL;
   1409   }
   1410 
   1411   if (src->inRange) {
   1412     // A new range has started.
   1413     // Check whether it is a chain of ranges with more than one hyphen.
   1414     if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) {
   1415         *status = U_INVALID_FORMAT_ERROR;
   1416         syntaxError(src->source,src->parsedToken.charsOffset-1,
   1417                     src->parsedToken.charsOffset+src->parsedToken.charsLen, parseError);
   1418         DBG_FORMAT_ERROR
   1419         return NULL;
   1420     }
   1421 
   1422     // The current token indicates the second code point of the range.
   1423     // Process just that, and then proceed with the star.
   1424     src->currentStarredCharIndex = src->parsedToken.charsOffset;
   1425     U16_NEXT(src->source, src->currentStarredCharIndex,
   1426              (uint32_t)(src->end - src->source), src->lastRangeCp);
   1427     if (src->lastRangeCp <= src->previousCp) {
   1428         *status = U_INVALID_FORMAT_ERROR;
   1429         syntaxError(src->source,src->parsedToken.charsOffset-1,
   1430                     src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
   1431         DBG_FORMAT_ERROR
   1432         return NULL;
   1433     }
   1434 
   1435     // Set current range code point to process the range loop
   1436     src->currentRangeCp = src->previousCp + 1;
   1437 
   1438     src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
   1439 
   1440     return ucol_tok_processNextCodePointInRange(src, status);
   1441  } else if (src->isStarred) {
   1442     // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that
   1443     // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be
   1444     // separated into several tokens and returned.
   1445     src->currentStarredCharIndex = src->parsedToken.charsOffset;
   1446     src->lastStarredCharIndex =  src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
   1447 
   1448     return ucol_tok_processNextTokenInStarredList(src);
   1449   } else {
   1450     // Set previous codepoint
   1451     U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end - src->source), src->previousCp);
   1452   }
   1453   return nextToken;
   1454 }
   1455 
   1456 
   1457 /*
   1458 Processing Description
   1459 1 Build a ListList. Each list has a header, which contains two lists (positive
   1460 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
   1461 reset may be null.
   1462 2 As you process, you keep a LAST pointer that points to the last token you
   1463 handled.
   1464 
   1465 */
   1466 
   1467 static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext,
   1468                                       UParseError *parseError, UErrorCode *status)
   1469 {
   1470     if(src->resultLen == src->listCapacity) {
   1471         // Unfortunately, this won't work, as we store addresses of lhs in token
   1472         src->listCapacity *= 2;
   1473         src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
   1474         if(src->lh == NULL) {
   1475             *status = U_MEMORY_ALLOCATION_ERROR;
   1476             return NULL;
   1477         }
   1478     }
   1479     /* do the reset thing */
   1480     UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
   1481     /* test for NULL */
   1482     if (sourceToken == NULL) {
   1483         *status = U_MEMORY_ALLOCATION_ERROR;
   1484         return NULL;
   1485     }
   1486     sourceToken->rulesToParseHdl = &(src->source);
   1487     sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
   1488     sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
   1489 
   1490     sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
   1491     sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
   1492 
   1493     // keep the flags around so that we know about before
   1494     sourceToken->flags = src->parsedToken.flags;
   1495 
   1496     if(src->parsedToken.prefixOffset != 0) {
   1497         // this is a syntax error
   1498         *status = U_INVALID_FORMAT_ERROR;
   1499         syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
   1500         DBG_FORMAT_ERROR
   1501         uprv_free(sourceToken);
   1502         return 0;
   1503     } else {
   1504         sourceToken->prefix = 0;
   1505     }
   1506 
   1507     sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
   1508     sourceToken->strength = UCOL_TOK_RESET;
   1509     sourceToken->next = NULL;
   1510     sourceToken->previous = NULL;
   1511     sourceToken->noOfCEs = 0;
   1512     sourceToken->noOfExpCEs = 0;
   1513     sourceToken->listHeader = &src->lh[src->resultLen];
   1514 
   1515     src->lh[src->resultLen].first = NULL;
   1516     src->lh[src->resultLen].last = NULL;
   1517     src->lh[src->resultLen].first = NULL;
   1518     src->lh[src->resultLen].last = NULL;
   1519 
   1520     src->lh[src->resultLen].reset = sourceToken;
   1521 
   1522     /*
   1523     3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
   1524     First convert all expansions into normal form. Examples:
   1525     If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
   1526     d * ... into &x * c/y * d * ...
   1527     Note: reset values can never have expansions, although they can cause the
   1528     very next item to have one. They may be contractions, if they are found
   1529     earlier in the list.
   1530     */
   1531     *expandNext = 0;
   1532     if(expand != NULL) {
   1533         /* check to see if there is an expansion */
   1534         if(src->parsedToken.charsLen > 1) {
   1535             uint32_t resetCharsOffset;
   1536             resetCharsOffset = (uint32_t)(expand - src->source);
   1537             sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
   1538             *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
   1539         }
   1540     }
   1541 
   1542     src->resultLen++;
   1543 
   1544     uhash_put(src->tailored, sourceToken, sourceToken, status);
   1545 
   1546     return sourceToken;
   1547 }
   1548 
   1549 static
   1550 inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
   1551     if(U_FAILURE(*status)) {
   1552         return NULL;
   1553     }
   1554     /* this is a virgin before - we need to fish the anchor from the UCA */
   1555     collIterate s;
   1556     uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
   1557     uint32_t CE, SecondCE;
   1558     uint32_t invPos;
   1559     if(sourceToken != NULL) {
   1560         uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status);
   1561     } else {
   1562         uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status);
   1563     }
   1564     if(U_FAILURE(*status)) {
   1565         return NULL;
   1566     }
   1567 
   1568     baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
   1569     baseContCE = ucol_getNextCE(src->UCA, &s, status);
   1570     if(baseContCE == UCOL_NO_MORE_CES) {
   1571         baseContCE = 0;
   1572     }
   1573 
   1574 
   1575     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
   1576     uint32_t ch = 0;
   1577     uint32_t expandNext = 0;
   1578     UColToken key;
   1579 
   1580     if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
   1581         uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
   1582         uint32_t raw = uprv_uca_getRawFromImplicit(primary);
   1583         ch = uprv_uca_getCodePointFromRaw(raw-1);
   1584         uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
   1585         CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
   1586         SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
   1587 
   1588         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
   1589         *src->extraCurrent++ = 0xFFFE;
   1590         *src->extraCurrent++ = (UChar)ch;
   1591         src->parsedToken.charsLen++;
   1592 
   1593         key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
   1594         key.rulesToParseHdl = &(src->source);
   1595 
   1596         //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
   1597         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
   1598 
   1599         if(sourceToken == NULL) {
   1600             src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
   1601             if(isContinuation(SecondCE)) {
   1602                 src->lh[src->resultLen].baseContCE = SecondCE;
   1603             } else {
   1604                 src->lh[src->resultLen].baseContCE = 0;
   1605             }
   1606             src->lh[src->resultLen].nextCE = 0;
   1607             src->lh[src->resultLen].nextContCE = 0;
   1608             src->lh[src->resultLen].previousCE = 0;
   1609             src->lh[src->resultLen].previousContCE = 0;
   1610 
   1611             src->lh[src->resultLen].indirect = FALSE;
   1612 
   1613             sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
   1614         }
   1615 
   1616     } else {
   1617         invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
   1618 
   1619         // we got the previous CE. Now we need to see if the difference between
   1620         // the two CEs is really of the requested strength.
   1621         // if it's a bigger difference (we asked for secondary and got primary), we
   1622         // need to modify the CE.
   1623         if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
   1624             // adjust the strength
   1625             // now we are in the situation where our baseCE should actually be modified in
   1626             // order to get the CE in the right position.
   1627             if(strength == UCOL_SECONDARY) {
   1628                 CE = baseCE - 0x0200;
   1629             } else { // strength == UCOL_TERTIARY
   1630                 CE = baseCE - 0x02;
   1631             }
   1632             if(baseContCE) {
   1633                 if(strength == UCOL_SECONDARY) {
   1634                     SecondCE = baseContCE - 0x0200;
   1635                 } else { // strength == UCOL_TERTIARY
   1636                     SecondCE = baseContCE - 0x02;
   1637                 }
   1638             }
   1639         }
   1640 
   1641 #if 0
   1642         // the code below relies on getting a code point from the inverse table, in order to be
   1643         // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
   1644         // 1. There are many code points that have the same CE
   1645         // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
   1646         // Also, in case when there is no equivalent strength before an element, we have to actually
   1647         // construct one. For example, &[before 2]a << x won't result in x << a, because the element
   1648         // before a is a primary difference.
   1649 
   1650         //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
   1651 
   1652 
   1653         ch = CETable[3*invPos+2];
   1654 
   1655         if((ch &  UCOL_INV_SIZEMASK) != 0) {
   1656             uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
   1657             uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
   1658             ch = conts[offset];
   1659         }
   1660 
   1661         *src->extraCurrent++ = (UChar)ch;
   1662         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
   1663         src->parsedToken.charsLen = 1;
   1664 
   1665         // We got an UCA before. However, this might have been tailored.
   1666         // example:
   1667         // &\u30ca = \u306a
   1668         // &[before 3]\u306a<<<\u306a|\u309d
   1669 
   1670 
   1671         // uint32_t key = (*newCharsLen << 24) | *charsOffset;
   1672         key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
   1673         key.rulesToParseHdl = &(src->source);
   1674 
   1675         //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
   1676         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
   1677 #endif
   1678 
   1679         // here is how it should be. The situation such as &[before 1]a < x, should be
   1680         // resolved exactly as if we wrote &a > x.
   1681         // therefore, I don't really care if the UCA value before a has been changed.
   1682         // However, I do care if the strength between my element and the previous element
   1683         // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
   1684         // have to construct the base CE.
   1685 
   1686 
   1687 
   1688         // if we found a tailored thing, we have to use the UCA value and construct
   1689         // a new reset token with constructed name
   1690         //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
   1691         // character to which we want to anchor is already tailored.
   1692         // We need to construct a new token which will be the anchor
   1693         // point
   1694         //*(src->extraCurrent-1) = 0xFFFE;
   1695         //*src->extraCurrent++ = (UChar)ch;
   1696         // grab before
   1697         src->parsedToken.charsOffset -= 10;
   1698         src->parsedToken.charsLen += 10;
   1699         src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
   1700         if(isContinuation(SecondCE)) {
   1701             src->lh[src->resultLen].baseContCE = SecondCE;
   1702         } else {
   1703             src->lh[src->resultLen].baseContCE = 0;
   1704         }
   1705         src->lh[src->resultLen].nextCE = 0;
   1706         src->lh[src->resultLen].nextContCE = 0;
   1707         src->lh[src->resultLen].previousCE = 0;
   1708         src->lh[src->resultLen].previousContCE = 0;
   1709 
   1710         src->lh[src->resultLen].indirect = FALSE;
   1711 
   1712         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
   1713         //}
   1714     }
   1715 
   1716     return sourceToken;
   1717 
   1718 }
   1719 
   1720 uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
   1721     UColToken *lastToken = NULL;
   1722     const UChar *parseEnd = NULL;
   1723     uint32_t expandNext = 0;
   1724     UBool variableTop = FALSE;
   1725     UBool top = FALSE;
   1726     uint16_t specs = 0;
   1727     UColTokListHeader *ListList = NULL;
   1728 
   1729     src->parsedToken.strength = UCOL_TOK_UNSET;
   1730 
   1731     ListList = src->lh;
   1732 
   1733     if(U_FAILURE(*status)) {
   1734         return 0;
   1735     }
   1736 #ifdef DEBUG_FOR_CODE_POINTS
   1737     char filename[35];
   1738     sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid());
   1739     dfcp_fp = fopen(filename, "a");
   1740     fprintf(stdout, "Output is in the file %s.\n", filename);
   1741 #endif
   1742 
   1743 #ifdef DEBUG_FOR_COLL_RULES
   1744     std::string s3;
   1745     UnicodeString(src->source).toUTF8String(s3);
   1746     std::cout << "src->source = " << s3 << std::endl;
   1747 #endif
   1748 
   1749     while(src->current < src->end || src->isStarred) {
   1750         src->parsedToken.prefixOffset = 0;
   1751 
   1752         parseEnd = ucol_tok_parseNextToken(src,
   1753             (UBool)(lastToken == NULL),
   1754             parseError,
   1755             status);
   1756 
   1757         specs = src->parsedToken.flags;
   1758 
   1759 
   1760         variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
   1761         top = ((specs & UCOL_TOK_TOP) != 0);
   1762 
   1763         if(U_SUCCESS(*status) && parseEnd != NULL) {
   1764             UColToken *sourceToken = NULL;
   1765             //uint32_t key = 0;
   1766             uint32_t lastStrength = UCOL_TOK_UNSET;
   1767 
   1768             if(lastToken != NULL ) {
   1769                 lastStrength = lastToken->strength;
   1770             }
   1771 
   1772 #ifdef DEBUG_FOR_CODE_POINTS
   1773             UChar32 cp;
   1774             U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->extraEnd - src->source), cp);
   1775             fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsedToken.strength);
   1776 #endif
   1777             //key = newCharsLen << 24 | charsOffset;
   1778             UColToken key;
   1779             key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
   1780             key.rulesToParseHdl = &(src->source);
   1781 
   1782             /*  4 Lookup each source in the CharsToToken map, and find a sourceToken */
   1783             sourceToken = (UColToken *)uhash_get(src->tailored, &key);
   1784 
   1785             if(src->parsedToken.strength != UCOL_TOK_RESET) {
   1786                 if(lastToken == NULL) { /* this means that rules haven't started properly */
   1787                     *status = U_INVALID_FORMAT_ERROR;
   1788                     syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
   1789                     DBG_FORMAT_ERROR
   1790                     return 0;
   1791                 }
   1792                 /*  6 Otherwise (when relation != reset) */
   1793                 if(sourceToken == NULL) {
   1794                     /* If sourceToken is null, create new one, */
   1795                     sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
   1796                     /* test for NULL */
   1797                     if (sourceToken == NULL) {
   1798                         *status = U_MEMORY_ALLOCATION_ERROR;
   1799                         return 0;
   1800                     }
   1801                     sourceToken->rulesToParseHdl = &(src->source);
   1802                     sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
   1803 
   1804                     sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
   1805 
   1806                     sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
   1807                     sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
   1808 
   1809                     sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
   1810                     sourceToken->next = NULL;
   1811                     sourceToken->previous = NULL;
   1812                     sourceToken->noOfCEs = 0;
   1813                     sourceToken->noOfExpCEs = 0;
   1814                     // keep the flags around so that we know about before
   1815                     sourceToken->flags = src->parsedToken.flags;
   1816                     uhash_put(src->tailored, sourceToken, sourceToken, status);
   1817                     if(U_FAILURE(*status)) {
   1818                         return 0;
   1819                     }
   1820                 } else {
   1821                     /* we could have fished out a reset here */
   1822                     if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
   1823                         /* otherwise remove sourceToken from where it was. */
   1824                         if(sourceToken->next != NULL) {
   1825                             if(sourceToken->next->strength > sourceToken->strength) {
   1826                                 sourceToken->next->strength = sourceToken->strength;
   1827                             }
   1828                             sourceToken->next->previous = sourceToken->previous;
   1829                         } else {
   1830                             sourceToken->listHeader->last = sourceToken->previous;
   1831                         }
   1832 
   1833                         if(sourceToken->previous != NULL) {
   1834                             sourceToken->previous->next = sourceToken->next;
   1835                         } else {
   1836                             sourceToken->listHeader->first = sourceToken->next;
   1837                         }
   1838                         sourceToken->next = NULL;
   1839                         sourceToken->previous = NULL;
   1840                     }
   1841                 }
   1842 
   1843                 sourceToken->strength = src->parsedToken.strength;
   1844                 sourceToken->listHeader = lastToken->listHeader;
   1845 
   1846                 /*
   1847                 1.  Find the strongest strength in each list, and set strongestP and strongestN
   1848                 accordingly in the headers.
   1849                 */
   1850                 if(lastStrength == UCOL_TOK_RESET
   1851                     || sourceToken->listHeader->first == 0) {
   1852                         /* If LAST is a reset
   1853                         insert sourceToken in the list. */
   1854                         if(sourceToken->listHeader->first == 0) {
   1855                             sourceToken->listHeader->first = sourceToken;
   1856                             sourceToken->listHeader->last = sourceToken;
   1857                         } else { /* we need to find a place for us */
   1858                             /* and we'll get in front of the same strength */
   1859                             if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
   1860                                 sourceToken->next = sourceToken->listHeader->first;
   1861                                 sourceToken->next->previous = sourceToken;
   1862                                 sourceToken->listHeader->first = sourceToken;
   1863                                 sourceToken->previous = NULL;
   1864                             } else {
   1865                                 lastToken = sourceToken->listHeader->first;
   1866                                 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
   1867                                     lastToken = lastToken->next;
   1868                                 }
   1869                                 if(lastToken->next != NULL) {
   1870                                     lastToken->next->previous = sourceToken;
   1871                                 } else {
   1872                                     sourceToken->listHeader->last = sourceToken;
   1873                                 }
   1874                                 sourceToken->previous = lastToken;
   1875                                 sourceToken->next = lastToken->next;
   1876                                 lastToken->next = sourceToken;
   1877                             }
   1878                         }
   1879                     } else {
   1880                         /* Otherwise (when LAST is not a reset)
   1881                         if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
   1882                         otherwise insert before.
   1883                         when inserting after or before, search to the next position with the same
   1884                         strength in that direction. (This is called postpone insertion).         */
   1885                         if(sourceToken != lastToken) {
   1886                             if(lastToken->polarity == sourceToken->polarity) {
   1887                                 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
   1888                                     lastToken = lastToken->next;
   1889                                 }
   1890                                 sourceToken->previous = lastToken;
   1891                                 if(lastToken->next != NULL) {
   1892                                     lastToken->next->previous = sourceToken;
   1893                                 } else {
   1894                                     sourceToken->listHeader->last = sourceToken;
   1895                                 }
   1896 
   1897                                 sourceToken->next = lastToken->next;
   1898                                 lastToken->next = sourceToken;
   1899                             } else {
   1900                                 while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
   1901                                     lastToken = lastToken->previous;
   1902                                 }
   1903                                 sourceToken->next = lastToken;
   1904                                 if(lastToken->previous != NULL) {
   1905                                     lastToken->previous->next = sourceToken;
   1906                                 } else {
   1907                                     sourceToken->listHeader->first = sourceToken;
   1908                                 }
   1909                                 sourceToken->previous = lastToken->previous;
   1910                                 lastToken->previous = sourceToken;
   1911                             }
   1912                         } else { /* repeated one thing twice in rules, stay with the stronger strength */
   1913                             if(lastStrength < sourceToken->strength) {
   1914                                 sourceToken->strength = lastStrength;
   1915                             }
   1916                         }
   1917                     }
   1918 
   1919                     /* if the token was a variable top, we're gonna put it in */
   1920                     if(variableTop == TRUE && src->varTop == NULL) {
   1921                         variableTop = FALSE;
   1922                         src->varTop = sourceToken;
   1923                     }
   1924 
   1925                     // Treat the expansions.
   1926                     // There are two types of expansions: explicit (x / y) and reset based propagating expansions
   1927                     // (&abc * d * e <=> &ab * d / c * e / c)
   1928                     // if both of them are in effect for a token, they are combined.
   1929 
   1930                     sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
   1931 
   1932                     if(expandNext != 0) {
   1933                         if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
   1934                             expandNext = 0;
   1935                         } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
   1936                             sourceToken->expansion = expandNext;
   1937                         } else { /* there is both explicit and implicit expansion. We need to make a combination */
   1938                             uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
   1939                             uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
   1940                             sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
   1941                             src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
   1942                         }
   1943                     }
   1944 
   1945                     // This is just for debugging purposes
   1946                     if(sourceToken->expansion != 0) {
   1947                         sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
   1948                     } else {
   1949                         sourceToken->debugExpansion = 0;
   1950                     }
   1951                     // if the previous token was a reset before, the strength of this
   1952                     // token must match the strength of before. Otherwise we have an
   1953                     // undefined situation.
   1954                     // In other words, we currently have a cludge which we use to
   1955                     // represent &a >> x. This is written as &[before 2]a << x.
   1956                     if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
   1957                         uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
   1958                         if(beforeStrength != sourceToken->strength) {
   1959                             *status = U_INVALID_FORMAT_ERROR;
   1960                             syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
   1961                             DBG_FORMAT_ERROR
   1962                             return 0;
   1963                         }
   1964                     }
   1965             } else {
   1966                 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
   1967                     /* if the previous token was also a reset, */
   1968                     /*this means that we have two consecutive resets */
   1969                     /* and we want to remove the previous one if empty*/
   1970                     if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
   1971                         src->resultLen--;
   1972                     }
   1973                 }
   1974 
   1975                 if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
   1976                     uint32_t searchCharsLen = src->parsedToken.charsLen;
   1977                     while(searchCharsLen > 1 && sourceToken == NULL) {
   1978                         searchCharsLen--;
   1979                         //key = searchCharsLen << 24 | charsOffset;
   1980                         UColToken key;
   1981                         key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
   1982                         key.rulesToParseHdl = &(src->source);
   1983                         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
   1984                     }
   1985                     if(sourceToken != NULL) {
   1986                         expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
   1987                     }
   1988                 }
   1989 
   1990                 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
   1991                     if(top == FALSE) { /* there is no indirection */
   1992                         uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
   1993                         if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
   1994                             /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
   1995                             while(sourceToken->strength > strength && sourceToken->previous != NULL) {
   1996                                 sourceToken = sourceToken->previous;
   1997                             }
   1998                             /* here, either we hit the strength or NULL */
   1999                             if(sourceToken->strength == strength) {
   2000                                 if(sourceToken->previous != NULL) {
   2001                                     sourceToken = sourceToken->previous;
   2002                                 } else { /* start of list */
   2003                                     sourceToken = sourceToken->listHeader->reset;
   2004                                 }
   2005                             } else { /* we hit NULL */
   2006                                 /* we should be doing the else part */
   2007                                 sourceToken = sourceToken->listHeader->reset;
   2008                                 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
   2009                             }
   2010                         } else {
   2011                             sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
   2012                         }
   2013                     } else { /* this is both before and indirection */
   2014                         top = FALSE;
   2015                         ListList[src->resultLen].previousCE = 0;
   2016                         ListList[src->resultLen].previousContCE = 0;
   2017                         ListList[src->resultLen].indirect = TRUE;
   2018                         /* we need to do slightly more work. we need to get the baseCE using the */
   2019                         /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
   2020                         /* in ucol_bld */
   2021                         uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
   2022                         uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
   2023                         uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
   2024                         uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
   2025 
   2026                         UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
   2027                         if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) &&
   2028                            (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
   2029                             uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
   2030                             uint32_t raw = uprv_uca_getRawFromImplicit(primary);
   2031                             uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
   2032                             CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
   2033                             SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
   2034                         } else {
   2035                             /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
   2036                             ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
   2037                         }
   2038 
   2039                         ListList[src->resultLen].baseCE = CE;
   2040                         ListList[src->resultLen].baseContCE = SecondCE;
   2041                         ListList[src->resultLen].nextCE = 0;
   2042                         ListList[src->resultLen].nextContCE = 0;
   2043 
   2044                         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
   2045                     }
   2046                 }
   2047 
   2048 
   2049                 /*  5 If the relation is a reset:
   2050                 If sourceToken is null
   2051                 Create new list, create new sourceToken, make the baseCE from source, put
   2052                 the sourceToken in ListHeader of the new list */
   2053                 if(sourceToken == NULL) {
   2054                     /*
   2055                     3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
   2056                     First convert all expansions into normal form. Examples:
   2057                     If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
   2058                     d * ... into &x * c/y * d * ...
   2059                     Note: reset values can never have expansions, although they can cause the
   2060                     very next item to have one. They may be contractions, if they are found
   2061                     earlier in the list.
   2062                     */
   2063                     if(top == FALSE) {
   2064                         collIterate s;
   2065                         uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
   2066 
   2067                         uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status);
   2068 
   2069                         CE = ucol_getNextCE(src->UCA, &s, status);
   2070                         const UChar *expand = s.pos;
   2071                         SecondCE = ucol_getNextCE(src->UCA, &s, status);
   2072 
   2073                         ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
   2074                         if(isContinuation(SecondCE)) {
   2075                             ListList[src->resultLen].baseContCE = SecondCE;
   2076                         } else {
   2077                             ListList[src->resultLen].baseContCE = 0;
   2078                         }
   2079                         ListList[src->resultLen].nextCE = 0;
   2080                         ListList[src->resultLen].nextContCE = 0;
   2081                         ListList[src->resultLen].previousCE = 0;
   2082                         ListList[src->resultLen].previousContCE = 0;
   2083                         ListList[src->resultLen].indirect = FALSE;
   2084                         sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
   2085                     } else { /* top == TRUE */
   2086                         /* just use the supplied values */
   2087                         top = FALSE;
   2088                         ListList[src->resultLen].previousCE = 0;
   2089                         ListList[src->resultLen].previousContCE = 0;
   2090                         ListList[src->resultLen].indirect = TRUE;
   2091                         ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
   2092                         ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
   2093                         ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
   2094                         ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
   2095 
   2096                         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
   2097 
   2098                     }
   2099                 } else { /* reset to something already in rules */
   2100                     top = FALSE;
   2101                 }
   2102             }
   2103             /*  7 After all this, set LAST to point to sourceToken, and goto step 3. */
   2104             lastToken = sourceToken;
   2105         } else {
   2106             if(U_FAILURE(*status)) {
   2107                 return 0;
   2108             }
   2109         }
   2110     }
   2111 #ifdef DEBUG_FOR_CODE_POINTS
   2112     fclose(dfcp_fp);
   2113 #endif
   2114 
   2115 
   2116     if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
   2117         src->resultLen--;
   2118     }
   2119     return src->resultLen;
   2120 }
   2121 
   2122 const UChar* ucol_tok_getRulesFromBundle(
   2123     void* /*context*/,
   2124     const char* locale,
   2125     const char* type,
   2126     int32_t* pLength,
   2127     UErrorCode* status)
   2128 {
   2129     const UChar* rules = NULL;
   2130     UResourceBundle* bundle;
   2131     UResourceBundle* collations;
   2132     UResourceBundle* collation;
   2133 
   2134     *pLength = 0;
   2135 
   2136     bundle = ures_open(U_ICUDATA_COLL, locale, status);
   2137     if(U_SUCCESS(*status)){
   2138         collations = ures_getByKey(bundle, "collations", NULL, status);
   2139         if(U_SUCCESS(*status)){
   2140             collation = ures_getByKey(collations, type, NULL, status);
   2141             if(U_SUCCESS(*status)){
   2142                 rules = ures_getStringByKey(collation, "Sequence", pLength, status);
   2143                 if(U_FAILURE(*status)){
   2144                     *pLength = 0;
   2145                     rules = NULL;
   2146                 }
   2147                 ures_close(collation);
   2148             }
   2149             ures_close(collations);
   2150         }
   2151     }
   2152 
   2153     ures_close(bundle);
   2154 
   2155     return rules;
   2156 }
   2157 
   2158 void ucol_tok_initTokenList(
   2159     UColTokenParser *src,
   2160     const UChar *rules,
   2161     uint32_t rulesLength,
   2162     const UCollator *UCA,
   2163     GetCollationRulesFunction importFunc,
   2164     void* context,
   2165     UErrorCode *status) {
   2166     U_NAMESPACE_USE
   2167 
   2168     uint32_t nSize = 0;
   2169     uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
   2170 
   2171     bool needToDeallocRules = false;
   2172 
   2173     if(U_FAILURE(*status)) {
   2174         return;
   2175     }
   2176 
   2177     // set everything to zero, so that we can clean up gracefully
   2178     uprv_memset(src, 0, sizeof(UColTokenParser));
   2179 
   2180     // first we need to find options that don't like to be normalized,
   2181     // like copy and remove...
   2182     //const UChar *openBrace = rules;
   2183     int32_t optionNumber = -1;
   2184     const UChar *setStart = NULL;
   2185     uint32_t i = 0;
   2186     while(i < rulesLength) {
   2187         if(rules[i] == 0x005B) {    // '[': start of an option
   2188             /* Gets the following:
   2189                optionNumber: The index of the option.
   2190                setStart: The pointer at which the option arguments start.
   2191              */
   2192             optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
   2193 
   2194             if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
   2195                 // [optimize]
   2196                 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
   2197                 if(U_SUCCESS(*status)) {
   2198                     if(src->copySet == NULL) {
   2199                         src->copySet = newSet;
   2200                     } else {
   2201                         uset_addAll(src->copySet, newSet);
   2202                         uset_close(newSet);
   2203                     }
   2204                 } else {
   2205                     return;
   2206                 }
   2207             } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
   2208                 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
   2209                 if(U_SUCCESS(*status)) {
   2210                     if(src->removeSet == NULL) {
   2211                         src->removeSet = newSet;
   2212                     } else {
   2213                         uset_addAll(src->removeSet, newSet);
   2214                         uset_close(newSet);
   2215                     }
   2216                 } else {
   2217                     return;
   2218                 }
   2219             } else if(optionNumber == OPTION_IMPORT){
   2220                 // [import <collation-name>]
   2221 
   2222                 // Find the address of the closing ].
   2223                 UChar* import_end = u_strchr(setStart, 0x005D);
   2224                 int32_t optionEndOffset = (int32_t)(import_end + 1 - rules);
   2225                 // Ignore trailing whitespace.
   2226                 while(uprv_isRuleWhiteSpace(*(import_end-1))) {
   2227                     --import_end;
   2228                 }
   2229 
   2230                 int32_t optionLength = (int32_t)(import_end - setStart);
   2231                 char option[50];
   2232                 if(optionLength >= (int32_t)sizeof(option)) {
   2233                     *status = U_ILLEGAL_ARGUMENT_ERROR;
   2234                     return;
   2235                 }
   2236                 u_UCharsToChars(setStart, option, optionLength);
   2237                 option[optionLength] = 0;
   2238 
   2239                 *status = U_ZERO_ERROR;
   2240                 char locale[50];
   2241                 int32_t templ;
   2242                 uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &templ, status);
   2243                 if(U_FAILURE(*status)) {
   2244                     *status = U_ILLEGAL_ARGUMENT_ERROR;
   2245                     return;
   2246                 }
   2247 
   2248                 char type[50];
   2249                 if (uloc_getKeywordValue(locale, "collation", type, (int32_t)sizeof(type), status) <= 0 ||
   2250                     U_FAILURE(*status)
   2251                 ) {
   2252                     *status = U_ZERO_ERROR;
   2253                     uprv_strcpy(type, "standard");
   2254                 }
   2255 
   2256                 // TODO: Use public functions when available, see ticket #8134.
   2257                 char *keywords = (char *)locale_getKeywordsStart(locale);
   2258                 if(keywords != NULL) {
   2259                     *keywords = 0;
   2260                 }
   2261 
   2262                 int32_t importRulesLength = 0;
   2263                 const UChar* importRules = importFunc(context, locale, type, &importRulesLength, status);
   2264 
   2265 #ifdef DEBUG_FOR_COLL_RULES
   2266                 std::string s;
   2267                 UnicodeString(importRules).toUTF8String(s);
   2268                 std::cout << "Import rules = " << s << std::endl;
   2269 #endif
   2270 
   2271                 // Add the length of the imported rules to length of the original rules,
   2272                 // and subtract the length of the import option.
   2273                 uint32_t newRulesLength = rulesLength + importRulesLength - (optionEndOffset - i);
   2274 
   2275                 UChar* newRules = (UChar*)uprv_malloc(newRulesLength*sizeof(UChar));
   2276 
   2277 #ifdef DEBUG_FOR_COLL_RULES
   2278                 std::string s1;
   2279                 UnicodeString(rules).toUTF8String(s1);
   2280                 std::cout << "Original rules = " << s1 << std::endl;
   2281 #endif
   2282 
   2283 
   2284                 // Copy the section of the original rules leading up to the import
   2285                 uprv_memcpy(newRules, rules, i*sizeof(UChar));
   2286                 // Copy the imported rules
   2287                 uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UChar));
   2288                 // Copy the rest of the original rules (minus the import option itself)
   2289                 uprv_memcpy(newRules+i+importRulesLength,
   2290                             rules+optionEndOffset,
   2291                             (rulesLength-optionEndOffset)*sizeof(UChar));
   2292 
   2293 #ifdef DEBUG_FOR_COLL_RULES
   2294                 std::string s2;
   2295                 UnicodeString(newRules).toUTF8String(s2);
   2296                 std::cout << "Resulting rules = " << s2 << std::endl;
   2297 #endif
   2298 
   2299                 if(needToDeallocRules){
   2300                     // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
   2301                     uprv_free((void*)rules);
   2302                 }
   2303                 needToDeallocRules = true;
   2304                 rules = newRules;
   2305                 rulesLength = newRulesLength;
   2306 
   2307                 estimatedSize += importRulesLength*2;
   2308 
   2309                 // First character of the new rules needs to be processed
   2310                 i--;
   2311             }
   2312         }
   2313         //openBrace++;
   2314         i++;
   2315     }
   2316 
   2317     src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
   2318     /* test for NULL */
   2319     if (src->source == NULL) {
   2320         *status = U_MEMORY_ALLOCATION_ERROR;
   2321         return;
   2322     }
   2323     uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
   2324     nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
   2325     if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
   2326         *status = U_ZERO_ERROR;
   2327         src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
   2328         /* test for NULL */
   2329         if (src->source == NULL) {
   2330             *status = U_MEMORY_ALLOCATION_ERROR;
   2331             return;
   2332         }
   2333         nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
   2334     }
   2335     if(needToDeallocRules){
   2336         // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
   2337         uprv_free((void*)rules);
   2338     }
   2339 
   2340 
   2341     src->current = src->source;
   2342     src->end = src->source+nSize;
   2343     src->sourceCurrent = src->source;
   2344     src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
   2345     src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
   2346     src->varTop = NULL;
   2347     src->UCA = UCA;
   2348     src->invUCA = ucol_initInverseUCA(status);
   2349     src->parsedToken.charsLen = 0;
   2350     src->parsedToken.charsOffset = 0;
   2351     src->parsedToken.extensionLen = 0;
   2352     src->parsedToken.extensionOffset = 0;
   2353     src->parsedToken.prefixLen = 0;
   2354     src->parsedToken.prefixOffset = 0;
   2355     src->parsedToken.flags = 0;
   2356     src->parsedToken.strength = UCOL_TOK_UNSET;
   2357     src->buildCCTabFlag = FALSE;
   2358     src->isStarred = FALSE;
   2359     src->inRange = FALSE;
   2360     src->lastRangeCp = 0;
   2361     src->previousCp = 0;
   2362 
   2363     if(U_FAILURE(*status)) {
   2364         return;
   2365     }
   2366     src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
   2367     if(U_FAILURE(*status)) {
   2368         return;
   2369     }
   2370     uhash_setValueDeleter(src->tailored, uhash_freeBlock);
   2371 
   2372     src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
   2373     /* test for NULL */
   2374     if (src->opts == NULL) {
   2375         *status = U_MEMORY_ALLOCATION_ERROR;
   2376         return;
   2377     }
   2378 
   2379     uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
   2380 
   2381     src->lh = 0;
   2382     src->listCapacity = 1024;
   2383     src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
   2384     //Test for NULL
   2385     if (src->lh == NULL) {
   2386         *status = U_MEMORY_ALLOCATION_ERROR;
   2387         return;
   2388     }
   2389     uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
   2390     src->resultLen = 0;
   2391 
   2392     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
   2393 
   2394     // UCOL_RESET_TOP_VALUE
   2395     setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
   2396     // UCOL_FIRST_PRIMARY_IGNORABLE
   2397     setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
   2398     // UCOL_LAST_PRIMARY_IGNORABLE
   2399     setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
   2400     // UCOL_FIRST_SECONDARY_IGNORABLE
   2401     setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
   2402     // UCOL_LAST_SECONDARY_IGNORABLE
   2403     setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
   2404     // UCOL_FIRST_TERTIARY_IGNORABLE
   2405     setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
   2406     // UCOL_LAST_TERTIARY_IGNORABLE
   2407     setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
   2408     // UCOL_FIRST_VARIABLE
   2409     setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
   2410     // UCOL_LAST_VARIABLE
   2411     setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
   2412     // UCOL_FIRST_NON_VARIABLE
   2413     setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
   2414     // UCOL_LAST_NON_VARIABLE
   2415     setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
   2416     // UCOL_FIRST_IMPLICIT
   2417     setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
   2418     // UCOL_LAST_IMPLICIT
   2419     setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
   2420     // UCOL_FIRST_TRAILING
   2421     setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
   2422     // UCOL_LAST_TRAILING
   2423     setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
   2424     ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
   2425 }
   2426 
   2427 
   2428 void ucol_tok_closeTokenList(UColTokenParser *src) {
   2429     if(src->copySet != NULL) {
   2430         uset_close(src->copySet);
   2431     }
   2432     if(src->removeSet != NULL) {
   2433         uset_close(src->removeSet);
   2434     }
   2435     if(src->tailored != NULL) {
   2436         uhash_close(src->tailored);
   2437     }
   2438     if(src->lh != NULL) {
   2439         uprv_free(src->lh);
   2440     }
   2441     if(src->source != NULL) {
   2442         uprv_free(src->source);
   2443     }
   2444     if(src->opts != NULL) {
   2445         uprv_free(src->opts);
   2446     }
   2447     if (src->reorderCodes != NULL) {
   2448         uprv_free(src->reorderCodes);
   2449     }
   2450 }
   2451 
   2452 #endif /* #if !UCONFIG_NO_COLLATION */
   2453