Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2001-2008, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  ucol_tok.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created 02/22/2001
     14 *   created by: Vladimir Weinstein
     15 *
     16 * This module reads a tailoring rule string and produces a list of
     17 * tokens that will be turned into collation elements
     18 *
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 
     23 #if !UCONFIG_NO_COLLATION
     24 
     25 #include "unicode/ustring.h"
     26 #include "unicode/uchar.h"
     27 #include "unicode/uniset.h"
     28 
     29 #include "ucol_tok.h"
     30 #include "ucol_bld.h"
     31 #include "cmemory.h"
     32 #include "util.h"
     33 
     34 U_CDECL_BEGIN
     35 static int32_t U_CALLCONV
     36 uhash_hashTokens(const UHashTok k)
     37 {
     38     int32_t hash = 0;
     39     //uint32_t key = (uint32_t)k.integer;
     40     UColToken *key = (UColToken *)k.pointer;
     41     if (key != 0) {
     42         //int32_t len = (key & 0xFF000000)>>24;
     43         int32_t len = (key->source & 0xFF000000)>>24;
     44         int32_t inc = ((len - 32) / 32) + 1;
     45 
     46         //const UChar *p = (key & 0x00FFFFFF) + rulesToParse;
     47         const UChar *p = (key->source & 0x00FFFFFF) + key->rulesToParse;
     48         const UChar *limit = p + len;
     49 
     50         while (p<limit) {
     51             hash = (hash * 37) + *p;
     52             p += inc;
     53         }
     54     }
     55     return hash;
     56 }
     57 
     58 static UBool U_CALLCONV
     59 uhash_compareTokens(const UHashTok key1, const UHashTok key2)
     60 {
     61     //uint32_t p1 = (uint32_t) key1.integer;
     62     //uint32_t p2 = (uint32_t) key2.integer;
     63     UColToken *p1 = (UColToken *)key1.pointer;
     64     UColToken *p2 = (UColToken *)key2.pointer;
     65     const UChar *s1 = (p1->source & 0x00FFFFFF) + p1->rulesToParse;
     66     const UChar *s2 = (p2->source & 0x00FFFFFF) + p2->rulesToParse;
     67     uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
     68     uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
     69     const UChar *end = s1+s1L-1;
     70 
     71     if (p1 == p2) {
     72         return TRUE;
     73     }
     74     if (p1->source == 0 || p2->source == 0) {
     75         return FALSE;
     76     }
     77     if(s1L != s2L) {
     78         return FALSE;
     79     }
     80     if(p1->source == p2->source) {
     81         return TRUE;
     82     }
     83     while((s1 < end) && *s1 == *s2) {
     84         ++s1;
     85         ++s2;
     86     }
     87     if(*s1 == *s2) {
     88         return TRUE;
     89     } else {
     90         return FALSE;
     91     }
     92 }
     93 U_CDECL_END
     94 
     95 /*static inline void U_CALLCONV
     96 uhash_freeBlockWrapper(void *obj) {
     97     uhash_freeBlock(obj);
     98 }*/
     99 
    100 
/* One entry of the indirect-positioning table. The fields are filled in by */
/* setIndirectBoundaries() from its start[] and end[] arguments.            */
typedef struct {
    uint32_t startCE;      /* start[0]: encoded into the token text by ucol_tok_doSetTop() */
    uint32_t startContCE;  /* start[1]: only encoded when non-zero (presumably a continuation CE) */
    uint32_t limitCE;      /* end[0], or 0 when no end was supplied */
    uint32_t limitContCE;  /* end[1], or 0 when no end was supplied */
} indirectBoundaries;
    107 
    108 /* these values are used for finding CE values for indirect positioning. */
    109 /* Indirect positioning is a mechanism for allowing resets on symbolic   */
    110 /* values. It only works for resets and you cannot tailor indirect names */
    111 /* An indirect name can define either an anchor point or a range. An     */
    112 /* anchor point behaves in exactly the same way as a code point in reset */
/* would, except that it cannot be tailored. A range (we currently only  */
/* know of the [top] range) will explicitly set the upper bound for      */
    115 /* generated CEs, thus allowing for better control over how many CEs can */
    116 /* be squeezed between in the range without performance penalty.         */
    117 /* In that respect, we use [top] for tailoring of locales that use CJK   */
    118 /* characters. Other indirect values are currently a pure convenience,   */
    119 /* they can be used to assure that the CEs will be always positioned in  */
    120 /* the same place relative to a point with known properties (e.g. first  */
    121 /* primary ignorable). */
/* Slot 0 is used by [top]; [first ...] and [last ...] select slots 1..14   */
/* (index = option - OPTION_FIRST + 1 + 2 * suboption), hence 15 entries.   */
static indirectBoundaries ucolIndirectBoundaries[15];
    123 /*
    124 static indirectBoundaries ucolIndirectBoundaries[11] = {
    125 { UCOL_RESET_TOP_VALUE,               0,
    126 UCOL_NEXT_TOP_VALUE,                0 },
    127 { UCOL_FIRST_PRIMARY_IGNORABLE,       0,
    128 0,                                  0 },
    129 { UCOL_LAST_PRIMARY_IGNORABLE,        UCOL_LAST_PRIMARY_IGNORABLE_CONT,
    130 0,                                  0 },
    131 { UCOL_FIRST_SECONDARY_IGNORABLE,     0,
    132 0,                                  0 },
    133 { UCOL_LAST_SECONDARY_IGNORABLE,      0,
    134 0,                                  0 },
    135 { UCOL_FIRST_TERTIARY_IGNORABLE,      0,
    136 0,                                  0 },
    137 { UCOL_LAST_TERTIARY_IGNORABLE,       0,
    138 0,                                  0 },
    139 { UCOL_FIRST_VARIABLE,                0,
    140 0,                                  0 },
    141 { UCOL_LAST_VARIABLE,                 0,
    142 0,                                  0 },
    143 { UCOL_FIRST_NON_VARIABLE,            0,
    144 0,                                  0 },
    145 { UCOL_LAST_NON_VARIABLE,             0,
    146 0,                                  0 },
    147 };
    148 */
    149 
    150 static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
    151 
    152     // Set values for the top - TODO: once we have values for all the indirects, we are going
    153     // to initalize here.
    154     ucolIndirectBoundaries[indexR].startCE = start[0];
    155     ucolIndirectBoundaries[indexR].startContCE = start[1];
    156     if(end) {
    157         ucolIndirectBoundaries[indexR].limitCE = end[0];
    158         ucolIndirectBoundaries[indexR].limitContCE = end[1];
    159     } else {
    160         ucolIndirectBoundaries[indexR].limitCE = 0;
    161         ucolIndirectBoundaries[indexR].limitContCE = 0;
    162     }
    163 }
    164 
    165 
    166 static inline
    167 void syntaxError(const UChar* rules,
    168                  int32_t pos,
    169                  int32_t rulesLen,
    170                  UParseError* parseError)
    171 {
    172     parseError->offset = pos;
    173     parseError->line = 0 ; /* we are not using line numbers */
    174 
    175     // for pre-context
    176     int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
    177     int32_t stop  = pos;
    178 
    179     u_memcpy(parseError->preContext,rules+start,stop-start);
    180     //null terminate the buffer
    181     parseError->preContext[stop-start] = 0;
    182 
    183     //for post-context
    184     start = pos+1;
    185     stop  = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
    186     rulesLen;
    187 
    188     if(start < stop) {
    189         u_memcpy(parseError->postContext,rules+start,stop-start);
    190         //null terminate the buffer
    191         parseError->postContext[stop-start]= 0;
    192     } else {
    193         parseError->postContext[0] = 0;
    194     }
    195 }
    196 
    197 static
    198 void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
    199     switch(attrib) {
    200     case UCOL_HIRAGANA_QUATERNARY_MODE:
    201         opts->hiraganaQ = value;
    202         break;
    203     case UCOL_FRENCH_COLLATION:
    204         opts->frenchCollation = value;
    205         break;
    206     case UCOL_ALTERNATE_HANDLING:
    207         opts->alternateHandling = value;
    208         break;
    209     case UCOL_CASE_FIRST:
    210         opts->caseFirst = value;
    211         break;
    212     case UCOL_CASE_LEVEL:
    213         opts->caseLevel = value;
    214         break;
    215     case UCOL_NORMALIZATION_MODE:
    216         opts->normalizationMode = value;
    217         break;
    218     case UCOL_STRENGTH:
    219         opts->strength = value;
    220         break;
    221     case UCOL_NUMERIC_COLLATION:
    222         opts->numericCollation = value;
    223         break;
    224     case UCOL_ATTRIBUTE_COUNT:
    225     default:
    226         break;
    227     }
    228 }
    229 
/* Number of entries in rulesOptions (one per option_NN string below). */
#define UTOK_OPTION_COUNT 20

/* Set to TRUE by ucol_uprv_tok_initData() once the strings below have been initialized. */
static UBool didInit = FALSE;
/* we can be strict, or we can be lenient */
/* I'd surely be lenient with the option arguments */
/* maybe even with options */
/* Sub-option (argument) names; each is initialized with U_STRING_INIT in ucol_uprv_tok_initData(). */
U_STRING_DECL(suboption_00, "non-ignorable", 13);
U_STRING_DECL(suboption_01, "shifted",        7);

U_STRING_DECL(suboption_02, "lower",          5);
U_STRING_DECL(suboption_03, "upper",          5);
U_STRING_DECL(suboption_04, "off",            3);
U_STRING_DECL(suboption_05, "on",             2);
U_STRING_DECL(suboption_06, "1",              1);
U_STRING_DECL(suboption_07, "2",              1);
U_STRING_DECL(suboption_08, "3",              1);
U_STRING_DECL(suboption_09, "4",              1);
U_STRING_DECL(suboption_10, "I",              1);

U_STRING_DECL(suboption_11, "primary",        7);
U_STRING_DECL(suboption_12, "secondary",      9);
U_STRING_DECL(suboption_13, "tertiary",       8);
U_STRING_DECL(suboption_14, "variable",       8);
U_STRING_DECL(suboption_15, "regular",        7);
U_STRING_DECL(suboption_16, "implicit",       8);
U_STRING_DECL(suboption_17, "trailing",       8);


/* Option names; the numbering here is declaration order, not the order used in rulesOptions. */
U_STRING_DECL(option_00,    "undefined",      9);
U_STRING_DECL(option_01,    "rearrange",      9);
U_STRING_DECL(option_02,    "alternate",      9);
U_STRING_DECL(option_03,    "backwards",      9);
U_STRING_DECL(option_04,    "variable top",  12);
U_STRING_DECL(option_05,    "top",            3);
U_STRING_DECL(option_06,    "normalization", 13);
U_STRING_DECL(option_07,    "caseLevel",      9);
U_STRING_DECL(option_08,    "caseFirst",      9);
U_STRING_DECL(option_09,    "scriptOrder",   11);
U_STRING_DECL(option_10,    "charsetname",   11);
U_STRING_DECL(option_11,    "charset",        7);
U_STRING_DECL(option_12,    "before",         6);
U_STRING_DECL(option_13,    "hiraganaQ",      9);
U_STRING_DECL(option_14,    "strength",       8);
U_STRING_DECL(option_15,    "first",          5);
U_STRING_DECL(option_16,    "last",           4);
U_STRING_DECL(option_17,    "optimize",       8);
U_STRING_DECL(option_18,    "suppressContractions",         20);
U_STRING_DECL(option_19,    "numericOrdering",              15);
    278 
    279 
    280 /*
    281 [last variable] last variable value
    282 [last primary ignorable] largest CE for primary ignorable
    283 [last secondary ignorable] largest CE for secondary ignorable
    284 [last tertiary ignorable] largest CE for tertiary ignorable
    285 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
    286 */
    287 
    288 
/* Sub-option tables. Each entry is {name, name length, attribute value}.   */
/* They are referenced from rulesOptions and searched with u_strncmpNoCase. */

/* arguments of "alternate" */
static const ucolTokSuboption alternateSub[2] = {
    {suboption_00, 13, UCOL_NON_IGNORABLE},
    {suboption_01,  7, UCOL_SHIFTED}
};

/* arguments of "caseFirst" */
static const ucolTokSuboption caseFirstSub[3] = {
    {suboption_02, 5, UCOL_LOWER_FIRST},
    {suboption_03,  5, UCOL_UPPER_FIRST},
    {suboption_04,  3, UCOL_OFF},
};

/* shared by the on/off options: caseLevel, normalization, hiraganaQ, numericOrdering */
static const ucolTokSuboption onOffSub[2] = {
    {suboption_04, 3, UCOL_OFF},
    {suboption_05, 2, UCOL_ON}
};

/* "backwards" accepts only the argument "2", which turns French collation on */
static const ucolTokSuboption frenchSub[1] = {
    {suboption_07, 1, UCOL_ON}
};

/* arguments of "before": "1", "2", "3" */
static const ucolTokSuboption beforeSub[3] = {
    {suboption_06, 1, UCOL_PRIMARY},
    {suboption_07, 1, UCOL_SECONDARY},
    {suboption_08, 1, UCOL_TERTIARY}
};

/* arguments of "strength": "1".."4" and "I" */
static const ucolTokSuboption strengthSub[5] = {
    {suboption_06, 1, UCOL_PRIMARY},
    {suboption_07, 1, UCOL_SECONDARY},
    {suboption_08, 1, UCOL_TERTIARY},
    {suboption_09, 1, UCOL_QUATERNARY},
    {suboption_10, 1, UCOL_IDENTICAL},
};

/* arguments of "first" and "last". ucol_uprv_tok_readAndSetOption uses the */
/* position of the matching entry, not its attribute value, which is why    */
/* every entry carries the same UCOL_PRIMARY value.                         */
static const ucolTokSuboption firstLastSub[7] = {
    {suboption_11, 7, UCOL_PRIMARY},
    {suboption_12, 9, UCOL_PRIMARY},
    {suboption_13, 8, UCOL_PRIMARY},
    {suboption_14, 8, UCOL_PRIMARY},
    {suboption_15, 7, UCOL_PRIMARY},
    {suboption_16, 8, UCOL_PRIMARY},
    {suboption_17, 8, UCOL_PRIMARY},
};
    332 
/* Symbolic names for the indexes of rulesOptions. ucol_uprv_tok_readOption */
/* returns such an index and ucol_uprv_tok_readAndSetOption switches on it, */
/* so the order here must match the order of the rulesOptions entries.      */
enum OptionNumber {
    /* options that map directly to a collation attribute */
    OPTION_ALTERNATE_HANDLING = 0,
    OPTION_FRENCH_COLLATION,
    OPTION_CASE_LEVEL,
    OPTION_CASE_FIRST,
    OPTION_NORMALIZATION_MODE,
    OPTION_HIRAGANA_QUATERNARY,
    OPTION_STRENGTH,
    OPTION_NUMERIC_COLLATION,
    /* alias for the last attribute-style option; does not consume a new value */
    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,
    /* options that need special handling */
    OPTION_VARIABLE_TOP,
    OPTION_REARRANGE,
    OPTION_BEFORE,
    OPTION_TOP,
    OPTION_FIRST,
    OPTION_LAST,
    OPTION_OPTIMIZE,
    OPTION_SUPPRESS_CONTRACTIONS,
    /* recognized by name but rejected with U_UNSUPPORTED_ERROR in ucol_uprv_tok_readAndSetOption */
    OPTION_UNDEFINED,
    OPTION_SCRIPT_ORDER,
    OPTION_CHARSET_NAME,
    OPTION_CHARSET
} ;
    356 
/* Table of recognized rule options, indexed by OptionNumber.               */
/* Each entry is {name, name length, sub-option table, sub-option count,    */
/* attribute}. Options without a matching collation attribute use           */
/* UCOL_ATTRIBUTE_COUNT as a placeholder. Names are matched in table order  */
/* by prefix, so "charsetname" has to come before "charset".                */
static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
    /*00*/ {option_02,  9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
    /*01*/ {option_03,  9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards"      */
    /*02*/ {option_07,  9, onOffSub, 2, UCOL_CASE_LEVEL},  /*"caseLevel"      */
    /*03*/ {option_08,  9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst"   */
    /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
    /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
    /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
    /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION},  /*"numericOrdering"*/
    /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top"   */
    /*09*/ {option_01,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange"      */
    /*10*/ {option_12,  6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before"    */
    /*11*/ {option_05,  3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top"            */
    /*12*/ {option_15,  5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
    /*13*/ {option_16,  4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
    /*14*/ {option_17,  8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize"      */
    /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions"      */
    /*16*/ {option_00,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined"      */
    /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder"    */
    /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname"    */
    /*19*/ {option_11,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT}  /*"charset"        */
};
    379 
    380 static
    381 int32_t u_strncmpNoCase(const UChar     *s1,
    382                         const UChar     *s2,
    383                         int32_t     n)
    384 {
    385     if(n > 0) {
    386         int32_t rc;
    387         for(;;) {
    388             rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
    389             if(rc != 0 || *s1 == 0 || --n == 0) {
    390                 return rc;
    391             }
    392             ++s1;
    393             ++s2;
    394         }
    395     }
    396     return 0;
    397 }
    398 
    399 static
    400 void ucol_uprv_tok_initData() {
    401     if(!didInit) {
    402         U_STRING_INIT(suboption_00, "non-ignorable", 13);
    403         U_STRING_INIT(suboption_01, "shifted",        7);
    404 
    405         U_STRING_INIT(suboption_02, "lower",          5);
    406         U_STRING_INIT(suboption_03, "upper",          5);
    407         U_STRING_INIT(suboption_04, "off",            3);
    408         U_STRING_INIT(suboption_05, "on",             2);
    409 
    410         U_STRING_INIT(suboption_06, "1",              1);
    411         U_STRING_INIT(suboption_07, "2",              1);
    412         U_STRING_INIT(suboption_08, "3",              1);
    413         U_STRING_INIT(suboption_09, "4",              1);
    414         U_STRING_INIT(suboption_10, "I",              1);
    415 
    416         U_STRING_INIT(suboption_11, "primary",        7);
    417         U_STRING_INIT(suboption_12, "secondary",      9);
    418         U_STRING_INIT(suboption_13, "tertiary",       8);
    419         U_STRING_INIT(suboption_14, "variable",       8);
    420         U_STRING_INIT(suboption_15, "regular",        7);
    421         U_STRING_INIT(suboption_16, "implicit",       8);
    422         U_STRING_INIT(suboption_17, "trailing",       8);
    423 
    424 
    425         U_STRING_INIT(option_00, "undefined",      9);
    426         U_STRING_INIT(option_01, "rearrange",      9);
    427         U_STRING_INIT(option_02, "alternate",      9);
    428         U_STRING_INIT(option_03, "backwards",      9);
    429         U_STRING_INIT(option_04, "variable top",  12);
    430         U_STRING_INIT(option_05, "top",            3);
    431         U_STRING_INIT(option_06, "normalization", 13);
    432         U_STRING_INIT(option_07, "caseLevel",      9);
    433         U_STRING_INIT(option_08, "caseFirst",      9);
    434         U_STRING_INIT(option_09, "scriptOrder",   11);
    435         U_STRING_INIT(option_10, "charsetname",   11);
    436         U_STRING_INIT(option_11, "charset",        7);
    437         U_STRING_INIT(option_12, "before",         6);
    438         U_STRING_INIT(option_13, "hiraganaQ",      9);
    439         U_STRING_INIT(option_14, "strength",       8);
    440         U_STRING_INIT(option_15, "first",          5);
    441         U_STRING_INIT(option_16, "last",           4);
    442         U_STRING_INIT(option_17, "optimize",       8);
    443         U_STRING_INIT(option_18, "suppressContractions",         20);
    444         U_STRING_INIT(option_19, "numericOrdering",      15);
    445         didInit = TRUE;
    446     }
    447 }
    448 
    449 
    450 // This function reads basic options to set in the runtime collator
    451 // used by data driven tests. Should not support build time options
    452 U_CAPI const UChar * U_EXPORT2
    453 ucol_tok_getNextArgument(const UChar *start, const UChar *end,
    454                          UColAttribute *attrib, UColAttributeValue *value,
    455                          UErrorCode *status)
    456 {
    457     uint32_t i = 0;
    458     int32_t j=0;
    459     UBool foundOption = FALSE;
    460     const UChar *optionArg = NULL;
    461 
    462     ucol_uprv_tok_initData();
    463 
    464     while(start < end && u_isWhitespace(*start)) { /* eat whitespace */
    465         start++;
    466     }
    467     if(start >= end) {
    468         return NULL;
    469     }
    470     /* skip opening '[' */
    471     if(*start == 0x005b) {
    472         start++;
    473     } else {
    474         *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
    475         return NULL;
    476     }
    477 
    478     while(i < UTOK_OPTION_COUNT) {
    479         if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
    480             foundOption = TRUE;
    481             if(end - start > rulesOptions[i].optionLen) {
    482                 optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
    483                 while(u_isWhitespace(*optionArg)) { /* eat whitespace */
    484                     optionArg++;
    485                 }
    486             }
    487             break;
    488         }
    489         i++;
    490     }
    491 
    492     if(!foundOption) {
    493         *status = U_ILLEGAL_ARGUMENT_ERROR;
    494         return NULL;
    495     }
    496 
    497     if(optionArg) {
    498         for(j = 0; j<rulesOptions[i].subSize; j++) {
    499             if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
    500                 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
    501                 *attrib = rulesOptions[i].attr;
    502                 *value = rulesOptions[i].subopts[j].attrVal;
    503                 optionArg += rulesOptions[i].subopts[j].subLen;
    504                 while(u_isWhitespace(*optionArg)) { /* eat whitespace */
    505                     optionArg++;
    506                 }
    507                 if(*optionArg == 0x005d) {
    508                     optionArg++;
    509                     return optionArg;
    510                 } else {
    511                     *status = U_ILLEGAL_ARGUMENT_ERROR;
    512                     return NULL;
    513                 }
    514             }
    515         }
    516     }
    517     *status = U_ILLEGAL_ARGUMENT_ERROR;
    518     return NULL;
    519 }
    520 
    521 static
    522 USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
    523     while(*start != 0x005b) { /* advance while we find the first '[' */
    524         start++;
    525     }
    526     // now we need to get a balanced set of '[]'. The problem is that a set can have
    527     // many, and *end point to the first closing '['
    528     int32_t noOpenBraces = 1;
    529     int32_t current = 1; // skip the opening brace
    530     while(start+current < end && noOpenBraces != 0) {
    531         if(start[current] == 0x005b) {
    532             noOpenBraces++;
    533         } else if(start[current] == 0x005D) { // closing brace
    534             noOpenBraces--;
    535         }
    536         current++;
    537     }
    538 
    539     if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
    540         *status = U_ILLEGAL_ARGUMENT_ERROR;
    541         return NULL;
    542     }
    543     return uset_openPattern(start, current, status);
    544 }
    545 
    546 static
    547 int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
    548     int32_t i = 0;
    549     ucol_uprv_tok_initData();
    550 
    551     while(u_isWhitespace(*start)) { /* eat whitespace */
    552         start++;
    553     }
    554     while(i < UTOK_OPTION_COUNT) {
    555         if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
    556             if(end - start > rulesOptions[i].optionLen) {
    557                 *optionArg = start+rulesOptions[i].optionLen; /* start of the options*/
    558                 while(u_isWhitespace(**optionArg)) { /* eat whitespace */
    559                     (*optionArg)++;
    560                 }
    561             }
    562             break;
    563         }
    564         i++;
    565     }
    566     if(i == UTOK_OPTION_COUNT) {
    567         i = -1; // didn't find an option
    568     }
    569     return i;
    570 }
    571 
    572 
    573 // reads and conforms to various options in rules
    574 // end is the position of the first closing ']'
    575 // However, some of the options take an UnicodeSet definition
    576 // which needs to duplicate the closing ']'
    577 // for example: '[copy [\uAC00-\uD7FF]]'
    578 // These options will move end to the second ']' and the
    579 // caller will set the current to it.
    580 static
    581 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
    582     const UChar* start = src->current;
    583     int32_t i = 0;
    584     int32_t j=0;
    585     const UChar *optionArg = NULL;
    586 
    587     uint8_t result = 0;
    588 
    589     start++; /*skip opening '['*/
    590     i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
    591     if(optionArg) {
    592         src->current = optionArg;
    593     }
    594 
    595     if(i < 0) {
    596         *status = U_ILLEGAL_ARGUMENT_ERROR;
    597     } else {
    598         int32_t noOpenBraces = 1;
    599         switch(i) {
    600     case OPTION_ALTERNATE_HANDLING:
    601     case OPTION_FRENCH_COLLATION:
    602     case OPTION_CASE_LEVEL:
    603     case OPTION_CASE_FIRST:
    604     case OPTION_NORMALIZATION_MODE:
    605     case OPTION_HIRAGANA_QUATERNARY:
    606     case OPTION_STRENGTH:
    607     case OPTION_NUMERIC_COLLATION:
    608         if(optionArg) {
    609             for(j = 0; j<rulesOptions[i].subSize; j++) {
    610                 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
    611                     ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
    612                     result =  UCOL_TOK_SUCCESS;
    613                 }
    614             }
    615         }
    616         if(result == 0) {
    617             *status = U_ILLEGAL_ARGUMENT_ERROR;
    618         }
    619         break;
    620     case OPTION_VARIABLE_TOP:
    621         result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
    622         break;
    623     case OPTION_REARRANGE:
    624         result = UCOL_TOK_SUCCESS;
    625         break;
    626     case OPTION_BEFORE:
    627         if(optionArg) {
    628             for(j = 0; j<rulesOptions[i].subSize; j++) {
    629                 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
    630                     result = UCOL_TOK_SUCCESS | rulesOptions[i].subopts[j].attrVal + 1;
    631                 }
    632             }
    633         }
    634         if(result == 0) {
    635             *status = U_ILLEGAL_ARGUMENT_ERROR;
    636         }
    637         break;
    638     case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
    639         /* index to this array will be src->parsedToken.indirectIndex*/
    640         src->parsedToken.indirectIndex = 0;
    641         result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
    642         break;
    643     case OPTION_FIRST:
    644     case OPTION_LAST: /* first, last */
    645         for(j = 0; j<rulesOptions[i].subSize; j++) {
    646             if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
    647                 // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
    648                 // element of indirect boundaries is reserved for top.
    649                 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
    650                 result =  UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
    651             }
    652         }
    653         if(result == 0) {
    654             *status = U_ILLEGAL_ARGUMENT_ERROR;
    655         }
    656         break;
    657     case OPTION_OPTIMIZE:
    658     case OPTION_SUPPRESS_CONTRACTIONS:  // copy and remove are handled before normalization
    659         // we need to move end here
    660         src->current++; // skip opening brace
    661         while(src->current < src->end && noOpenBraces != 0) {
    662             if(*src->current == 0x005b) {
    663                 noOpenBraces++;
    664             } else if(*src->current == 0x005D) { // closing brace
    665                 noOpenBraces--;
    666             }
    667             src->current++;
    668         }
    669         result = UCOL_TOK_SUCCESS;
    670         break;
    671     default:
    672         *status = U_UNSUPPORTED_ERROR;
    673         break;
    674         }
    675     }
    676     src->current = u_memchr(src->current, 0x005d, src->end-src->current);
    677     return result;
    678 }
    679 
    680 
    681 inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
    682     if(src->extraCurrent+len >= src->extraEnd) {
    683         /* reallocate */
    684         UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));
    685         if(newSrc != NULL) {
    686             src->current = newSrc + (src->current - src->source);
    687             src->extraCurrent = newSrc + (src->extraCurrent - src->source);
    688             src->end = newSrc + (src->end - src->source);
    689             src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
    690             src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
    691             src->source = newSrc;
    692         } else {
    693             *status = U_MEMORY_ALLOCATION_ERROR;
    694         }
    695     }
    696     if(len == 1) {
    697         *src->extraCurrent++ = *stuff;
    698     } else {
    699         uprv_memcpy(src->extraCurrent, stuff, len*sizeof(UChar));
    700         src->extraCurrent += len;
    701     }
    702 
    703 
    704 }
    705 
    706 inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
    707     /*
    708     top = TRUE;
    709     */
    710     UChar buff[5];
    711     src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
    712     buff[0] = 0xFFFE;
    713     buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
    714     buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
    715     if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
    716         src->parsedToken.charsLen = 3;
    717         ucol_tok_addToExtraCurrent(src, buff, 3, status);
    718     } else {
    719         buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
    720         buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
    721         src->parsedToken.charsLen = 5;
    722         ucol_tok_addToExtraCurrent(src, buff, 5, status);
    723     }
    724     return TRUE;
    725 }
    726 
    727 static UBool isCharNewLine(UChar c){
    728     switch(c){
    729     case 0x000A: /* LF  */
    730     case 0x000D: /* CR  */
    731     case 0x000C: /* FF  */
    732     case 0x0085: /* NEL */
    733     case 0x2028: /* LS  */
    734     case 0x2029: /* PS  */
    735         return TRUE;
    736     default:
    737         return FALSE;
    738     }
    739 }
    740 
    741 U_CAPI const UChar* U_EXPORT2
    742 ucol_tok_parseNextToken(UColTokenParser *src,
    743                         UBool startOfRules,
    744                         UParseError *parseError,
    745                         UErrorCode *status)
    746 {
    747     /* parsing part */
    748     UBool variableTop = FALSE;
    749     UBool top = FALSE;
    750     UBool inChars = TRUE;
    751     UBool inQuote = FALSE;
    752     UBool wasInQuote = FALSE;
    753     uint8_t before = 0;
    754     UBool isEscaped = FALSE;
    755     // TODO: replace these variables with src->parsedToken counterparts
    756     // no need to use them anymore since we have src->parsedToken.
    757     // Ideally, token parser would be a nice class... Once, when I have
    758     // more time (around 2020 probably).
    759     uint32_t newExtensionLen = 0;
    760     uint32_t extensionOffset = 0;
    761     uint32_t newStrength = UCOL_TOK_UNSET;
    762     UChar buff[10];
    763 
    764     src->parsedToken.charsOffset = 0;  src->parsedToken.charsLen = 0;
    765     src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
    766     src->parsedToken.indirectIndex = 0;
    767 
    768     while (src->current < src->end) {
    769         UChar ch = *(src->current);
    770 
    771         if (inQuote) {
    772             if (ch == 0x0027/*'\''*/) {
    773                 inQuote = FALSE;
    774             } else {
    775                 if ((src->parsedToken.charsLen == 0) || inChars) {
    776                     if(src->parsedToken.charsLen == 0) {
    777                         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
    778                     }
    779                     src->parsedToken.charsLen++;
    780                 } else {
    781                     if(newExtensionLen == 0) {
    782                         extensionOffset = (uint32_t)(src->extraCurrent - src->source);
    783                     }
    784                     newExtensionLen++;
    785                 }
    786             }
    787         }else if(isEscaped){
    788             isEscaped =FALSE;
    789             if (newStrength == UCOL_TOK_UNSET) {
    790                 *status = U_INVALID_FORMAT_ERROR;
    791                 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
    792                 return NULL;
    793                 // enabling rules to start with non-tokens a < b
    794                 // newStrength = UCOL_TOK_RESET;
    795             }
    796             if(ch != 0x0000  && src->current != src->end) {
    797                 if (inChars) {
    798                     if(src->parsedToken.charsLen == 0) {
    799                         src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
    800                     }
    801                     src->parsedToken.charsLen++;
    802                 } else {
    803                     if(newExtensionLen == 0) {
    804                         extensionOffset = (uint32_t)(src->current - src->source);
    805                     }
    806                     newExtensionLen++;
    807                 }
    808             }
    809         }else {
    810             if(!uprv_isRuleWhiteSpace(ch)) {
    811                 /* Sets the strength for this entry */
    812                 switch (ch) {
    813                 case 0x003D/*'='*/ :
    814                     if (newStrength != UCOL_TOK_UNSET) {
    815                         goto EndOfLoop;
    816                     }
    817 
    818                     /* if we start with strength, we'll reset to top */
    819                     if(startOfRules == TRUE) {
    820                         src->parsedToken.indirectIndex = 5;
    821                         top = ucol_tok_doSetTop(src, status);
    822                         newStrength = UCOL_TOK_RESET;
    823                         goto EndOfLoop;
    824                     }
    825                     newStrength = UCOL_IDENTICAL;
    826                     break;
    827 
    828                 case 0x002C/*','*/:
    829                     if (newStrength != UCOL_TOK_UNSET) {
    830                         goto EndOfLoop;
    831                     }
    832 
    833                     /* if we start with strength, we'll reset to top */
    834                     if(startOfRules == TRUE) {
    835                         src->parsedToken.indirectIndex = 5;
    836                         top = ucol_tok_doSetTop(src, status);
    837                         newStrength = UCOL_TOK_RESET;
    838                         goto EndOfLoop;
    839                     }
    840                     newStrength = UCOL_TERTIARY;
    841                     break;
    842 
    843                 case  0x003B/*';'*/:
    844                     if (newStrength != UCOL_TOK_UNSET) {
    845                         goto EndOfLoop;
    846                     }
    847 
    848                     /* if we start with strength, we'll reset to top */
    849                     if(startOfRules == TRUE) {
    850                         src->parsedToken.indirectIndex = 5;
    851                         top = ucol_tok_doSetTop(src, status);
    852                         newStrength = UCOL_TOK_RESET;
    853                         goto EndOfLoop;
    854                     }
    855                     newStrength = UCOL_SECONDARY;
    856                     break;
    857 
    858                 case 0x003C/*'<'*/:
    859                     if (newStrength != UCOL_TOK_UNSET) {
    860                         goto EndOfLoop;
    861                     }
    862 
    863                     /* if we start with strength, we'll reset to top */
    864                     if(startOfRules == TRUE) {
    865                         src->parsedToken.indirectIndex = 5;
    866                         top = ucol_tok_doSetTop(src, status);
    867                         newStrength = UCOL_TOK_RESET;
    868                         goto EndOfLoop;
    869                     }
    870                     /* before this, do a scan to verify whether this is */
    871                     /* another strength */
    872                     if(*(src->current+1) == 0x003C) {
    873                         src->current++;
    874                         if(*(src->current+1) == 0x003C) {
    875                             src->current++; /* three in a row! */
    876                             newStrength = UCOL_TERTIARY;
    877                         } else { /* two in a row */
    878                             newStrength = UCOL_SECONDARY;
    879                         }
    880                     } else { /* just one */
    881                         newStrength = UCOL_PRIMARY;
    882                     }
    883                     break;
    884 
    885                 case 0x0026/*'&'*/:
    886                     if (newStrength != UCOL_TOK_UNSET) {
    887                         /**/
    888                         goto EndOfLoop;
    889                     }
    890 
    891                     newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
    892                     break;
    893 
    894                 case 0x005b/*'['*/:
    895                     /* options - read an option, analyze it */
    896                     if(u_strchr(src->current, 0x005d /*']'*/) != NULL) {
    897                         uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
    898                         if(U_SUCCESS(*status)) {
    899                             if(result & UCOL_TOK_TOP) {
    900                                 if(newStrength == UCOL_TOK_RESET) {
    901                                     top = ucol_tok_doSetTop(src, status);
    902                                     if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
    903                                         src->parsedToken.charsLen+=2;
    904                                         buff[0] = 0x002d;
    905                                         buff[1] = before;
    906                                         ucol_tok_addToExtraCurrent(src, buff, 2, status);
    907                                     }
    908 
    909                                     src->current++;
    910                                     goto EndOfLoop;
    911                                 } else {
    912                                     *status = U_INVALID_FORMAT_ERROR;
    913                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
    914                                 }
    915                             } else if(result & UCOL_TOK_VARIABLE_TOP) {
    916                                 if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
    917                                     variableTop = TRUE;
    918                                     src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
    919                                     src->parsedToken.charsLen = 1;
    920                                     buff[0] = 0xFFFF;
    921                                     ucol_tok_addToExtraCurrent(src, buff, 1, status);
    922                                     src->current++;
    923                                     goto EndOfLoop;
    924                                 } else {
    925                                     *status = U_INVALID_FORMAT_ERROR;
    926                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
    927                                 }
    928                             } else if (result & UCOL_TOK_BEFORE){
    929                                 if(newStrength == UCOL_TOK_RESET) {
    930                                     before = result & UCOL_TOK_BEFORE;
    931                                 } else {
    932                                     *status = U_INVALID_FORMAT_ERROR;
    933                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
    934 
    935                                 }
    936                             }
    937                         } else {
    938                             *status = U_INVALID_FORMAT_ERROR;
    939                             syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
    940                             return NULL;
    941                         }
    942                     }
    943                     break;
    944                 case 0x0021/*! skip java thai modifier reordering*/:
    945                     break;
    946                 case 0x002F/*'/'*/:
    947                     wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
    948                     inChars = FALSE; /* we're now processing expansion */
    949                     break;
    950                 case 0x005C /* back slash for escaped chars */:
    951                     isEscaped = TRUE;
    952                     break;
    953                     /* found a quote, we're gonna start copying */
    954                 case 0x0027/*'\''*/:
    955                     if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
    956                         *status = U_INVALID_FORMAT_ERROR;
    957                         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
    958                         return NULL;
    959                         // enabling rules to start with a non-token character a < b
    960                         // newStrength = UCOL_TOK_RESET;
    961                     }
    962 
    963                     inQuote = TRUE;
    964 
    965                     if(inChars) { /* we're doing characters */
    966                         if(wasInQuote == FALSE) {
    967                             src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
    968                         }
    969                         if (src->parsedToken.charsLen != 0) {
    970                             ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
    971                         }
    972                         src->parsedToken.charsLen++;
    973                     } else { /* we're doing an expansion */
    974                         if(wasInQuote == FALSE) {
    975                             extensionOffset = (uint32_t)(src->extraCurrent - src->source);
    976                         }
    977                         if (newExtensionLen != 0) {
    978                             ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
    979                         }
    980                         newExtensionLen++;
    981                     }
    982 
    983                     wasInQuote = TRUE;
    984 
    985                     ch = *(++(src->current));
    986                     if(ch == 0x0027) { /* copy the double quote */
    987                         ucol_tok_addToExtraCurrent(src, &ch, 1, status);
    988                         inQuote = FALSE;
    989                     }
    990                     break;
    991 
    992                     /* '@' is french only if the strength is not currently set */
    993                     /* if it is, it's just a regular character in collation rules */
    994                 case 0x0040/*'@'*/:
    995                     if (newStrength == UCOL_TOK_UNSET) {
    996                         src->opts->frenchCollation = UCOL_ON;
    997                         break;
    998                     }
    999 
   1000                 case 0x007C /*|*/: /* this means we have actually been reading prefix part */
   1001                     // we want to store read characters to the prefix part and continue reading
   1002                     // the characters (proper way would be to restart reading the chars, but in
   1003                     // that case we would have to complicate the token hasher, which I do not
   1004                     // intend to play with. Instead, we will do prefixes when prefixes are due
   1005                     // (before adding the elements).
   1006                     src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
   1007                     src->parsedToken.prefixLen = src->parsedToken.charsLen;
   1008 
   1009                     if(inChars) { /* we're doing characters */
   1010                         if(wasInQuote == FALSE) {
   1011                             src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
   1012                         }
   1013                         if (src->parsedToken.charsLen != 0) {
   1014                             ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
   1015                         }
   1016                         src->parsedToken.charsLen++;
   1017                     }
   1018 
   1019                     wasInQuote = TRUE;
   1020 
   1021                     do {
   1022                         ch = *(++(src->current));
   1023                         // skip whitespace between '|' and the character
   1024                     } while (uprv_isRuleWhiteSpace(ch));
   1025                     break;
   1026 
   1027                     //charsOffset = 0;
   1028                     //newCharsLen = 0;
   1029                     //break; // We want to store the whole prefix/character sequence. If we break
   1030                     // the '|' is going to get lost.
   1031                 case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
   1032                     do {
   1033                         ch = *(++(src->current));
   1034                     } while (!isCharNewLine(ch));
   1035 
   1036                     break;
   1037                 default:
   1038                     if (newStrength == UCOL_TOK_UNSET) {
   1039                         *status = U_INVALID_FORMAT_ERROR;
   1040                         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1041                         return NULL;
   1042                     }
   1043 
   1044                     if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
   1045                         *status = U_INVALID_FORMAT_ERROR;
   1046                         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1047                         return NULL;
   1048                     }
   1049 
   1050                     if(ch == 0x0000 && src->current+1 == src->end) {
   1051                         break;
   1052                     }
   1053 
   1054                     if (inChars) {
   1055                         if(src->parsedToken.charsLen == 0) {
   1056                             src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
   1057                         }
   1058                         src->parsedToken.charsLen++;
   1059                     } else {
   1060                         if(newExtensionLen == 0) {
   1061                             extensionOffset = (uint32_t)(src->current - src->source);
   1062                         }
   1063                         newExtensionLen++;
   1064                     }
   1065 
   1066                     break;
   1067                 }
   1068             }
   1069         }
   1070 
   1071         if(wasInQuote) {
   1072             if(ch != 0x27) {
   1073                 if(inQuote || !uprv_isRuleWhiteSpace(ch)) {
   1074                     ucol_tok_addToExtraCurrent(src, &ch, 1, status);
   1075                 }
   1076             }
   1077         }
   1078 
   1079         src->current++;
   1080     }
   1081 
   1082 EndOfLoop:
   1083     wasInQuote = FALSE;
   1084     if (newStrength == UCOL_TOK_UNSET) {
   1085         return NULL;
   1086     }
   1087 
   1088     if (src->parsedToken.charsLen == 0 && top == FALSE) {
   1089         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
   1090         *status = U_INVALID_FORMAT_ERROR;
   1091         return NULL;
   1092     }
   1093 
   1094     src->parsedToken.strength = newStrength;
   1095     src->parsedToken.extensionOffset = extensionOffset;
   1096     src->parsedToken.extensionLen = newExtensionLen;
   1097     src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
   1098 
   1099     return src->current;
   1100 }
   1101 
   1102 /*
   1103 Processing Description
   1104 1 Build a ListList. Each list has a header, which contains two lists (positive
   1105 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
   1106 reset may be null.
   1107 2 As you process, you keep a LAST pointer that points to the last token you
   1108 handled.
   1109 */
   1110 
   1111 static UColToken *ucol_tok_initAReset(UColTokenParser *src, UChar *expand, uint32_t *expandNext,
   1112                                       UParseError *parseError, UErrorCode *status)
   1113 {
   1114     if(src->resultLen == src->listCapacity) {
   1115         // Unfortunately, this won't work, as we store addresses of lhs in token
   1116         src->listCapacity *= 2;
   1117         src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
   1118         if(src->lh == NULL) {
   1119             *status = U_MEMORY_ALLOCATION_ERROR;
   1120             return NULL;
   1121         }
   1122     }
   1123     /* do the reset thing */
   1124     UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
   1125     /* test for NULL */
   1126     if (sourceToken == NULL) {
   1127         *status = U_MEMORY_ALLOCATION_ERROR;
   1128         return NULL;
   1129     }
   1130     sourceToken->rulesToParse = src->source;
   1131     sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
   1132     sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
   1133 
   1134     sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
   1135     sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
   1136 
   1137     // keep the flags around so that we know about before
   1138     sourceToken->flags = src->parsedToken.flags;
   1139 
   1140     if(src->parsedToken.prefixOffset != 0) {
   1141         // this is a syntax error
   1142         *status = U_INVALID_FORMAT_ERROR;
   1143         syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
   1144         uprv_free(sourceToken);
   1145         return 0;
   1146     } else {
   1147         sourceToken->prefix = 0;
   1148     }
   1149 
   1150     sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
   1151     sourceToken->strength = UCOL_TOK_RESET;
   1152     sourceToken->next = NULL;
   1153     sourceToken->previous = NULL;
   1154     sourceToken->noOfCEs = 0;
   1155     sourceToken->noOfExpCEs = 0;
   1156     sourceToken->listHeader = &src->lh[src->resultLen];
   1157 
   1158     src->lh[src->resultLen].first = NULL;
   1159     src->lh[src->resultLen].last = NULL;
   1160     src->lh[src->resultLen].first = NULL;
   1161     src->lh[src->resultLen].last = NULL;
   1162 
   1163     src->lh[src->resultLen].reset = sourceToken;
   1164 
   1165     /*
   1166     3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
   1167     First convert all expansions into normal form. Examples:
   1168     If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
   1169     d * ... into &x * c/y * d * ...
   1170     Note: reset values can never have expansions, although they can cause the
   1171     very next item to have one. They may be contractions, if they are found
   1172     earlier in the list.
   1173     */
   1174     *expandNext = 0;
   1175     if(expand != NULL) {
   1176         /* check to see if there is an expansion */
   1177         if(src->parsedToken.charsLen > 1) {
   1178             uint32_t resetCharsOffset;
   1179             resetCharsOffset = (uint32_t)(expand - src->source);
   1180             sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
   1181             *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
   1182         }
   1183     }
   1184 
   1185     src->resultLen++;
   1186 
   1187     uhash_put(src->tailored, sourceToken, sourceToken, status);
   1188 
   1189     return sourceToken;
   1190 }
   1191 
   1192 static
   1193 inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
   1194     if(U_FAILURE(*status)) {
   1195         return NULL;
   1196     }
   1197     /* this is a virgin before - we need to fish the anchor from the UCA */
   1198     collIterate s;
   1199     uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
   1200     uint32_t CE, SecondCE;
   1201     uint32_t invPos;
   1202     if(sourceToken != NULL) {
   1203         uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s);
   1204     } else {
   1205         uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s);
   1206     }
   1207 
   1208     baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
   1209     baseContCE = ucol_getNextCE(src->UCA, &s, status);
   1210     if(baseContCE == UCOL_NO_MORE_CES) {
   1211         baseContCE = 0;
   1212     }
   1213 
   1214 
   1215     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
   1216     uint32_t ch = 0;
   1217     uint32_t expandNext = 0;
   1218     UColToken key;
   1219 
   1220     if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
   1221         uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
   1222         uint32_t raw = uprv_uca_getRawFromImplicit(primary);
   1223         ch = uprv_uca_getCodePointFromRaw(raw-1);
   1224         uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
   1225         CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
   1226         SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
   1227 
   1228         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
   1229         *src->extraCurrent++ = 0xFFFE;
   1230         *src->extraCurrent++ = (UChar)ch;
   1231         src->parsedToken.charsLen++;
   1232 
   1233         key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
   1234         key.rulesToParse = src->source;
   1235 
   1236         //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
   1237         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
   1238 
   1239         if(sourceToken == NULL) {
   1240             src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
   1241             if(isContinuation(SecondCE)) {
   1242                 src->lh[src->resultLen].baseContCE = SecondCE;
   1243             } else {
   1244                 src->lh[src->resultLen].baseContCE = 0;
   1245             }
   1246             src->lh[src->resultLen].nextCE = 0;
   1247             src->lh[src->resultLen].nextContCE = 0;
   1248             src->lh[src->resultLen].previousCE = 0;
   1249             src->lh[src->resultLen].previousContCE = 0;
   1250 
   1251             src->lh[src->resultLen].indirect = FALSE;
   1252 
   1253             sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
   1254         }
   1255 
   1256     } else {
   1257         invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
   1258 
   1259         // we got the previous CE. Now we need to see if the difference between
   1260         // the two CEs is really of the requested strength.
   1261         // if it's a bigger difference (we asked for secondary and got primary), we
   1262         // need to modify the CE.
   1263         if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
   1264             // adjust the strength
   1265             // now we are in the situation where our baseCE should actually be modified in
   1266             // order to get the CE in the right position.
   1267             if(strength == UCOL_SECONDARY) {
   1268                 CE = baseCE - 0x0200;
   1269             } else { // strength == UCOL_TERTIARY
   1270                 CE = baseCE - 0x02;
   1271             }
   1272             if(baseContCE) {
   1273                 if(strength == UCOL_SECONDARY) {
   1274                     SecondCE = baseContCE - 0x0200;
   1275                 } else { // strength == UCOL_TERTIARY
   1276                     SecondCE = baseContCE - 0x02;
   1277                 }
   1278             }
   1279         }
   1280 
   1281 #if 0
   1282         // the code below relies on getting a code point from the inverse table, in order to be
   1283         // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
   1284         // 1. There are many code points that have the same CE
   1285         // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
   1286         // Also, in case when there is no equivalent strength before an element, we have to actually
   1287         // construct one. For example, &[before 2]a << x won't result in x << a, because the element
   1288         // before a is a primary difference.
   1289 
   1290         //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
   1291 
   1292 
   1293         ch = CETable[3*invPos+2];
   1294 
   1295         if((ch &  UCOL_INV_SIZEMASK) != 0) {
   1296             uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
   1297             uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
   1298             ch = conts[offset];
   1299         }
   1300 
   1301         *src->extraCurrent++ = (UChar)ch;
   1302         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
   1303         src->parsedToken.charsLen = 1;
   1304 
   1305         // We got an UCA before. However, this might have been tailored.
   1306         // example:
   1307         // &\u30ca = \u306a
   1308         // &[before 3]\u306a<<<\u306a|\u309d
   1309 
   1310 
   1311         // uint32_t key = (*newCharsLen << 24) | *charsOffset;
   1312         key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
   1313         key.rulesToParse = src->source;
   1314 
   1315         //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
   1316         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
   1317 #endif
   1318 
   1319         // here is how it should be. The situation such as &[before 1]a < x, should be
   1320         // resolved exactly as if we wrote &a > x.
   1321         // therefore, I don't really care if the UCA value before a has been changed.
   1322         // However, I do care if the strength between my element and the previous element
   1323         // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
   1324         // have to construct the base CE.
   1325 
   1326 
   1327 
   1328         // if we found a tailored thing, we have to use the UCA value and construct
   1329         // a new reset token with constructed name
   1330         //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
   1331         // character to which we want to anchor is already tailored.
   1332         // We need to construct a new token which will be the anchor
   1333         // point
   1334         //*(src->extraCurrent-1) = 0xFFFE;
   1335         //*src->extraCurrent++ = (UChar)ch;
   1336         // grab before
   1337         src->parsedToken.charsOffset -= 10;
   1338         src->parsedToken.charsLen += 10;
   1339         src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
   1340         if(isContinuation(SecondCE)) {
   1341             src->lh[src->resultLen].baseContCE = SecondCE;
   1342         } else {
   1343             src->lh[src->resultLen].baseContCE = 0;
   1344         }
   1345         src->lh[src->resultLen].nextCE = 0;
   1346         src->lh[src->resultLen].nextContCE = 0;
   1347         src->lh[src->resultLen].previousCE = 0;
   1348         src->lh[src->resultLen].previousContCE = 0;
   1349 
   1350         src->lh[src->resultLen].indirect = FALSE;
   1351 
   1352         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
   1353         //}
   1354     }
   1355 
   1356     return sourceToken;
   1357 
   1358 }
   1359 
   /*
    * ucol_tok_assembleTokenList
    *
    * Drives the rule parser over src->current .. src->end and links every parsed
    * token into the per-reset token lists stored in src->lh.
    *
    * For each token produced by ucol_tok_parseNextToken():
    *   - a relation (strength != UCOL_TOK_RESET) is looked up in src->tailored,
    *     created and registered there if absent, otherwise unlinked from the list
    *     it currently sits in (unless it is a reset or is the previous token
    *     itself), and then inserted relative to the previous token (lastToken)
    *     in that token's list;
    *   - a reset either re-uses a token already present in src->tailored (shorter
    *     prefixes of the reset string are tried too, which sets expandNext), or
    *     fills in the src->lh[src->resultLen] entry and calls
    *     ucol_tok_initAReset(); the UCOL_TOK_BEFORE and UCOL_TOK_TOP flags get
    *     additional handling.
    *
    * @param src         parser state; src->lh, src->tailored, src->resultLen,
    *                    src->varTop and src->extraCurrent are updated here.
    * @param parseError  handed to the parser helpers and to syntaxError().
    * @param status      in/out error code. Set here to U_INVALID_FORMAT_ERROR when
    *                    a relation appears before any reset or when the strength
    *                    following a [before] reset differs from the strength
    *                    encoded in the before flag, and to
    *                    U_MEMORY_ALLOCATION_ERROR when a token cannot be allocated.
    * @return src->resultLen after dropping a trailing list whose 'first' is NULL;
    *         0 when status is a failure on entry or one of the explicit error
    *         returns inside the loop is taken.
    */
   1360 uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
   1361     UColToken *lastToken = NULL;
   1362     const UChar *parseEnd = NULL;
            // Implicit expansion carried from a reset to the tokens that follow it,
            // packed as (length << 24 | offset into src->source); 0 means none.
   1363     uint32_t expandNext = 0;
   1364     UBool variableTop = FALSE;
   1365     UBool top = FALSE;
   1366     uint16_t specs = 0;
   1367     UColTokListHeader *ListList = NULL;
   1368 
   1369     src->parsedToken.strength = UCOL_TOK_UNSET;
   1370 
            // NOTE(review): ListList aliases src->lh for the whole function. If a
            // callee (e.g. ucol_tok_initAReset, not visible here) ever reallocates
            // src->lh, this alias would go stale -- confirm.
   1371     ListList = src->lh;
   1372 
   1373     if(U_FAILURE(*status)) {
   1374         return 0;
   1375     }
   1376 
            // One iteration per parsed token.
   1377     while(src->current < src->end) {
   1378         src->parsedToken.prefixOffset = 0;
   1379 
                // The second argument is TRUE only while no token has been accepted
                // yet; presumably it tells the parser it is at the start of the
                // rules -- confirm against ucol_tok_parseNextToken.
   1380         parseEnd = ucol_tok_parseNextToken(src,
   1381             (UBool)(lastToken == NULL),
   1382             parseError,
   1383             status);
   1384 
   1385         specs = src->parsedToken.flags;
   1386 
   1387 
   1388         variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
   1389         top = ((specs & UCOL_TOK_TOP) != 0);
   1390 
   1391         if(U_SUCCESS(*status) && parseEnd != NULL) {
   1392             UColToken *sourceToken = NULL;
   1393             //uint32_t key = 0;
   1394             uint32_t lastStrength = UCOL_TOK_UNSET;
   1395 
   1396             if(lastToken != NULL ) {
   1397                 lastStrength = lastToken->strength;
   1398             }
   1399 
   1400             //key = newCharsLen << 24 | charsOffset;
                    // Stack-allocated lookup key: length in the top 8 bits, offset in the
                    // low 24. Only 'source' and 'rulesToParse' are initialized, which are
                    // the fields uhash_hashTokens reads.
   1401             UColToken key;
   1402             key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
   1403             key.rulesToParse = src->source;
   1404 
   1405             /*  4 Lookup each source in the CharsToToken map, and find a sourceToken */
   1406             sourceToken = (UColToken *)uhash_get(src->tailored, &key);
   1407 
   1408             if(src->parsedToken.strength != UCOL_TOK_RESET) {
   1409                 if(lastToken == NULL) { /* this means that rules haven't started properly */
   1410                     *status = U_INVALID_FORMAT_ERROR;
   1411                     syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
   1412                     return 0;
   1413                 }
   1414                 /*  6 Otherwise (when relation != reset) */
   1415                 if(sourceToken == NULL) {
   1416                     /* If sourceToken is null, create new one, */
   1417                     sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
   1418                     /* test for NULL */
   1419                     if (sourceToken == NULL) {
   1420                         *status = U_MEMORY_ALLOCATION_ERROR;
   1421                         return 0;
   1422                     }
   1423                     sourceToken->rulesToParse = src->source;
   1424                     sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
   1425 
   1426                     sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
   1427 
   1428                     sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
   1429                     sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
   1430 
   1431                     sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
   1432                     sourceToken->next = NULL;
   1433                     sourceToken->previous = NULL;
   1434                     sourceToken->noOfCEs = 0;
   1435                     sourceToken->noOfExpCEs = 0;
   1436                     // keep the flags around so that we know about before
   1437                     sourceToken->flags = src->parsedToken.flags;
                            // The token serves as both key and value; ucol_tok_initTokenList
                            // installs uhash_freeBlock as the table's value deleter.
   1438                     uhash_put(src->tailored, sourceToken, sourceToken, status);
   1439                     if(U_FAILURE(*status)) {
   1440                         return 0;
   1441                     }
   1442                 } else {
   1443                     /* we could have fished out a reset here */
   1444                     if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
   1445                         /* otherwise remove sourceToken from where it was. */
   1446                         if(sourceToken->next != NULL) {
                                    // The successor inherits the removed token's strength when
                                    // that strength value is numerically smaller.
   1447                             if(sourceToken->next->strength > sourceToken->strength) {
   1448                                 sourceToken->next->strength = sourceToken->strength;
   1449                             }
   1450                             sourceToken->next->previous = sourceToken->previous;
   1451                         } else {
   1452                             sourceToken->listHeader->last = sourceToken->previous;
   1453                         }
   1454 
   1455                         if(sourceToken->previous != NULL) {
   1456                             sourceToken->previous->next = sourceToken->next;
   1457                         } else {
   1458                             sourceToken->listHeader->first = sourceToken->next;
   1459                         }
   1460                         sourceToken->next = NULL;
   1461                         sourceToken->previous = NULL;
   1462                     }
   1463                 }
   1464 
   1465                 sourceToken->strength = src->parsedToken.strength;
   1466                 sourceToken->listHeader = lastToken->listHeader;
   1467 
   1468                 /*
   1469                 1.  Find the strongest strength in each list, and set strongestP and strongestN
   1470                 accordingly in the headers.
   1471                 */
   1472                 if(lastStrength == UCOL_TOK_RESET
   1473                     || sourceToken->listHeader->first == 0) {
   1474                         /* If LAST is a reset
   1475                         insert sourceToken in the list. */
   1476                         if(sourceToken->listHeader->first == 0) {
   1477                             sourceToken->listHeader->first = sourceToken;
   1478                             sourceToken->listHeader->last = sourceToken;
   1479                         } else { /* we need to find a place for us */
   1480                             /* and we'll get in front of the same strength */
   1481                             if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
   1482                                 sourceToken->next = sourceToken->listHeader->first;
   1483                                 sourceToken->next->previous = sourceToken;
   1484                                 sourceToken->listHeader->first = sourceToken;
   1485                                 sourceToken->previous = NULL;
   1486                             } else {
                                        // NOTE(review): lastToken is reused as a list cursor from
                                        // here on, so the UCOL_TOK_BEFORE check further down reads
                                        // the flags of the cursor position rather than those of the
                                        // previously parsed token -- confirm this is intended.
   1487                                 lastToken = sourceToken->listHeader->first;
   1488                                 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
   1489                                     lastToken = lastToken->next;
   1490                                 }
   1491                                 if(lastToken->next != NULL) {
   1492                                     lastToken->next->previous = sourceToken;
   1493                                 } else {
   1494                                     sourceToken->listHeader->last = sourceToken;
   1495                                 }
   1496                                 sourceToken->previous = lastToken;
   1497                                 sourceToken->next = lastToken->next;
   1498                                 lastToken->next = sourceToken;
   1499                             }
   1500                         }
   1501                     } else {
   1502                         /* Otherwise (when LAST is not a reset)
   1503                         if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
   1504                         otherwise insert before.
   1505                         when inserting after or before, search to the next position with the same
   1506                         strength in that direction. (This is called postpone insertion).         */
   1507                         if(sourceToken != lastToken) {
   1508                             if(lastToken->polarity == sourceToken->polarity) {
   1509                                 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
   1510                                     lastToken = lastToken->next;
   1511                                 }
   1512                                 sourceToken->previous = lastToken;
   1513                                 if(lastToken->next != NULL) {
   1514                                     lastToken->next->previous = sourceToken;
   1515                                 } else {
   1516                                     sourceToken->listHeader->last = sourceToken;
   1517                                 }
   1518 
   1519                                 sourceToken->next = lastToken->next;
   1520                                 lastToken->next = sourceToken;
   1521                             } else {
   1522                                 while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
   1523                                     lastToken = lastToken->previous;
   1524                                 }
   1525                                 sourceToken->next = lastToken;
   1526                                 if(lastToken->previous != NULL) {
   1527                                     lastToken->previous->next = sourceToken;
   1528                                 } else {
   1529                                     sourceToken->listHeader->first = sourceToken;
   1530                                 }
   1531                                 sourceToken->previous = lastToken->previous;
   1532                                 lastToken->previous = sourceToken;
   1533                             }
   1534                         } else { /* repeated one thing twice in rules, stay with the stronger strength */
   1535                             if(lastStrength < sourceToken->strength) {
   1536                                 sourceToken->strength = lastStrength;
   1537                             }
   1538                         }
   1539                     }
   1540 
   1541                     /* if the token was a variable top, we're gonna put it in */
                            // Only the first variable-top token is recorded; later ones are
                            // ignored because src->varTop is already set.
   1542                     if(variableTop == TRUE && src->varTop == NULL) {
   1543                         variableTop = FALSE;
   1544                         src->varTop = sourceToken;
   1545                     }
   1546 
   1547                     // Treat the expansions.
   1548                     // There are two types of expansions: explicit (x / y) and reset based propagating expansions
   1549                     // (&abc * d * e <=> &ab * d / c * e / c)
   1550                     // if both of them are in effect for a token, they are combined.
   1551 
   1552                     sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
   1553 
   1554                     if(expandNext != 0) {
   1555                         if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
   1556                             expandNext = 0;
   1557                         } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
   1558                             sourceToken->expansion = expandNext;
   1559                         } else { /* there is both explicit and implicit expansion. We need to make a combination */
                                    // The combined text (implicit part first, then the explicit
                                    // extension) is appended at src->extraCurrent and the token's
                                    // expansion is re-pointed at it.
                                    // NOTE(review): no check against src->extraEnd is visible in
                                    // this block before these copies -- confirm that enough extra
                                    // space is guaranteed elsewhere.
   1560                             uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
   1561                             uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
   1562                             sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
   1563                             src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
   1564                         }
   1565                     }
   1566 
   1567                     // This is just for debugging purposes
   1568                     if(sourceToken->expansion != 0) {
   1569                         sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
   1570                     } else {
   1571                         sourceToken->debugExpansion = 0;
   1572                     }
   1573                     // if the previous token was a reset before, the strength of this
   1574                     // token must match the strength of before. Otherwise we have an
   1575                     // undefined situation.
   1576                     // In other words, we currently have a cludge which we use to
   1577                     // represent &a >> x. This is written as &[before 2]a << x.
   1578                     if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
                                // The masked flag value minus one is compared with the token
                                // strength, i.e. the flag apparently stores strength + 1.
   1579                         uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
   1580                         if(beforeStrength != sourceToken->strength) {
   1581                             *status = U_INVALID_FORMAT_ERROR;
   1582                             syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
   1583                             return 0;
   1584                         }
   1585                     }
   1586             } else {
   1587                 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
   1588                     /* if the previous token was also a reset, */
   1589                     /*this means that we have two consecutive resets */
   1590                     /* and we want to remove the previous one if empty*/
   1591                     if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
   1592                         src->resultLen--;
   1593                     }
   1594                 }
   1595 
   1596                 if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
   1597                     uint32_t searchCharsLen = src->parsedToken.charsLen;
                            // Retry the lookup with successively shorter prefixes of the reset
                            // string, down to a single code unit.
   1598                     while(searchCharsLen > 1 && sourceToken == NULL) {
   1599                         searchCharsLen--;
   1600                         //key = searchCharsLen << 24 | charsOffset;
   1601                         UColToken key;
   1602                         key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
   1603                         key.rulesToParse = src->source;
   1604                         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
   1605                     }
   1606                     if(sourceToken != NULL) {
                                // The unmatched tail of the reset string becomes the implicit
                                // expansion: (tail length << 24 | tail offset).
   1607                         expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
   1608                     }
   1609                 }
   1610 
   1611                 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
   1612                     if(top == FALSE) { /* there is no indirection */
   1613                         uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
   1614                         if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
   1615                             /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
   1616                             while(sourceToken->strength > strength && sourceToken->previous != NULL) {
   1617                                 sourceToken = sourceToken->previous;
   1618                             }
   1619                             /* here, either we hit the strength or NULL */
   1620                             if(sourceToken->strength == strength) {
   1621                                 if(sourceToken->previous != NULL) {
   1622                                     sourceToken = sourceToken->previous;
   1623                                 } else { /* start of list */
   1624                                     sourceToken = sourceToken->listHeader->reset;
   1625                                 }
   1626                             } else { /* we hit NULL */
   1627                                 /* we should be doing the else part */
   1628                                 sourceToken = sourceToken->listHeader->reset;
   1629                                 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
   1630                             }
   1631                         } else {
   1632                             sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
   1633                         }
   1634                     } else { /* this is both before and indirection */
   1635                         top = FALSE;
   1636                         ListList[src->resultLen].previousCE = 0;
   1637                         ListList[src->resultLen].previousContCE = 0;
   1638                         ListList[src->resultLen].indirect = TRUE;
   1639                         /* we need to do slightly more work. we need to get the baseCE using the */
   1640                         /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
   1641                         /* in ucol_bld */
   1642                         uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
   1643                         uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
   1644                         uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
   1645                         uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
   1646 
   1647                         UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
                                // When the lead byte of baseCE falls inside the implicit primary
                                // range, the preceding CE is computed arithmetically (raw - 1)
                                // instead of being looked up through ucol_inv_getPrevCE.
   1648                         if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
                                    // Operator precedence makes this
                                    // (baseCE & MASK) | ((baseContCE & MASK) >> 16).
   1649                             uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
   1650                             uint32_t raw = uprv_uca_getRawFromImplicit(primary);
   1651                             uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
   1652                             CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
   1653                             SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
   1654                         } else {
   1655                             /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
   1656                             ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
   1657                         }
   1658 
   1659                         ListList[src->resultLen].baseCE = CE;
   1660                         ListList[src->resultLen].baseContCE = SecondCE;
   1661                         ListList[src->resultLen].nextCE = 0;
   1662                         ListList[src->resultLen].nextContCE = 0;
   1663 
   1664                         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
   1665                     }
   1666                 }
   1667 
   1668 
   1669                 /*  5 If the relation is a reset:
   1670                 If sourceToken is null
   1671                 Create new list, create new sourceToken, make the baseCE from source, put
   1672                 the sourceToken in ListHeader of the new list */
   1673                 if(sourceToken == NULL) {
   1674                     /*
   1675                     3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
   1676                     First convert all expansions into normal form. Examples:
   1677                     If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
   1678                     d * ... into &x * c/y * d * ...
   1679                     Note: reset values can never have expansions, although they can cause the
   1680                     very next item to have one. They may be contractions, if they are found
   1681                     earlier in the list.
   1682                     */
   1683                     if(top == FALSE) {
   1684                         collIterate s;
   1685                         uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
   1686 
   1687                         uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s);
   1688 
   1689                         CE = ucol_getNextCE(src->UCA, &s, status);
                                // Iterator position after the first CE; handed to
                                // ucol_tok_initAReset below as its second argument.
   1690                         UChar *expand = s.pos;
   1691                         SecondCE = ucol_getNextCE(src->UCA, &s, status);
   1692 
   1693                         ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
   1694                         if(isContinuation(SecondCE)) {
   1695                             ListList[src->resultLen].baseContCE = SecondCE;
   1696                         } else {
   1697                             ListList[src->resultLen].baseContCE = 0;
   1698                         }
   1699                         ListList[src->resultLen].nextCE = 0;
   1700                         ListList[src->resultLen].nextContCE = 0;
   1701                         ListList[src->resultLen].previousCE = 0;
   1702                         ListList[src->resultLen].previousContCE = 0;
   1703                         ListList[src->resultLen].indirect = FALSE;
   1704                         sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
   1705                     } else { /* top == TRUE */
   1706                         /* just use the supplied values */
   1707                         top = FALSE;
   1708                         ListList[src->resultLen].previousCE = 0;
   1709                         ListList[src->resultLen].previousContCE = 0;
   1710                         ListList[src->resultLen].indirect = TRUE;
   1711                         ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
   1712                         ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
   1713                         ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
   1714                         ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
   1715 
   1716                         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
   1717 
   1718                     }
   1719                 } else { /* reset to something already in rules */
   1720                     top = FALSE;
   1721                 }
   1722             }
   1723             /*  7 After all this, set LAST to point to sourceToken, and goto step 3. */
   1724             lastToken = sourceToken;
   1725         } else {
                    // Parser failure aborts; a successful parse that yielded no token
                    // (parseEnd == NULL) simply re-tests the loop condition.
   1726             if(U_FAILURE(*status)) {
   1727                 return 0;
   1728             }
   1729         }
   1730     }
   1731 
            // Drop a trailing list that never received a token.
   1732     if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
   1733         src->resultLen--;
   1734     }
   1735     return src->resultLen;
   1736 }
   1737 
   /*
    * ucol_tok_initTokenList
    *
    * Prepares a UColTokenParser for parsing a tailoring rule string.
    *
    * Steps, in order:
    *   1. zero *src so that a partially initialized parser can be released with
    *      ucol_tok_closeTokenList();
    *   2. scan the raw rules for '[' options and collect the Unicode sets of the
    *      OPTION_OPTIMIZE and OPTION_SUPPRESS_CONTRACTIONS options into
    *      src->copySet / src->removeSet (these must be read before normalization);
    *   3. NFD-normalize the rules into a freshly allocated src->source buffer,
    *      growing the buffer once if the first estimate is too small;
    *   4. set up the cursor fields, the src->tailored hashtable, a private copy
    *      of the UCA options (src->opts) and the list-header array (src->lh);
    *   5. fill the indirect boundary table from the UCA constants.
    *
    * @param src          parser state to initialize (overwritten).
    * @param rules        rule text, rulesLength code units.
    * @param rulesLength  number of UChars in rules.
    * @param UCA          collator whose options and image constants are read.
    * @param status       in/out error code; on failure the function returns early
    *                     and whatever was allocated so far stays referenced by src.
    *
    * NOTE(review): when *status is already a failure on entry, src is returned
    * untouched (not zeroed), so callers must not pass it to
    * ucol_tok_closeTokenList() unless they zeroed it themselves -- confirm callers.
    */
   1738 void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint32_t rulesLength, const UCollator *UCA, UErrorCode *status) {
   1739     U_NAMESPACE_USE
   1740 
   1741     uint32_t nSize = 0;
   1742     uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
            // Number of UChars actually allocated for src->source; differs from
            // estimatedSize once the buffer has been regrown below.
            uint32_t sourceCapacity = estimatedSize;
   1743     if(U_FAILURE(*status)) {
   1744         return;
   1745     }
   1746 
   1747     // set everything to zero, so that we can clean up gracefully
   1748     uprv_memset(src, 0, sizeof(UColTokenParser));
   1749 
   1750     // first we need to find options that don't like to be normalized,
   1751     // like copy and remove...
   1752     //const UChar *openBrace = rules;
   1753     int32_t optionNumber = -1;
   1754     const UChar *setStart = NULL;
   1755     uint32_t i = 0;
   1756     while(i < rulesLength) {
   1757         if(rules[i] == 0x005B) {
   1758             // while((openBrace = u_strchr(openBrace, 0x005B)) != NULL) { // find open braces
   1759             //optionNumber = ucol_uprv_tok_readOption(openBrace+1, rules+rulesLength, &setStart);
   1760             optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
   1761             if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
   1762                 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
   1763                 if(U_SUCCESS(*status)) {
                            // The first set is adopted; later ones are merged into it
                            // and then closed.
   1764                     if(src->copySet == NULL) {
   1765                         src->copySet = newSet;
   1766                     } else {
   1767                         uset_addAll(src->copySet, newSet);
   1768                         uset_close(newSet);
   1769                     }
   1770                 } else {
   1771                     return;
   1772                 }
   1773             } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
   1774                 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
   1775                 if(U_SUCCESS(*status)) {
   1776                     if(src->removeSet == NULL) {
   1777                         src->removeSet = newSet;
   1778                     } else {
   1779                         uset_addAll(src->removeSet, newSet);
   1780                         uset_close(newSet);
   1781                     }
   1782                 } else {
   1783                     return;
   1784                 }
   1785             }
   1786         }
   1787         //openBrace++;
   1788         i++;
   1789     }
   1790 
   1791     src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
   1792     /* test for NULL */
   1793     if (src->source == NULL) {
   1794         *status = U_MEMORY_ALLOCATION_ERROR;
   1795         return;
   1796     }
   1797     uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
   1798     nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
            // ">=": an exact fit leaves no room for the terminating zero (nor for
            // any extra space behind it), so it has to be treated as too small.
   1799     if(nSize >= estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
   1800         *status = U_ZERO_ERROR;
                sourceCapacity = nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
                // Go through a temporary so that the original buffer is still
                // referenced by src (and freed by ucol_tok_closeTokenList) when the
                // reallocation fails.
   1801         UChar *grownSource = (UChar *)uprv_realloc(src->source, sourceCapacity*sizeof(UChar));
   1802         /* test for NULL */
   1803         if (grownSource == NULL) {
   1804             *status = U_MEMORY_ALLOCATION_ERROR;
   1805             return;
   1806         }
                src->source = grownSource;
   1807         nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, sourceCapacity, status);
   1808     }
   1809     src->current = src->source;
   1810     src->end = src->source+nSize;
   1811     src->sourceCurrent = src->source;
   1812     src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
            // End of the real allocation, which is larger than estimatedSize after a regrow.
   1813     src->extraEnd = src->source+sourceCapacity; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
   1814     src->varTop = NULL;
   1815     src->UCA = UCA;
   1816     src->invUCA = ucol_initInverseUCA(status);
   1817     src->parsedToken.charsLen = 0;
   1818     src->parsedToken.charsOffset = 0;
   1819     src->parsedToken.extensionLen = 0;
   1820     src->parsedToken.extensionOffset = 0;
   1821     src->parsedToken.prefixLen = 0;
   1822     src->parsedToken.prefixOffset = 0;
   1823     src->parsedToken.flags = 0;
   1824     src->parsedToken.strength = UCOL_TOK_UNSET;
   1825     src->buildCCTabFlag = FALSE;
   1826 
            // Covers failures from the normalization and from ucol_initInverseUCA.
   1827     if(U_FAILURE(*status)) {
   1828         return;
   1829     }
   1830     src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
   1831     if(U_FAILURE(*status)) {
   1832         return;
   1833     }
            // Tokens are stored as values; closing the table releases them.
   1834     uhash_setValueDeleter(src->tailored, uhash_freeBlock);
   1835 
   1836     src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
   1837     /* test for NULL */
   1838     if (src->opts == NULL) {
   1839         *status = U_MEMORY_ALLOCATION_ERROR;
   1840         return;
   1841     }
   1842 
   1843     uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
   1844 
   1845     // rulesToParse = src->source;
   1846     src->lh = 0;
   1847     src->listCapacity = 1024;
   1848     src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
   1849     //Test for NULL
   1850     if (src->lh == NULL) {
   1851         *status = U_MEMORY_ALLOCATION_ERROR;
   1852         return;
   1853     }
   1854     uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
   1855     src->resultLen = 0;
   1856 
   1857     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
   1858 
            // NOTE(review): ucolIndirectBoundaries is not declared in this function;
            // if it is shared file-level state, every call rewrites it -- confirm
            // that concurrent initialization is not a concern.
   1859     // UCOL_RESET_TOP_VALUE
   1860     setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
   1861     // UCOL_FIRST_PRIMARY_IGNORABLE
   1862     setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
   1863     // UCOL_LAST_PRIMARY_IGNORABLE
   1864     setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
   1865     // UCOL_FIRST_SECONDARY_IGNORABLE
   1866     setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
   1867     // UCOL_LAST_SECONDARY_IGNORABLE
   1868     setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
   1869     // UCOL_FIRST_TERTIARY_IGNORABLE
   1870     setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
   1871     // UCOL_LAST_TERTIARY_IGNORABLE
   1872     setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
   1873     // UCOL_FIRST_VARIABLE
   1874     setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
   1875     // UCOL_LAST_VARIABLE
   1876     setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
   1877     // UCOL_FIRST_NON_VARIABLE
   1878     setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
   1879     // UCOL_LAST_NON_VARIABLE
   1880     setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
   1881     // UCOL_FIRST_IMPLICIT
   1882     setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
   1883     // UCOL_LAST_IMPLICIT
   1884     setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
   1885     // UCOL_FIRST_TRAILING
   1886     setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
   1887     // UCOL_LAST_TRAILING
   1888     setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
            // The last slot's limit is overridden with the first special primary lead byte.
   1889     ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
   1890 }
   1891 
   1892 
   /*
    * ucol_tok_closeTokenList
    *
    * Releases what the parser state refers to, skipping anything that is NULL:
    * the two Unicode sets (uset_close), the tailored-token hashtable
    * (uhash_close), and the list-header array, normalized rule buffer and option
    * copy (uprv_free). The fields of src are left as they are and src itself is
    * not freed.
    *
    * @param src  parser state whose members are released.
    */
   1893 void ucol_tok_closeTokenList(UColTokenParser *src) {
            /* option sets: copySet first, then removeSet */
   1894     USet *const ownedSets[2] = { src->copySet, src->removeSet };
   1895     for(int32_t s = 0; s < 2; ++s) {
   1896         if(ownedSets[s]) {
   1897             uset_close(ownedSets[s]);
   1898         }
   1899     }
   1900 
            /* token table */
   1901     if(src->tailored) {
   1902         uhash_close(src->tailored);
   1903     }
   1904 
            /* plain heap blocks: lh, source, opts -- in that order */
   1905     void *const heapBlocks[3] = { src->lh, src->source, src->opts };
   1906     for(int32_t b = 0; b < 3; ++b) {
   1907         if(heapBlocks[b]) {
   1908             uprv_free(heapBlocks[b]);
   1909         }
   1910     }
   1911 
   1912 }
   1913 
   1914 #endif /* #if !UCONFIG_NO_COLLATION */
   1915 
   1916