1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2001-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: ucol_tok.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created 02/22/2001 14 * created by: Vladimir Weinstein 15 * 16 * This module reads a tailoring rule string and produces a list of 17 * tokens that will be turned into collation elements 18 * 19 */ 20 21 #include "unicode/utypes.h" 22 23 #if !UCONFIG_NO_COLLATION 24 25 #include "unicode/uscript.h" 26 #include "unicode/ustring.h" 27 #include "unicode/uchar.h" 28 #include "unicode/uniset.h" 29 30 #include "cmemory.h" 31 #include "cstring.h" 32 #include "ucol_bld.h" 33 #include "ucol_tok.h" 34 #include "ulocimp.h" 35 #include "uresimp.h" 36 #include "util.h" 37 38 // Define this only for debugging. 39 // #define DEBUG_FOR_COLL_RULES 1 40 41 #ifdef DEBUG_FOR_COLL_RULES 42 #include <iostream> 43 #endif 44 45 U_NAMESPACE_USE 46 47 U_CDECL_BEGIN 48 static int32_t U_CALLCONV 49 uhash_hashTokens(const UHashTok k) 50 { 51 int32_t hash = 0; 52 //uint32_t key = (uint32_t)k.integer; 53 UColToken *key = (UColToken *)k.pointer; 54 if (key != 0) { 55 int32_t len = (key->source & 0xFF000000)>>24; 56 int32_t inc = ((len - 32) / 32) + 1; 57 58 const UChar *p = (key->source & 0x00FFFFFF) + *(key->rulesToParseHdl); 59 const UChar *limit = p + len; 60 61 while (p<limit) { 62 hash = (hash * 37) + *p; 63 p += inc; 64 } 65 } 66 return hash; 67 } 68 69 static UBool U_CALLCONV 70 uhash_compareTokens(const UHashTok key1, const UHashTok key2) 71 { 72 //uint32_t p1 = (uint32_t) key1.integer; 73 //uint32_t p2 = (uint32_t) key2.integer; 74 UColToken *p1 = (UColToken *)key1.pointer; 75 UColToken *p2 = (UColToken *)key2.pointer; 76 const UChar *s1 = (p1->source & 0x00FFFFFF) + *(p1->rulesToParseHdl); 77 const UChar *s2 = (p2->source & 0x00FFFFFF) + *(p2->rulesToParseHdl); 78 uint32_t s1L = ((p1->source & 0xFF000000) >> 24); 79 uint32_t s2L = ((p2->source & 0xFF000000) >> 24); 80 const UChar *end = s1+s1L-1; 81 82 if (p1 == p2) { 83 return TRUE; 84 } 85 if (p1->source == 0 || p2->source == 0) { 86 return FALSE; 87 } 88 if(s1L != s2L) { 89 return FALSE; 90 } 91 if(p1->source == p2->source) { 92 return TRUE; 93 } 94 while((s1 < end) && *s1 == *s2) { 95 ++s1; 96 ++s2; 97 } 98 if(*s1 == *s2) { 99 return TRUE; 100 } else { 101 return FALSE; 102 } 103 } 104 U_CDECL_END 105 106 /* 107 * Debug messages used to pinpoint where a format error occurred. 108 * A better way is to include context-sensitive information in syntaxError() function. 109 * 110 * To turn this debugging on, either uncomment the following line, or define use -DDEBUG_FOR_FORMAT_ERROR 111 * in the compile line. 112 */ 113 /* #define DEBUG_FOR_FORMAT_ERROR 1 */ 114 115 #ifdef DEBUG_FOR_FORMAT_ERROR 116 #define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);} 117 #else 118 #define DBG_FORMAT_ERROR 119 #endif 120 121 122 /* 123 * Controls debug messages so that the output can be compared before and after a 124 * big change. Prints the information of every code point that comes out of the 125 * collation parser and its strength into a file. When a big change in format 126 * happens, the files before and after the change should be identical. 127 * 128 * To turn this debugging on, either uncomment the following line, or define use -DDEBUG_FOR_CODE_POINTS 129 * in the compile line. 130 */ 131 // #define DEBUG_FOR_CODE_POINTS 1 132 133 #ifdef DEBUG_FOR_CODE_POINTS 134 FILE* dfcp_fp = NULL; 135 #endif 136 137 138 /*static inline void U_CALLCONV 139 uhash_freeBlockWrapper(void *obj) { 140 uhash_freeBlock(obj); 141 }*/ 142 143 144 typedef struct { 145 uint32_t startCE; 146 uint32_t startContCE; 147 uint32_t limitCE; 148 uint32_t limitContCE; 149 } indirectBoundaries; 150 151 /* these values are used for finding CE values for indirect positioning. */ 152 /* Indirect positioning is a mechanism for allowing resets on symbolic */ 153 /* values. It only works for resets and you cannot tailor indirect names */ 154 /* An indirect name can define either an anchor point or a range. An */ 155 /* anchor point behaves in exactly the same way as a code point in reset */ 156 /* would, except that it cannot be tailored. A range (we currently only */ 157 /* know for the [top] range will explicitly set the upper bound for */ 158 /* generated CEs, thus allowing for better control over how many CEs can */ 159 /* be squeezed between in the range without performance penalty. */ 160 /* In that respect, we use [top] for tailoring of locales that use CJK */ 161 /* characters. Other indirect values are currently a pure convenience, */ 162 /* they can be used to assure that the CEs will be always positioned in */ 163 /* the same place relative to a point with known properties (e.g. first */ 164 /* primary ignorable). */ 165 static indirectBoundaries ucolIndirectBoundaries[15]; 166 /* 167 static indirectBoundaries ucolIndirectBoundaries[11] = { 168 { UCOL_RESET_TOP_VALUE, 0, 169 UCOL_NEXT_TOP_VALUE, 0 }, 170 { UCOL_FIRST_PRIMARY_IGNORABLE, 0, 171 0, 0 }, 172 { UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT, 173 0, 0 }, 174 { UCOL_FIRST_SECONDARY_IGNORABLE, 0, 175 0, 0 }, 176 { UCOL_LAST_SECONDARY_IGNORABLE, 0, 177 0, 0 }, 178 { UCOL_FIRST_TERTIARY_IGNORABLE, 0, 179 0, 0 }, 180 { UCOL_LAST_TERTIARY_IGNORABLE, 0, 181 0, 0 }, 182 { UCOL_FIRST_VARIABLE, 0, 183 0, 0 }, 184 { UCOL_LAST_VARIABLE, 0, 185 0, 0 }, 186 { UCOL_FIRST_NON_VARIABLE, 0, 187 0, 0 }, 188 { UCOL_LAST_NON_VARIABLE, 0, 189 0, 0 }, 190 }; 191 */ 192 193 static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) { 194 195 // Set values for the top - TODO: once we have values for all the indirects, we are going 196 // to initalize here. 197 ucolIndirectBoundaries[indexR].startCE = start[0]; 198 ucolIndirectBoundaries[indexR].startContCE = start[1]; 199 if(end) { 200 ucolIndirectBoundaries[indexR].limitCE = end[0]; 201 ucolIndirectBoundaries[indexR].limitContCE = end[1]; 202 } else { 203 ucolIndirectBoundaries[indexR].limitCE = 0; 204 ucolIndirectBoundaries[indexR].limitContCE = 0; 205 } 206 } 207 208 209 static inline 210 void syntaxError(const UChar* rules, 211 int32_t pos, 212 int32_t rulesLen, 213 UParseError* parseError) 214 { 215 parseError->offset = pos; 216 parseError->line = 0 ; /* we are not using line numbers */ 217 218 // for pre-context 219 int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1)); 220 int32_t stop = pos; 221 222 u_memcpy(parseError->preContext,rules+start,stop-start); 223 //null terminate the buffer 224 parseError->preContext[stop-start] = 0; 225 226 //for post-context 227 start = pos+1; 228 stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) : 229 rulesLen; 230 231 if(start < stop) { 232 u_memcpy(parseError->postContext,rules+start,stop-start); 233 //null terminate the buffer 234 parseError->postContext[stop-start]= 0; 235 } else { 236 parseError->postContext[0] = 0; 237 } 238 } 239 240 static 241 void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) { 242 switch(attrib) { 243 case UCOL_HIRAGANA_QUATERNARY_MODE: 244 opts->hiraganaQ = value; 245 break; 246 case UCOL_FRENCH_COLLATION: 247 opts->frenchCollation = value; 248 break; 249 case UCOL_ALTERNATE_HANDLING: 250 opts->alternateHandling = value; 251 break; 252 case UCOL_CASE_FIRST: 253 opts->caseFirst = value; 254 break; 255 case UCOL_CASE_LEVEL: 256 opts->caseLevel = value; 257 break; 258 case UCOL_NORMALIZATION_MODE: 259 opts->normalizationMode = value; 260 break; 261 case UCOL_STRENGTH: 262 opts->strength = value; 263 break; 264 case UCOL_NUMERIC_COLLATION: 265 opts->numericCollation = value; 266 break; 267 case UCOL_ATTRIBUTE_COUNT: 268 default: 269 break; 270 } 271 } 272 273 #define UTOK_OPTION_COUNT 22 274 275 static UBool didInit = FALSE; 276 /* we can be strict, or we can be lenient */ 277 /* I'd surely be lenient with the option arguments */ 278 /* maybe even with options */ 279 U_STRING_DECL(suboption_00, "non-ignorable", 13); 280 U_STRING_DECL(suboption_01, "shifted", 7); 281 282 U_STRING_DECL(suboption_02, "lower", 5); 283 U_STRING_DECL(suboption_03, "upper", 5); 284 U_STRING_DECL(suboption_04, "off", 3); 285 U_STRING_DECL(suboption_05, "on", 2); 286 U_STRING_DECL(suboption_06, "1", 1); 287 U_STRING_DECL(suboption_07, "2", 1); 288 U_STRING_DECL(suboption_08, "3", 1); 289 U_STRING_DECL(suboption_09, "4", 1); 290 U_STRING_DECL(suboption_10, "I", 1); 291 292 U_STRING_DECL(suboption_11, "primary", 7); 293 U_STRING_DECL(suboption_12, "secondary", 9); 294 U_STRING_DECL(suboption_13, "tertiary", 8); 295 U_STRING_DECL(suboption_14, "variable", 8); 296 U_STRING_DECL(suboption_15, "regular", 7); 297 U_STRING_DECL(suboption_16, "implicit", 8); 298 U_STRING_DECL(suboption_17, "trailing", 8); 299 300 301 U_STRING_DECL(option_00, "undefined", 9); 302 U_STRING_DECL(option_01, "rearrange", 9); 303 U_STRING_DECL(option_02, "alternate", 9); 304 U_STRING_DECL(option_03, "backwards", 9); 305 U_STRING_DECL(option_04, "variable top", 12); 306 U_STRING_DECL(option_05, "top", 3); 307 U_STRING_DECL(option_06, "normalization", 13); 308 U_STRING_DECL(option_07, "caseLevel", 9); 309 U_STRING_DECL(option_08, "caseFirst", 9); 310 U_STRING_DECL(option_09, "scriptOrder", 11); 311 U_STRING_DECL(option_10, "charsetname", 11); 312 U_STRING_DECL(option_11, "charset", 7); 313 U_STRING_DECL(option_12, "before", 6); 314 U_STRING_DECL(option_13, "hiraganaQ", 9); 315 U_STRING_DECL(option_14, "strength", 8); 316 U_STRING_DECL(option_15, "first", 5); 317 U_STRING_DECL(option_16, "last", 4); 318 U_STRING_DECL(option_17, "optimize", 8); 319 U_STRING_DECL(option_18, "suppressContractions", 20); 320 U_STRING_DECL(option_19, "numericOrdering", 15); 321 U_STRING_DECL(option_20, "import", 6); 322 U_STRING_DECL(option_21, "reorder", 7); 323 324 /* 325 [last variable] last variable value 326 [last primary ignorable] largest CE for primary ignorable 327 [last secondary ignorable] largest CE for secondary ignorable 328 [last tertiary ignorable] largest CE for tertiary ignorable 329 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8) 330 */ 331 332 333 static const ucolTokSuboption alternateSub[2] = { 334 {suboption_00, 13, UCOL_NON_IGNORABLE}, 335 {suboption_01, 7, UCOL_SHIFTED} 336 }; 337 338 static const ucolTokSuboption caseFirstSub[3] = { 339 {suboption_02, 5, UCOL_LOWER_FIRST}, 340 {suboption_03, 5, UCOL_UPPER_FIRST}, 341 {suboption_04, 3, UCOL_OFF}, 342 }; 343 344 static const ucolTokSuboption onOffSub[2] = { 345 {suboption_04, 3, UCOL_OFF}, 346 {suboption_05, 2, UCOL_ON} 347 }; 348 349 static const ucolTokSuboption frenchSub[1] = { 350 {suboption_07, 1, UCOL_ON} 351 }; 352 353 static const ucolTokSuboption beforeSub[3] = { 354 {suboption_06, 1, UCOL_PRIMARY}, 355 {suboption_07, 1, UCOL_SECONDARY}, 356 {suboption_08, 1, UCOL_TERTIARY} 357 }; 358 359 static const ucolTokSuboption strengthSub[5] = { 360 {suboption_06, 1, UCOL_PRIMARY}, 361 {suboption_07, 1, UCOL_SECONDARY}, 362 {suboption_08, 1, UCOL_TERTIARY}, 363 {suboption_09, 1, UCOL_QUATERNARY}, 364 {suboption_10, 1, UCOL_IDENTICAL}, 365 }; 366 367 static const ucolTokSuboption firstLastSub[7] = { 368 {suboption_11, 7, UCOL_PRIMARY}, 369 {suboption_12, 9, UCOL_PRIMARY}, 370 {suboption_13, 8, UCOL_PRIMARY}, 371 {suboption_14, 8, UCOL_PRIMARY}, 372 {suboption_15, 7, UCOL_PRIMARY}, 373 {suboption_16, 8, UCOL_PRIMARY}, 374 {suboption_17, 8, UCOL_PRIMARY}, 375 }; 376 377 enum OptionNumber { 378 OPTION_ALTERNATE_HANDLING = 0, 379 OPTION_FRENCH_COLLATION, 380 OPTION_CASE_LEVEL, 381 OPTION_CASE_FIRST, 382 OPTION_NORMALIZATION_MODE, 383 OPTION_HIRAGANA_QUATERNARY, 384 OPTION_STRENGTH, 385 OPTION_NUMERIC_COLLATION, 386 OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION, 387 OPTION_VARIABLE_TOP, 388 OPTION_REARRANGE, 389 OPTION_BEFORE, 390 OPTION_TOP, 391 OPTION_FIRST, 392 OPTION_LAST, 393 OPTION_OPTIMIZE, 394 OPTION_SUPPRESS_CONTRACTIONS, 395 OPTION_UNDEFINED, 396 OPTION_SCRIPT_ORDER, 397 OPTION_CHARSET_NAME, 398 OPTION_CHARSET, 399 OPTION_IMPORT, 400 OPTION_SCRIPTREORDER 401 } ; 402 403 static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = { 404 /*00*/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */ 405 /*01*/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards" */ 406 /*02*/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /*"caseLevel" */ 407 /*03*/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst" */ 408 /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */ 409 /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */ 410 /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */ 411 /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /*"numericOrdering"*/ 412 /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */ 413 /*09*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */ 414 /*10*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */ 415 /*11*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */ 416 /*12*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */ 417 /*13*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */ 418 /*14*/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize" */ 419 /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions" */ 420 /*16*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */ 421 /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */ 422 /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */ 423 /*19*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charset" */ 424 /*20*/ {option_20, 6, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"import" */ 425 /*21*/ {option_21, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"reorder" */ 426 }; 427 428 static 429 int32_t u_strncmpNoCase(const UChar *s1, 430 const UChar *s2, 431 int32_t n) 432 { 433 if(n > 0) { 434 int32_t rc; 435 for(;;) { 436 rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2); 437 if(rc != 0 || *s1 == 0 || --n == 0) { 438 return rc; 439 } 440 ++s1; 441 ++s2; 442 } 443 } 444 return 0; 445 } 446 447 static 448 void ucol_uprv_tok_initData() { 449 if(!didInit) { 450 U_STRING_INIT(suboption_00, "non-ignorable", 13); 451 U_STRING_INIT(suboption_01, "shifted", 7); 452 453 U_STRING_INIT(suboption_02, "lower", 5); 454 U_STRING_INIT(suboption_03, "upper", 5); 455 U_STRING_INIT(suboption_04, "off", 3); 456 U_STRING_INIT(suboption_05, "on", 2); 457 458 U_STRING_INIT(suboption_06, "1", 1); 459 U_STRING_INIT(suboption_07, "2", 1); 460 U_STRING_INIT(suboption_08, "3", 1); 461 U_STRING_INIT(suboption_09, "4", 1); 462 U_STRING_INIT(suboption_10, "I", 1); 463 464 U_STRING_INIT(suboption_11, "primary", 7); 465 U_STRING_INIT(suboption_12, "secondary", 9); 466 U_STRING_INIT(suboption_13, "tertiary", 8); 467 U_STRING_INIT(suboption_14, "variable", 8); 468 U_STRING_INIT(suboption_15, "regular", 7); 469 U_STRING_INIT(suboption_16, "implicit", 8); 470 U_STRING_INIT(suboption_17, "trailing", 8); 471 472 473 U_STRING_INIT(option_00, "undefined", 9); 474 U_STRING_INIT(option_01, "rearrange", 9); 475 U_STRING_INIT(option_02, "alternate", 9); 476 U_STRING_INIT(option_03, "backwards", 9); 477 U_STRING_INIT(option_04, "variable top", 12); 478 U_STRING_INIT(option_05, "top", 3); 479 U_STRING_INIT(option_06, "normalization", 13); 480 U_STRING_INIT(option_07, "caseLevel", 9); 481 U_STRING_INIT(option_08, "caseFirst", 9); 482 U_STRING_INIT(option_09, "scriptOrder", 11); 483 U_STRING_INIT(option_10, "charsetname", 11); 484 U_STRING_INIT(option_11, "charset", 7); 485 U_STRING_INIT(option_12, "before", 6); 486 U_STRING_INIT(option_13, "hiraganaQ", 9); 487 U_STRING_INIT(option_14, "strength", 8); 488 U_STRING_INIT(option_15, "first", 5); 489 U_STRING_INIT(option_16, "last", 4); 490 U_STRING_INIT(option_17, "optimize", 8); 491 U_STRING_INIT(option_18, "suppressContractions", 20); 492 U_STRING_INIT(option_19, "numericOrdering", 15); 493 U_STRING_INIT(option_20, "import ", 6); 494 U_STRING_INIT(option_21, "reorder", 7); 495 didInit = TRUE; 496 } 497 } 498 499 500 // This function reads basic options to set in the runtime collator 501 // used by data driven tests. Should not support build time options 502 U_CAPI const UChar * U_EXPORT2 503 ucol_tok_getNextArgument(const UChar *start, const UChar *end, 504 UColAttribute *attrib, UColAttributeValue *value, 505 UErrorCode *status) 506 { 507 uint32_t i = 0; 508 int32_t j=0; 509 UBool foundOption = FALSE; 510 const UChar *optionArg = NULL; 511 512 ucol_uprv_tok_initData(); 513 514 while(start < end && (u_isWhitespace(*start) || uprv_isRuleWhiteSpace(*start))) { /* eat whitespace */ 515 start++; 516 } 517 if(start >= end) { 518 return NULL; 519 } 520 /* skip opening '[' */ 521 if(*start == 0x005b) { 522 start++; 523 } else { 524 *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '[' 525 return NULL; 526 } 527 528 while(i < UTOK_OPTION_COUNT) { 529 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) { 530 foundOption = TRUE; 531 if(end - start > rulesOptions[i].optionLen) { 532 optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */ 533 while(u_isWhitespace(*optionArg) || uprv_isRuleWhiteSpace(*optionArg)) { /* eat whitespace */ 534 optionArg++; 535 } 536 } 537 break; 538 } 539 i++; 540 } 541 542 if(!foundOption) { 543 *status = U_ILLEGAL_ARGUMENT_ERROR; 544 return NULL; 545 } 546 547 if(optionArg) { 548 for(j = 0; j<rulesOptions[i].subSize; j++) { 549 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { 550 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal); 551 *attrib = rulesOptions[i].attr; 552 *value = rulesOptions[i].subopts[j].attrVal; 553 optionArg += rulesOptions[i].subopts[j].subLen; 554 while(u_isWhitespace(*optionArg) || uprv_isRuleWhiteSpace(*optionArg)) { /* eat whitespace */ 555 optionArg++; 556 } 557 if(*optionArg == 0x005d) { 558 optionArg++; 559 return optionArg; 560 } else { 561 *status = U_ILLEGAL_ARGUMENT_ERROR; 562 return NULL; 563 } 564 } 565 } 566 } 567 *status = U_ILLEGAL_ARGUMENT_ERROR; 568 return NULL; 569 } 570 571 static 572 USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) { 573 while(*start != 0x005b) { /* advance while we find the first '[' */ 574 start++; 575 } 576 // now we need to get a balanced set of '[]'. The problem is that a set can have 577 // many, and *end point to the first closing '[' 578 int32_t noOpenBraces = 1; 579 int32_t current = 1; // skip the opening brace 580 while(start+current < end && noOpenBraces != 0) { 581 if(start[current] == 0x005b) { 582 noOpenBraces++; 583 } else if(start[current] == 0x005D) { // closing brace 584 noOpenBraces--; 585 } 586 current++; 587 } 588 589 if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) { 590 *status = U_ILLEGAL_ARGUMENT_ERROR; 591 return NULL; 592 } 593 return uset_openPattern(start, current, status); 594 } 595 596 /** 597 * Reads an option and matches the option name with the predefined options. (Case-insensitive.) 598 * @param start Pointer to the start UChar. 599 * @param end Pointer to the last valid pointer beyond which the option will not extend. 600 * @param optionArg Address of the pointer at which the options start (after the option name) 601 * @return The index of the option, or -1 if the option is not valid. 602 */ 603 static 604 int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) { 605 int32_t i = 0; 606 ucol_uprv_tok_initData(); 607 608 while(u_isWhitespace(*start) || uprv_isRuleWhiteSpace(*start)) { /* eat whitespace */ 609 start++; 610 } 611 while(i < UTOK_OPTION_COUNT) { 612 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) { 613 if(end - start > rulesOptions[i].optionLen) { 614 *optionArg = start+rulesOptions[i].optionLen; /* End of option name; start of the options */ 615 while(u_isWhitespace(**optionArg) || uprv_isRuleWhiteSpace(**optionArg)) { /* eat whitespace */ 616 (*optionArg)++; 617 } 618 } 619 break; 620 } 621 i++; 622 } 623 if(i == UTOK_OPTION_COUNT) { 624 i = -1; // didn't find an option 625 } 626 return i; 627 } 628 629 630 static 631 void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status) { 632 int32_t codeCount = 0; 633 int32_t codeIndex = 0; 634 char conversion[64]; 635 int32_t tokenLength = 0; 636 const UChar* space; 637 638 const UChar* current = src->current; 639 const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current); 640 641 // eat leading whitespace 642 while(current < end && u_isWhitespace(*current)) { 643 current++; 644 } 645 646 while(current < end) { 647 space = u_memchr(current, 0x0020, end - current); 648 space = space == 0 ? end : space; 649 tokenLength = space - current; 650 if (tokenLength < 4) { 651 *status = U_INVALID_FORMAT_ERROR; 652 return; 653 } 654 codeCount++; 655 current += tokenLength; 656 while(current < end && u_isWhitespace(*current)) { /* eat whitespace */ 657 ++current; 658 } 659 } 660 661 if (codeCount == 0) { 662 *status = U_INVALID_FORMAT_ERROR; 663 } 664 665 src->reorderCodesLength = codeCount; 666 src->reorderCodes = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t)); 667 current = src->current; 668 669 // eat leading whitespace 670 while(current < end && u_isWhitespace(*current)) { 671 current++; 672 } 673 674 while(current < end) { 675 space = u_memchr(current, 0x0020, end - current); 676 space = space == 0 ? end : space; 677 tokenLength = space - current; 678 if (tokenLength < 4) { 679 *status = U_ILLEGAL_ARGUMENT_ERROR; 680 return; 681 } else { 682 u_UCharsToChars(current, conversion, tokenLength); 683 conversion[tokenLength] = '\0'; 684 src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion); 685 if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) { 686 src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion); 687 } 688 if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) { 689 *status = U_ILLEGAL_ARGUMENT_ERROR; 690 } 691 } 692 codeIndex++; 693 current += tokenLength; 694 while(current < end && u_isWhitespace(*current)) { /* eat whitespace */ 695 ++current; 696 } 697 } 698 } 699 700 // reads and conforms to various options in rules 701 // end is the position of the first closing ']' 702 // However, some of the options take an UnicodeSet definition 703 // which needs to duplicate the closing ']' 704 // for example: '[copy [\uAC00-\uD7FF]]' 705 // These options will move end to the second ']' and the 706 // caller will set the current to it. 707 static 708 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) { 709 const UChar* start = src->current; 710 int32_t i = 0; 711 int32_t j=0; 712 const UChar *optionArg = NULL; 713 714 uint8_t result = 0; 715 716 start++; /*skip opening '['*/ 717 i = ucol_uprv_tok_readOption(start, src->end, &optionArg); 718 if(optionArg) { 719 src->current = optionArg; 720 } 721 722 if(i < 0) { 723 *status = U_ILLEGAL_ARGUMENT_ERROR; 724 } else { 725 int32_t noOpenBraces = 1; 726 switch(i) { 727 case OPTION_ALTERNATE_HANDLING: 728 case OPTION_FRENCH_COLLATION: 729 case OPTION_CASE_LEVEL: 730 case OPTION_CASE_FIRST: 731 case OPTION_NORMALIZATION_MODE: 732 case OPTION_HIRAGANA_QUATERNARY: 733 case OPTION_STRENGTH: 734 case OPTION_NUMERIC_COLLATION: 735 if(optionArg) { 736 for(j = 0; j<rulesOptions[i].subSize; j++) { 737 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { 738 ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal); 739 result = UCOL_TOK_SUCCESS; 740 } 741 } 742 } 743 if(result == 0) { 744 *status = U_ILLEGAL_ARGUMENT_ERROR; 745 } 746 break; 747 case OPTION_VARIABLE_TOP: 748 result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP; 749 break; 750 case OPTION_REARRANGE: 751 result = UCOL_TOK_SUCCESS; 752 break; 753 case OPTION_BEFORE: 754 if(optionArg) { 755 for(j = 0; j<rulesOptions[i].subSize; j++) { 756 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { 757 result = UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attrVal + 1); 758 } 759 } 760 } 761 if(result == 0) { 762 *status = U_ILLEGAL_ARGUMENT_ERROR; 763 } 764 break; 765 case OPTION_TOP: /* we are going to have an array with structures of limit CEs */ 766 /* index to this array will be src->parsedToken.indirectIndex*/ 767 src->parsedToken.indirectIndex = 0; 768 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP; 769 break; 770 case OPTION_FIRST: 771 case OPTION_LAST: /* first, last */ 772 for(j = 0; j<rulesOptions[i].subSize; j++) { 773 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { 774 // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first 775 // element of indirect boundaries is reserved for top. 776 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2); 777 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;; 778 } 779 } 780 if(result == 0) { 781 *status = U_ILLEGAL_ARGUMENT_ERROR; 782 } 783 break; 784 case OPTION_OPTIMIZE: 785 case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before normalization 786 // we need to move end here 787 src->current++; // skip opening brace 788 while(src->current < src->end && noOpenBraces != 0) { 789 if(*src->current == 0x005b) { 790 noOpenBraces++; 791 } else if(*src->current == 0x005D) { // closing brace 792 noOpenBraces--; 793 } 794 src->current++; 795 } 796 result = UCOL_TOK_SUCCESS; 797 break; 798 case OPTION_SCRIPTREORDER: 799 ucol_tok_parseScriptReorder(src, status); 800 break; 801 default: 802 *status = U_UNSUPPORTED_ERROR; 803 break; 804 } 805 } 806 src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current)); 807 return result; 808 } 809 810 811 inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) { 812 if (stuff == NULL || len <= 0) { 813 return; 814 } 815 UnicodeString tempStuff(FALSE, stuff, len); 816 if(src->extraCurrent+len >= src->extraEnd) { 817 /* reallocate */ 818 if (stuff >= src->source && stuff <= src->end) { 819 // Copy the "stuff" contents into tempStuff's own buffer. 820 // UnicodeString is copy-on-write. 821 if (len > 0) { 822 tempStuff.setCharAt(0, tempStuff[0]); 823 } else { 824 tempStuff.remove(); 825 } 826 } 827 UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar)); 828 if(newSrc != NULL) { 829 src->current = newSrc + (src->current - src->source); 830 src->extraCurrent = newSrc + (src->extraCurrent - src->source); 831 src->end = newSrc + (src->end - src->source); 832 src->extraEnd = newSrc + (src->extraEnd-src->source)*2; 833 src->sourceCurrent = newSrc + (src->sourceCurrent-src->source); 834 src->source = newSrc; 835 } else { 836 *status = U_MEMORY_ALLOCATION_ERROR; 837 return; 838 } 839 } 840 if(len == 1) { 841 *src->extraCurrent++ = tempStuff[0]; 842 } else { 843 u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len); 844 src->extraCurrent += len; 845 } 846 } 847 848 inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) { 849 /* 850 top = TRUE; 851 */ 852 UChar buff[5]; 853 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); 854 buff[0] = 0xFFFE; 855 buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16); 856 buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF); 857 if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) { 858 src->parsedToken.charsLen = 3; 859 ucol_tok_addToExtraCurrent(src, buff, 3, status); 860 } else { 861 buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16); 862 buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF); 863 src->parsedToken.charsLen = 5; 864 ucol_tok_addToExtraCurrent(src, buff, 5, status); 865 } 866 return TRUE; 867 } 868 869 static UBool isCharNewLine(UChar c){ 870 switch(c){ 871 case 0x000A: /* LF */ 872 case 0x000D: /* CR */ 873 case 0x000C: /* FF */ 874 case 0x0085: /* NEL */ 875 case 0x2028: /* LS */ 876 case 0x2029: /* PS */ 877 return TRUE; 878 default: 879 return FALSE; 880 } 881 } 882 883 /* 884 * This function is called several times when a range is processed. Each time, the next code point 885 * is processed. 886 * The following variables must be set before calling this function: 887 * src->currentRangeCp: The current code point to process. 888 * src->lastRangeCp: The last code point in the range. 889 * Pre-requisite: src->currentRangeCp <= src->lastRangeCp. 890 */ 891 static const UChar* 892 ucol_tok_processNextCodePointInRange(UColTokenParser *src, 893 UErrorCode *status) 894 { 895 // Append current code point to source 896 UChar buff[U16_MAX_LENGTH]; 897 uint32_t i = 0; 898 899 uint32_t nChars = U16_LENGTH(src->currentRangeCp); 900 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); 901 src->parsedToken.charsLen = nChars; 902 903 U16_APPEND_UNSAFE(buff, i, src->currentRangeCp); 904 ucol_tok_addToExtraCurrent(src, buff, nChars, status); 905 906 ++src->currentRangeCp; 907 if (src->currentRangeCp > src->lastRangeCp) { 908 src->inRange = FALSE; 909 910 if (src->currentStarredCharIndex > src->lastStarredCharIndex) { 911 src->isStarred = FALSE; 912 } 913 } else { 914 src->previousCp = src->currentRangeCp; 915 } 916 return src->current; 917 } 918 919 /* 920 * This function is called several times when a starred list is processed. Each time, the next code point 921 * in the list is processed. 922 * The following variables must be set before calling this function: 923 * src->currentStarredCharIndex: Index (in src->source) of the first char of the current code point. 924 * src->lastStarredCharIndex: Index to the last character in the list. 925 * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex. 926 */ 927 static const UChar* 928 ucol_tok_processNextTokenInStarredList(UColTokenParser *src) 929 { 930 // Extract the characters corresponding to the next code point. 931 UChar32 cp; 932 src->parsedToken.charsOffset = src->currentStarredCharIndex; 933 int32_t prev = src->currentStarredCharIndex; 934 U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src->source), cp); 935 src->parsedToken.charsLen = src->currentStarredCharIndex - prev; 936 937 // When we are done parsing the starred string, turn the flag off so that 938 // the normal processing is restored. 939 if (src->currentStarredCharIndex > src->lastStarredCharIndex) { 940 src->isStarred = FALSE; 941 } 942 src->previousCp = cp; 943 return src->current; 944 } 945 946 /* 947 * Partially parses the next token, keeps the indices in src->parsedToken, and updates the counters. 948 * 949 * This routine parses and separates almost all tokens. The following are the syntax characters recognized. 950 * # : Comment character 951 * & : Reset operator 952 * = : Equality 953 * < : Primary collation 954 * << : Secondary collation 955 * <<< : Tertiary collation 956 * ; : Secondary collation 957 * , : Tertiary collation 958 * / : Expansions 959 * | : Prefix 960 * - : Range 961 962 * ! : Java Thai modifier, ignored 963 * @ : French only 964 965 * [] : Options 966 * '' : Quotes 967 * 968 * Along with operators =, <, <<, <<<, the operator * is supported to indicate a list. For example, &a<*bcdexyz 969 * is equivalent to &a<b<c<d<e<x<y<z. In lists, ranges also can be given, so &a*b-ex-z is equivalent to the above. 970 * This function do not separate the tokens in a list. Instead, &a<*b-ex-z is parsed as three tokens - "&a", 971 * "<*b", "-ex", "-z". The strength (< in this case), whether in a list, whether in a range and the previous 972 * character returned as cached so that the calling program can do further splitting. 973 */ 974 static const UChar* 975 ucol_tok_parseNextTokenInternal(UColTokenParser *src, 976 UBool startOfRules, 977 UParseError *parseError, 978 UErrorCode *status) 979 { 980 UBool variableTop = FALSE; 981 UBool top = FALSE; 982 UBool inChars = TRUE; 983 UBool inQuote = FALSE; 984 UBool wasInQuote = FALSE; 985 uint8_t before = 0; 986 UBool isEscaped = FALSE; 987 988 // TODO: replace these variables with src->parsedToken counterparts 989 // no need to use them anymore since we have src->parsedToken. 990 // Ideally, token parser would be a nice class... Once, when I have 991 // more time (around 2020 probably). 992 uint32_t newExtensionLen = 0; 993 uint32_t extensionOffset = 0; 994 uint32_t newStrength = UCOL_TOK_UNSET; 995 UChar buff[10]; 996 997 src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0; 998 src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0; 999 src->parsedToken.indirectIndex = 0; 1000 1001 while (src->current < src->end) { 1002 UChar ch = *(src->current); 1003 1004 if (inQuote) { 1005 if (ch == 0x0027/*'\''*/) { 1006 inQuote = FALSE; 1007 } else { 1008 if ((src->parsedToken.charsLen == 0) || inChars) { 1009 if(src->parsedToken.charsLen == 0) { 1010 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); 1011 } 1012 src->parsedToken.charsLen++; 1013 } else { 1014 if(newExtensionLen == 0) { 1015 extensionOffset = (uint32_t)(src->extraCurrent - src->source); 1016 } 1017 newExtensionLen++; 1018 } 1019 } 1020 }else if(isEscaped){ 1021 isEscaped =FALSE; 1022 if (newStrength == UCOL_TOK_UNSET) { 1023 *status = U_INVALID_FORMAT_ERROR; 1024 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1025 DBG_FORMAT_ERROR 1026 return NULL; 1027 // enabling rules to start with non-tokens a < b 1028 // newStrength = UCOL_TOK_RESET; 1029 } 1030 if(ch != 0x0000 && src->current != src->end) { 1031 if (inChars) { 1032 if(src->parsedToken.charsLen == 0) { 1033 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source); 1034 } 1035 src->parsedToken.charsLen++; 1036 } else { 1037 if(newExtensionLen == 0) { 1038 extensionOffset = (uint32_t)(src->current - src->source); 1039 } 1040 newExtensionLen++; 1041 } 1042 } 1043 }else { 1044 if(!uprv_isRuleWhiteSpace(ch)) { 1045 /* Sets the strength for this entry */ 1046 switch (ch) { 1047 case 0x003D/*'='*/ : 1048 if (newStrength != UCOL_TOK_UNSET) { 1049 goto EndOfLoop; 1050 } 1051 1052 /* if we start with strength, we'll reset to top */ 1053 if(startOfRules == TRUE) { 1054 src->parsedToken.indirectIndex = 5; 1055 top = ucol_tok_doSetTop(src, status); 1056 newStrength = UCOL_TOK_RESET; 1057 goto EndOfLoop; 1058 } 1059 newStrength = UCOL_IDENTICAL; 1060 if(*(src->current+1) == 0x002A) {/*'*'*/ 1061 src->current++; 1062 src->isStarred = TRUE; 1063 } 1064 break; 1065 1066 case 0x002C/*','*/: 1067 if (newStrength != UCOL_TOK_UNSET) { 1068 goto EndOfLoop; 1069 } 1070 1071 /* if we start with strength, we'll reset to top */ 1072 if(startOfRules == TRUE) { 1073 src->parsedToken.indirectIndex = 5; 1074 top = ucol_tok_doSetTop(src, status); 1075 newStrength = UCOL_TOK_RESET; 1076 goto EndOfLoop; 1077 } 1078 newStrength = UCOL_TERTIARY; 1079 break; 1080 1081 case 0x003B/*';'*/: 1082 if (newStrength != UCOL_TOK_UNSET) { 1083 goto EndOfLoop; 1084 } 1085 1086 /* if we start with strength, we'll reset to top */ 1087 if(startOfRules == TRUE) { 1088 src->parsedToken.indirectIndex = 5; 1089 top = ucol_tok_doSetTop(src, status); 1090 newStrength = UCOL_TOK_RESET; 1091 goto EndOfLoop; 1092 } 1093 newStrength = UCOL_SECONDARY; 1094 break; 1095 1096 case 0x003C/*'<'*/: 1097 if (newStrength != UCOL_TOK_UNSET) { 1098 goto EndOfLoop; 1099 } 1100 1101 /* if we start with strength, we'll reset to top */ 1102 if(startOfRules == TRUE) { 1103 src->parsedToken.indirectIndex = 5; 1104 top = ucol_tok_doSetTop(src, status); 1105 newStrength = UCOL_TOK_RESET; 1106 goto EndOfLoop; 1107 } 1108 /* before this, do a scan to verify whether this is */ 1109 /* another strength */ 1110 if(*(src->current+1) == 0x003C) { 1111 src->current++; 1112 if(*(src->current+1) == 0x003C) { 1113 src->current++; /* three in a row! */ 1114 newStrength = UCOL_TERTIARY; 1115 } else { /* two in a row */ 1116 newStrength = UCOL_SECONDARY; 1117 } 1118 } else { /* just one */ 1119 newStrength = UCOL_PRIMARY; 1120 } 1121 if(*(src->current+1) == 0x002A) {/*'*'*/ 1122 src->current++; 1123 src->isStarred = TRUE; 1124 } 1125 break; 1126 1127 case 0x0026/*'&'*/: 1128 if (newStrength != UCOL_TOK_UNSET) { 1129 /**/ 1130 goto EndOfLoop; 1131 } 1132 1133 newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */ 1134 break; 1135 1136 case 0x005b/*'['*/: 1137 /* options - read an option, analyze it */ 1138 if(u_strchr(src->current, 0x005d /*']'*/) != NULL) { 1139 uint8_t result = ucol_uprv_tok_readAndSetOption(src, status); 1140 if(U_SUCCESS(*status)) { 1141 if(result & UCOL_TOK_TOP) { 1142 if(newStrength == UCOL_TOK_RESET) { 1143 top = ucol_tok_doSetTop(src, status); 1144 if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b' 1145 src->parsedToken.charsLen+=2; 1146 buff[0] = 0x002d; 1147 buff[1] = before; 1148 ucol_tok_addToExtraCurrent(src, buff, 2, status); 1149 } 1150 1151 src->current++; 1152 goto EndOfLoop; 1153 } else { 1154 *status = U_INVALID_FORMAT_ERROR; 1155 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1156 DBG_FORMAT_ERROR 1157 } 1158 } else if(result & UCOL_TOK_VARIABLE_TOP) { 1159 if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) { 1160 variableTop = TRUE; 1161 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); 1162 src->parsedToken.charsLen = 1; 1163 buff[0] = 0xFFFF; 1164 ucol_tok_addToExtraCurrent(src, buff, 1, status); 1165 src->current++; 1166 goto EndOfLoop; 1167 } else { 1168 *status = U_INVALID_FORMAT_ERROR; 1169 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1170 DBG_FORMAT_ERROR 1171 } 1172 } else if (result & UCOL_TOK_BEFORE){ 1173 if(newStrength == UCOL_TOK_RESET) { 1174 before = result & UCOL_TOK_BEFORE; 1175 } else { 1176 *status = U_INVALID_FORMAT_ERROR; 1177 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1178 DBG_FORMAT_ERROR 1179 } 1180 } 1181 } else { 1182 *status = U_INVALID_FORMAT_ERROR; 1183 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1184 DBG_FORMAT_ERROR 1185 return NULL; 1186 } 1187 } 1188 break; 1189 case 0x0021/*! skip java thai modifier reordering*/: 1190 break; 1191 case 0x002F/*'/'*/: 1192 wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */ 1193 inChars = FALSE; /* we're now processing expansion */ 1194 break; 1195 case 0x005C /* back slash for escaped chars */: 1196 isEscaped = TRUE; 1197 break; 1198 /* found a quote, we're gonna start copying */ 1199 case 0x0027/*'\''*/: 1200 if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */ 1201 *status = U_INVALID_FORMAT_ERROR; 1202 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1203 DBG_FORMAT_ERROR 1204 return NULL; 1205 // enabling rules to start with a non-token character a < b 1206 // newStrength = UCOL_TOK_RESET; 1207 } 1208 1209 inQuote = TRUE; 1210 1211 if(inChars) { /* we're doing characters */ 1212 if(wasInQuote == FALSE) { 1213 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); 1214 } 1215 if (src->parsedToken.charsLen != 0) { 1216 ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status); 1217 } 1218 src->parsedToken.charsLen++; 1219 } else { /* we're doing an expansion */ 1220 if(wasInQuote == FALSE) { 1221 extensionOffset = (uint32_t)(src->extraCurrent - src->source); 1222 } 1223 if (newExtensionLen != 0) { 1224 ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status); 1225 } 1226 newExtensionLen++; 1227 } 1228 1229 wasInQuote = TRUE; 1230 1231 ch = *(++(src->current)); 1232 if(ch == 0x0027) { /* copy the double quote */ 1233 ucol_tok_addToExtraCurrent(src, &ch, 1, status); 1234 inQuote = FALSE; 1235 } 1236 break; 1237 1238 /* '@' is french only if the strength is not currently set */ 1239 /* if it is, it's just a regular character in collation rules */ 1240 case 0x0040/*'@'*/: 1241 if (newStrength == UCOL_TOK_UNSET) { 1242 src->opts->frenchCollation = UCOL_ON; 1243 break; 1244 } 1245 1246 case 0x007C /*|*/: /* this means we have actually been reading prefix part */ 1247 // we want to store read characters to the prefix part and continue reading 1248 // the characters (proper way would be to restart reading the chars, but in 1249 // that case we would have to complicate the token hasher, which I do not 1250 // intend to play with. Instead, we will do prefixes when prefixes are due 1251 // (before adding the elements). 1252 src->parsedToken.prefixOffset = src->parsedToken.charsOffset; 1253 src->parsedToken.prefixLen = src->parsedToken.charsLen; 1254 1255 if(inChars) { /* we're doing characters */ 1256 if(wasInQuote == FALSE) { 1257 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); 1258 } 1259 if (src->parsedToken.charsLen != 0) { 1260 ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status); 1261 } 1262 src->parsedToken.charsLen++; 1263 } 1264 1265 wasInQuote = TRUE; 1266 1267 do { 1268 ch = *(++(src->current)); 1269 // skip whitespace between '|' and the character 1270 } while (uprv_isRuleWhiteSpace(ch)); 1271 break; 1272 1273 //charsOffset = 0; 1274 //newCharsLen = 0; 1275 //break; // We want to store the whole prefix/character sequence. If we break 1276 // the '|' is going to get lost. 1277 1278 case 0x002D /*-*/: /* A range. */ 1279 if (newStrength != UCOL_TOK_UNSET) { 1280 // While processing the pending token, the isStarred field 1281 // is reset, so it needs to be saved for the next 1282 // invocation. 1283 src->savedIsStarred = src->isStarred; 1284 goto EndOfLoop; 1285 } 1286 src->isStarred = src->savedIsStarred; 1287 1288 // Ranges are valid only in starred tokens. 1289 if (!src->isStarred) { 1290 *status = U_INVALID_FORMAT_ERROR; 1291 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1292 DBG_FORMAT_ERROR 1293 return NULL; 1294 } 1295 newStrength = src->parsedToken.strength; 1296 src->inRange = TRUE; 1297 break; 1298 1299 case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */ 1300 do { 1301 ch = *(++(src->current)); 1302 } while (!isCharNewLine(ch)); 1303 1304 break; 1305 default: 1306 if (newStrength == UCOL_TOK_UNSET) { 1307 *status = U_INVALID_FORMAT_ERROR; 1308 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1309 DBG_FORMAT_ERROR 1310 return NULL; 1311 } 1312 1313 if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) { 1314 *status = U_INVALID_FORMAT_ERROR; 1315 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1316 DBG_FORMAT_ERROR 1317 return NULL; 1318 } 1319 1320 if(ch == 0x0000 && src->current+1 == src->end) { 1321 break; 1322 } 1323 1324 if (inChars) { 1325 if(src->parsedToken.charsLen == 0) { 1326 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source); 1327 } 1328 src->parsedToken.charsLen++; 1329 } else { 1330 if(newExtensionLen == 0) { 1331 extensionOffset = (uint32_t)(src->current - src->source); 1332 } 1333 newExtensionLen++; 1334 } 1335 1336 break; 1337 } 1338 } 1339 } 1340 1341 if(wasInQuote) { 1342 if(ch != 0x27) { 1343 if(inQuote || !uprv_isRuleWhiteSpace(ch)) { 1344 ucol_tok_addToExtraCurrent(src, &ch, 1, status); 1345 } 1346 } 1347 } 1348 1349 src->current++; 1350 } 1351 1352 EndOfLoop: 1353 wasInQuote = FALSE; 1354 if (newStrength == UCOL_TOK_UNSET) { 1355 return NULL; 1356 } 1357 1358 if (src->parsedToken.charsLen == 0 && top == FALSE) { 1359 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1360 *status = U_INVALID_FORMAT_ERROR; 1361 DBG_FORMAT_ERROR 1362 return NULL; 1363 } 1364 1365 src->parsedToken.strength = newStrength; 1366 src->parsedToken.extensionOffset = extensionOffset; 1367 src->parsedToken.extensionLen = newExtensionLen; 1368 src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before; 1369 1370 return src->current; 1371 } 1372 1373 /* 1374 * Parses the next token, keeps the indices in src->parsedToken, and updates the counters. 1375 * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported. 1376 * 1377 * In addition to what ucol_tok_parseNextTokenInternal() does, this function does the following: 1378 * 1) ucol_tok_parseNextTokenInternal() returns a range as a single token. This function separates 1379 * it to separate tokens and returns one by one. In order to do that, the necessary states are 1380 * cached as member variables of the token parser. 1381 * 2) When encountering a range, ucol_tok_parseNextTokenInternal() processes characters up to the 1382 * starting character as a single list token (which is separated into individual characters here) 1383 * and as another list token starting with the last character in the range. Before expanding it 1384 * as a list of tokens, this function expands the range by filling the intermediate characters and 1385 * returns them one by one as separate tokens. 1386 * Necessary checks are done for invalid combinations. 1387 */ 1388 U_CAPI const UChar* U_EXPORT2 1389 ucol_tok_parseNextToken(UColTokenParser *src, 1390 UBool startOfRules, 1391 UParseError *parseError, 1392 UErrorCode *status) 1393 { 1394 const UChar *nextToken; 1395 1396 if (src->inRange) { 1397 // We are not done processing a range. Continue it. 1398 return ucol_tok_processNextCodePointInRange(src, status); 1399 } else if (src->isStarred) { 1400 // We are not done processing a starred token. Continue it. 1401 return ucol_tok_processNextTokenInStarredList(src); 1402 } 1403 1404 // Get the next token. 1405 nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, status); 1406 1407 if (nextToken == NULL) { 1408 return NULL; 1409 } 1410 1411 if (src->inRange) { 1412 // A new range has started. 1413 // Check whether it is a chain of ranges with more than one hyphen. 1414 if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) { 1415 *status = U_INVALID_FORMAT_ERROR; 1416 syntaxError(src->source,src->parsedToken.charsOffset-1, 1417 src->parsedToken.charsOffset+src->parsedToken.charsLen, parseError); 1418 DBG_FORMAT_ERROR 1419 return NULL; 1420 } 1421 1422 // The current token indicates the second code point of the range. 1423 // Process just that, and then proceed with the star. 1424 src->currentStarredCharIndex = src->parsedToken.charsOffset; 1425 U16_NEXT(src->source, src->currentStarredCharIndex, 1426 (uint32_t)(src->end - src->source), src->lastRangeCp); 1427 if (src->lastRangeCp <= src->previousCp) { 1428 *status = U_INVALID_FORMAT_ERROR; 1429 syntaxError(src->source,src->parsedToken.charsOffset-1, 1430 src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError); 1431 DBG_FORMAT_ERROR 1432 return NULL; 1433 } 1434 1435 // Set current range code point to process the range loop 1436 src->currentRangeCp = src->previousCp + 1; 1437 1438 src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1; 1439 1440 return ucol_tok_processNextCodePointInRange(src, status); 1441 } else if (src->isStarred) { 1442 // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that 1443 // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be 1444 // separated into several tokens and returned. 1445 src->currentStarredCharIndex = src->parsedToken.charsOffset; 1446 src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1; 1447 1448 return ucol_tok_processNextTokenInStarredList(src); 1449 } else { 1450 // Set previous codepoint 1451 U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end - src->source), src->previousCp); 1452 } 1453 return nextToken; 1454 } 1455 1456 1457 /* 1458 Processing Description 1459 1 Build a ListList. Each list has a header, which contains two lists (positive 1460 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and 1461 reset may be null. 1462 2 As you process, you keep a LAST pointer that points to the last token you 1463 handled. 1464 1465 */ 1466 1467 static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext, 1468 UParseError *parseError, UErrorCode *status) 1469 { 1470 if(src->resultLen == src->listCapacity) { 1471 // Unfortunately, this won't work, as we store addresses of lhs in token 1472 src->listCapacity *= 2; 1473 src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader)); 1474 if(src->lh == NULL) { 1475 *status = U_MEMORY_ALLOCATION_ERROR; 1476 return NULL; 1477 } 1478 } 1479 /* do the reset thing */ 1480 UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); 1481 /* test for NULL */ 1482 if (sourceToken == NULL) { 1483 *status = U_MEMORY_ALLOCATION_ERROR; 1484 return NULL; 1485 } 1486 sourceToken->rulesToParseHdl = &(src->source); 1487 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; 1488 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset; 1489 1490 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset); 1491 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset); 1492 1493 // keep the flags around so that we know about before 1494 sourceToken->flags = src->parsedToken.flags; 1495 1496 if(src->parsedToken.prefixOffset != 0) { 1497 // this is a syntax error 1498 *status = U_INVALID_FORMAT_ERROR; 1499 syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError); 1500 DBG_FORMAT_ERROR 1501 uprv_free(sourceToken); 1502 return 0; 1503 } else { 1504 sourceToken->prefix = 0; 1505 } 1506 1507 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */ 1508 sourceToken->strength = UCOL_TOK_RESET; 1509 sourceToken->next = NULL; 1510 sourceToken->previous = NULL; 1511 sourceToken->noOfCEs = 0; 1512 sourceToken->noOfExpCEs = 0; 1513 sourceToken->listHeader = &src->lh[src->resultLen]; 1514 1515 src->lh[src->resultLen].first = NULL; 1516 src->lh[src->resultLen].last = NULL; 1517 src->lh[src->resultLen].first = NULL; 1518 src->lh[src->resultLen].last = NULL; 1519 1520 src->lh[src->resultLen].reset = sourceToken; 1521 1522 /* 1523 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ... 1524 First convert all expansions into normal form. Examples: 1525 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * 1526 d * ... into &x * c/y * d * ... 1527 Note: reset values can never have expansions, although they can cause the 1528 very next item to have one. They may be contractions, if they are found 1529 earlier in the list. 1530 */ 1531 *expandNext = 0; 1532 if(expand != NULL) { 1533 /* check to see if there is an expansion */ 1534 if(src->parsedToken.charsLen > 1) { 1535 uint32_t resetCharsOffset; 1536 resetCharsOffset = (uint32_t)(expand - src->source); 1537 sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset; 1538 *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset); 1539 } 1540 } 1541 1542 src->resultLen++; 1543 1544 uhash_put(src->tailored, sourceToken, sourceToken, status); 1545 1546 return sourceToken; 1547 } 1548 1549 static 1550 inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) { 1551 if(U_FAILURE(*status)) { 1552 return NULL; 1553 } 1554 /* this is a virgin before - we need to fish the anchor from the UCA */ 1555 collIterate s; 1556 uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND; 1557 uint32_t CE, SecondCE; 1558 uint32_t invPos; 1559 if(sourceToken != NULL) { 1560 uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status); 1561 } else { 1562 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status); 1563 } 1564 if(U_FAILURE(*status)) { 1565 return NULL; 1566 } 1567 1568 baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F; 1569 baseContCE = ucol_getNextCE(src->UCA, &s, status); 1570 if(baseContCE == UCOL_NO_MORE_CES) { 1571 baseContCE = 0; 1572 } 1573 1574 1575 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); 1576 uint32_t ch = 0; 1577 uint32_t expandNext = 0; 1578 UColToken key; 1579 1580 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ 1581 uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16); 1582 uint32_t raw = uprv_uca_getRawFromImplicit(primary); 1583 ch = uprv_uca_getCodePointFromRaw(raw-1); 1584 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1); 1585 CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; 1586 SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER; 1587 1588 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); 1589 *src->extraCurrent++ = 0xFFFE; 1590 *src->extraCurrent++ = (UChar)ch; 1591 src->parsedToken.charsLen++; 1592 1593 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/; 1594 key.rulesToParseHdl = &(src->source); 1595 1596 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key); 1597 sourceToken = (UColToken *)uhash_get(src->tailored, &key); 1598 1599 if(sourceToken == NULL) { 1600 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F; 1601 if(isContinuation(SecondCE)) { 1602 src->lh[src->resultLen].baseContCE = SecondCE; 1603 } else { 1604 src->lh[src->resultLen].baseContCE = 0; 1605 } 1606 src->lh[src->resultLen].nextCE = 0; 1607 src->lh[src->resultLen].nextContCE = 0; 1608 src->lh[src->resultLen].previousCE = 0; 1609 src->lh[src->resultLen].previousContCE = 0; 1610 1611 src->lh[src->resultLen].indirect = FALSE; 1612 1613 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); 1614 } 1615 1616 } else { 1617 invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength); 1618 1619 // we got the previous CE. Now we need to see if the difference between 1620 // the two CEs is really of the requested strength. 1621 // if it's a bigger difference (we asked for secondary and got primary), we 1622 // need to modify the CE. 1623 if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) { 1624 // adjust the strength 1625 // now we are in the situation where our baseCE should actually be modified in 1626 // order to get the CE in the right position. 1627 if(strength == UCOL_SECONDARY) { 1628 CE = baseCE - 0x0200; 1629 } else { // strength == UCOL_TERTIARY 1630 CE = baseCE - 0x02; 1631 } 1632 if(baseContCE) { 1633 if(strength == UCOL_SECONDARY) { 1634 SecondCE = baseContCE - 0x0200; 1635 } else { // strength == UCOL_TERTIARY 1636 SecondCE = baseContCE - 0x02; 1637 } 1638 } 1639 } 1640 1641 #if 0 1642 // the code below relies on getting a code point from the inverse table, in order to be 1643 // able to merge the situations like &x < 9 &[before 1]a < d. This won't work: 1644 // 1. There are many code points that have the same CE 1645 // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken. 1646 // Also, in case when there is no equivalent strength before an element, we have to actually 1647 // construct one. For example, &[before 2]a << x won't result in x << a, because the element 1648 // before a is a primary difference. 1649 1650 //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 1651 1652 1653 ch = CETable[3*invPos+2]; 1654 1655 if((ch & UCOL_INV_SIZEMASK) != 0) { 1656 uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts); 1657 uint32_t offset = (ch & UCOL_INV_OFFSETMASK); 1658 ch = conts[offset]; 1659 } 1660 1661 *src->extraCurrent++ = (UChar)ch; 1662 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1); 1663 src->parsedToken.charsLen = 1; 1664 1665 // We got an UCA before. However, this might have been tailored. 1666 // example: 1667 // &\u30ca = \u306a 1668 // &[before 3]\u306a<<<\u306a|\u309d 1669 1670 1671 // uint32_t key = (*newCharsLen << 24) | *charsOffset; 1672 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/; 1673 key.rulesToParseHdl = &(src->source); 1674 1675 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key); 1676 sourceToken = (UColToken *)uhash_get(src->tailored, &key); 1677 #endif 1678 1679 // here is how it should be. The situation such as &[before 1]a < x, should be 1680 // resolved exactly as if we wrote &a > x. 1681 // therefore, I don't really care if the UCA value before a has been changed. 1682 // However, I do care if the strength between my element and the previous element 1683 // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll 1684 // have to construct the base CE. 1685 1686 1687 1688 // if we found a tailored thing, we have to use the UCA value and construct 1689 // a new reset token with constructed name 1690 //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) { 1691 // character to which we want to anchor is already tailored. 1692 // We need to construct a new token which will be the anchor 1693 // point 1694 //*(src->extraCurrent-1) = 0xFFFE; 1695 //*src->extraCurrent++ = (UChar)ch; 1696 // grab before 1697 src->parsedToken.charsOffset -= 10; 1698 src->parsedToken.charsLen += 10; 1699 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F; 1700 if(isContinuation(SecondCE)) { 1701 src->lh[src->resultLen].baseContCE = SecondCE; 1702 } else { 1703 src->lh[src->resultLen].baseContCE = 0; 1704 } 1705 src->lh[src->resultLen].nextCE = 0; 1706 src->lh[src->resultLen].nextContCE = 0; 1707 src->lh[src->resultLen].previousCE = 0; 1708 src->lh[src->resultLen].previousContCE = 0; 1709 1710 src->lh[src->resultLen].indirect = FALSE; 1711 1712 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); 1713 //} 1714 } 1715 1716 return sourceToken; 1717 1718 } 1719 1720 uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) { 1721 UColToken *lastToken = NULL; 1722 const UChar *parseEnd = NULL; 1723 uint32_t expandNext = 0; 1724 UBool variableTop = FALSE; 1725 UBool top = FALSE; 1726 uint16_t specs = 0; 1727 UColTokListHeader *ListList = NULL; 1728 1729 src->parsedToken.strength = UCOL_TOK_UNSET; 1730 1731 ListList = src->lh; 1732 1733 if(U_FAILURE(*status)) { 1734 return 0; 1735 } 1736 #ifdef DEBUG_FOR_CODE_POINTS 1737 char filename[35]; 1738 sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid()); 1739 dfcp_fp = fopen(filename, "a"); 1740 fprintf(stdout, "Output is in the file %s.\n", filename); 1741 #endif 1742 1743 #ifdef DEBUG_FOR_COLL_RULES 1744 std::string s3; 1745 UnicodeString(src->source).toUTF8String(s3); 1746 std::cout << "src->source = " << s3 << std::endl; 1747 #endif 1748 1749 while(src->current < src->end || src->isStarred) { 1750 src->parsedToken.prefixOffset = 0; 1751 1752 parseEnd = ucol_tok_parseNextToken(src, 1753 (UBool)(lastToken == NULL), 1754 parseError, 1755 status); 1756 1757 specs = src->parsedToken.flags; 1758 1759 1760 variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0); 1761 top = ((specs & UCOL_TOK_TOP) != 0); 1762 1763 if(U_SUCCESS(*status) && parseEnd != NULL) { 1764 UColToken *sourceToken = NULL; 1765 //uint32_t key = 0; 1766 uint32_t lastStrength = UCOL_TOK_UNSET; 1767 1768 if(lastToken != NULL ) { 1769 lastStrength = lastToken->strength; 1770 } 1771 1772 #ifdef DEBUG_FOR_CODE_POINTS 1773 UChar32 cp; 1774 U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->extraEnd - src->source), cp); 1775 fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsedToken.strength); 1776 #endif 1777 //key = newCharsLen << 24 | charsOffset; 1778 UColToken key; 1779 key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; 1780 key.rulesToParseHdl = &(src->source); 1781 1782 /* 4 Lookup each source in the CharsToToken map, and find a sourceToken */ 1783 sourceToken = (UColToken *)uhash_get(src->tailored, &key); 1784 1785 if(src->parsedToken.strength != UCOL_TOK_RESET) { 1786 if(lastToken == NULL) { /* this means that rules haven't started properly */ 1787 *status = U_INVALID_FORMAT_ERROR; 1788 syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError); 1789 DBG_FORMAT_ERROR 1790 return 0; 1791 } 1792 /* 6 Otherwise (when relation != reset) */ 1793 if(sourceToken == NULL) { 1794 /* If sourceToken is null, create new one, */ 1795 sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); 1796 /* test for NULL */ 1797 if (sourceToken == NULL) { 1798 *status = U_MEMORY_ALLOCATION_ERROR; 1799 return 0; 1800 } 1801 sourceToken->rulesToParseHdl = &(src->source); 1802 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; 1803 1804 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset); 1805 1806 sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset; 1807 sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset); 1808 1809 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */ 1810 sourceToken->next = NULL; 1811 sourceToken->previous = NULL; 1812 sourceToken->noOfCEs = 0; 1813 sourceToken->noOfExpCEs = 0; 1814 // keep the flags around so that we know about before 1815 sourceToken->flags = src->parsedToken.flags; 1816 uhash_put(src->tailored, sourceToken, sourceToken, status); 1817 if(U_FAILURE(*status)) { 1818 return 0; 1819 } 1820 } else { 1821 /* we could have fished out a reset here */ 1822 if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) { 1823 /* otherwise remove sourceToken from where it was. */ 1824 if(sourceToken->next != NULL) { 1825 if(sourceToken->next->strength > sourceToken->strength) { 1826 sourceToken->next->strength = sourceToken->strength; 1827 } 1828 sourceToken->next->previous = sourceToken->previous; 1829 } else { 1830 sourceToken->listHeader->last = sourceToken->previous; 1831 } 1832 1833 if(sourceToken->previous != NULL) { 1834 sourceToken->previous->next = sourceToken->next; 1835 } else { 1836 sourceToken->listHeader->first = sourceToken->next; 1837 } 1838 sourceToken->next = NULL; 1839 sourceToken->previous = NULL; 1840 } 1841 } 1842 1843 sourceToken->strength = src->parsedToken.strength; 1844 sourceToken->listHeader = lastToken->listHeader; 1845 1846 /* 1847 1. Find the strongest strength in each list, and set strongestP and strongestN 1848 accordingly in the headers. 1849 */ 1850 if(lastStrength == UCOL_TOK_RESET 1851 || sourceToken->listHeader->first == 0) { 1852 /* If LAST is a reset 1853 insert sourceToken in the list. */ 1854 if(sourceToken->listHeader->first == 0) { 1855 sourceToken->listHeader->first = sourceToken; 1856 sourceToken->listHeader->last = sourceToken; 1857 } else { /* we need to find a place for us */ 1858 /* and we'll get in front of the same strength */ 1859 if(sourceToken->listHeader->first->strength <= sourceToken->strength) { 1860 sourceToken->next = sourceToken->listHeader->first; 1861 sourceToken->next->previous = sourceToken; 1862 sourceToken->listHeader->first = sourceToken; 1863 sourceToken->previous = NULL; 1864 } else { 1865 lastToken = sourceToken->listHeader->first; 1866 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) { 1867 lastToken = lastToken->next; 1868 } 1869 if(lastToken->next != NULL) { 1870 lastToken->next->previous = sourceToken; 1871 } else { 1872 sourceToken->listHeader->last = sourceToken; 1873 } 1874 sourceToken->previous = lastToken; 1875 sourceToken->next = lastToken->next; 1876 lastToken->next = sourceToken; 1877 } 1878 } 1879 } else { 1880 /* Otherwise (when LAST is not a reset) 1881 if polarity (LAST) == polarity(relation), insert sourceToken after LAST, 1882 otherwise insert before. 1883 when inserting after or before, search to the next position with the same 1884 strength in that direction. (This is called postpone insertion). */ 1885 if(sourceToken != lastToken) { 1886 if(lastToken->polarity == sourceToken->polarity) { 1887 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) { 1888 lastToken = lastToken->next; 1889 } 1890 sourceToken->previous = lastToken; 1891 if(lastToken->next != NULL) { 1892 lastToken->next->previous = sourceToken; 1893 } else { 1894 sourceToken->listHeader->last = sourceToken; 1895 } 1896 1897 sourceToken->next = lastToken->next; 1898 lastToken->next = sourceToken; 1899 } else { 1900 while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) { 1901 lastToken = lastToken->previous; 1902 } 1903 sourceToken->next = lastToken; 1904 if(lastToken->previous != NULL) { 1905 lastToken->previous->next = sourceToken; 1906 } else { 1907 sourceToken->listHeader->first = sourceToken; 1908 } 1909 sourceToken->previous = lastToken->previous; 1910 lastToken->previous = sourceToken; 1911 } 1912 } else { /* repeated one thing twice in rules, stay with the stronger strength */ 1913 if(lastStrength < sourceToken->strength) { 1914 sourceToken->strength = lastStrength; 1915 } 1916 } 1917 } 1918 1919 /* if the token was a variable top, we're gonna put it in */ 1920 if(variableTop == TRUE && src->varTop == NULL) { 1921 variableTop = FALSE; 1922 src->varTop = sourceToken; 1923 } 1924 1925 // Treat the expansions. 1926 // There are two types of expansions: explicit (x / y) and reset based propagating expansions 1927 // (&abc * d * e <=> &ab * d / c * e / c) 1928 // if both of them are in effect for a token, they are combined. 1929 1930 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset; 1931 1932 if(expandNext != 0) { 1933 if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */ 1934 expandNext = 0; 1935 } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */ 1936 sourceToken->expansion = expandNext; 1937 } else { /* there is both explicit and implicit expansion. We need to make a combination */ 1938 uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar)); 1939 uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar)); 1940 sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source)); 1941 src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen; 1942 } 1943 } 1944 1945 // This is just for debugging purposes 1946 if(sourceToken->expansion != 0) { 1947 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset); 1948 } else { 1949 sourceToken->debugExpansion = 0; 1950 } 1951 // if the previous token was a reset before, the strength of this 1952 // token must match the strength of before. Otherwise we have an 1953 // undefined situation. 1954 // In other words, we currently have a cludge which we use to 1955 // represent &a >> x. This is written as &[before 2]a << x. 1956 if((lastToken->flags & UCOL_TOK_BEFORE) != 0) { 1957 uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1; 1958 if(beforeStrength != sourceToken->strength) { 1959 *status = U_INVALID_FORMAT_ERROR; 1960 syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError); 1961 DBG_FORMAT_ERROR 1962 return 0; 1963 } 1964 } 1965 } else { 1966 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) { 1967 /* if the previous token was also a reset, */ 1968 /*this means that we have two consecutive resets */ 1969 /* and we want to remove the previous one if empty*/ 1970 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) { 1971 src->resultLen--; 1972 } 1973 } 1974 1975 if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */ 1976 uint32_t searchCharsLen = src->parsedToken.charsLen; 1977 while(searchCharsLen > 1 && sourceToken == NULL) { 1978 searchCharsLen--; 1979 //key = searchCharsLen << 24 | charsOffset; 1980 UColToken key; 1981 key.source = searchCharsLen << 24 | src->parsedToken.charsOffset; 1982 key.rulesToParseHdl = &(src->source); 1983 sourceToken = (UColToken *)uhash_get(src->tailored, &key); 1984 } 1985 if(sourceToken != NULL) { 1986 expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen); 1987 } 1988 } 1989 1990 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */ 1991 if(top == FALSE) { /* there is no indirection */ 1992 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; 1993 if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) { 1994 /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */ 1995 while(sourceToken->strength > strength && sourceToken->previous != NULL) { 1996 sourceToken = sourceToken->previous; 1997 } 1998 /* here, either we hit the strength or NULL */ 1999 if(sourceToken->strength == strength) { 2000 if(sourceToken->previous != NULL) { 2001 sourceToken = sourceToken->previous; 2002 } else { /* start of list */ 2003 sourceToken = sourceToken->listHeader->reset; 2004 } 2005 } else { /* we hit NULL */ 2006 /* we should be doing the else part */ 2007 sourceToken = sourceToken->listHeader->reset; 2008 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status); 2009 } 2010 } else { 2011 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status); 2012 } 2013 } else { /* this is both before and indirection */ 2014 top = FALSE; 2015 ListList[src->resultLen].previousCE = 0; 2016 ListList[src->resultLen].previousContCE = 0; 2017 ListList[src->resultLen].indirect = TRUE; 2018 /* we need to do slightly more work. we need to get the baseCE using the */ 2019 /* inverse UCA & getPrevious. The next bound is not set, and will be decided */ 2020 /* in ucol_bld */ 2021 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; 2022 uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE; 2023 uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F; 2024 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; 2025 2026 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); 2027 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && 2028 (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ 2029 uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16); 2030 uint32_t raw = uprv_uca_getRawFromImplicit(primary); 2031 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1); 2032 CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; 2033 SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER; 2034 } else { 2035 /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/ 2036 ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength); 2037 } 2038 2039 ListList[src->resultLen].baseCE = CE; 2040 ListList[src->resultLen].baseContCE = SecondCE; 2041 ListList[src->resultLen].nextCE = 0; 2042 ListList[src->resultLen].nextContCE = 0; 2043 2044 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); 2045 } 2046 } 2047 2048 2049 /* 5 If the relation is a reset: 2050 If sourceToken is null 2051 Create new list, create new sourceToken, make the baseCE from source, put 2052 the sourceToken in ListHeader of the new list */ 2053 if(sourceToken == NULL) { 2054 /* 2055 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ... 2056 First convert all expansions into normal form. Examples: 2057 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * 2058 d * ... into &x * c/y * d * ... 2059 Note: reset values can never have expansions, although they can cause the 2060 very next item to have one. They may be contractions, if they are found 2061 earlier in the list. 2062 */ 2063 if(top == FALSE) { 2064 collIterate s; 2065 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; 2066 2067 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status); 2068 2069 CE = ucol_getNextCE(src->UCA, &s, status); 2070 const UChar *expand = s.pos; 2071 SecondCE = ucol_getNextCE(src->UCA, &s, status); 2072 2073 ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F; 2074 if(isContinuation(SecondCE)) { 2075 ListList[src->resultLen].baseContCE = SecondCE; 2076 } else { 2077 ListList[src->resultLen].baseContCE = 0; 2078 } 2079 ListList[src->resultLen].nextCE = 0; 2080 ListList[src->resultLen].nextContCE = 0; 2081 ListList[src->resultLen].previousCE = 0; 2082 ListList[src->resultLen].previousContCE = 0; 2083 ListList[src->resultLen].indirect = FALSE; 2084 sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status); 2085 } else { /* top == TRUE */ 2086 /* just use the supplied values */ 2087 top = FALSE; 2088 ListList[src->resultLen].previousCE = 0; 2089 ListList[src->resultLen].previousContCE = 0; 2090 ListList[src->resultLen].indirect = TRUE; 2091 ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE; 2092 ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE; 2093 ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE; 2094 ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE; 2095 2096 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); 2097 2098 } 2099 } else { /* reset to something already in rules */ 2100 top = FALSE; 2101 } 2102 } 2103 /* 7 After all this, set LAST to point to sourceToken, and goto step 3. */ 2104 lastToken = sourceToken; 2105 } else { 2106 if(U_FAILURE(*status)) { 2107 return 0; 2108 } 2109 } 2110 } 2111 #ifdef DEBUG_FOR_CODE_POINTS 2112 fclose(dfcp_fp); 2113 #endif 2114 2115 2116 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) { 2117 src->resultLen--; 2118 } 2119 return src->resultLen; 2120 } 2121 2122 const UChar* ucol_tok_getRulesFromBundle( 2123 void* /*context*/, 2124 const char* locale, 2125 const char* type, 2126 int32_t* pLength, 2127 UErrorCode* status) 2128 { 2129 const UChar* rules = NULL; 2130 UResourceBundle* bundle; 2131 UResourceBundle* collations; 2132 UResourceBundle* collation; 2133 2134 *pLength = 0; 2135 2136 bundle = ures_open(U_ICUDATA_COLL, locale, status); 2137 if(U_SUCCESS(*status)){ 2138 collations = ures_getByKey(bundle, "collations", NULL, status); 2139 if(U_SUCCESS(*status)){ 2140 collation = ures_getByKey(collations, type, NULL, status); 2141 if(U_SUCCESS(*status)){ 2142 rules = ures_getStringByKey(collation, "Sequence", pLength, status); 2143 if(U_FAILURE(*status)){ 2144 *pLength = 0; 2145 rules = NULL; 2146 } 2147 ures_close(collation); 2148 } 2149 ures_close(collations); 2150 } 2151 } 2152 2153 ures_close(bundle); 2154 2155 return rules; 2156 } 2157 2158 void ucol_tok_initTokenList( 2159 UColTokenParser *src, 2160 const UChar *rules, 2161 uint32_t rulesLength, 2162 const UCollator *UCA, 2163 GetCollationRulesFunction importFunc, 2164 void* context, 2165 UErrorCode *status) { 2166 U_NAMESPACE_USE 2167 2168 uint32_t nSize = 0; 2169 uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE); 2170 2171 bool needToDeallocRules = false; 2172 2173 if(U_FAILURE(*status)) { 2174 return; 2175 } 2176 2177 // set everything to zero, so that we can clean up gracefully 2178 uprv_memset(src, 0, sizeof(UColTokenParser)); 2179 2180 // first we need to find options that don't like to be normalized, 2181 // like copy and remove... 2182 //const UChar *openBrace = rules; 2183 int32_t optionNumber = -1; 2184 const UChar *setStart = NULL; 2185 uint32_t i = 0; 2186 while(i < rulesLength) { 2187 if(rules[i] == 0x005B) { // '[': start of an option 2188 /* Gets the following: 2189 optionNumber: The index of the option. 2190 setStart: The pointer at which the option arguments start. 2191 */ 2192 optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart); 2193 2194 if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */ 2195 // [optimize] 2196 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status); 2197 if(U_SUCCESS(*status)) { 2198 if(src->copySet == NULL) { 2199 src->copySet = newSet; 2200 } else { 2201 uset_addAll(src->copySet, newSet); 2202 uset_close(newSet); 2203 } 2204 } else { 2205 return; 2206 } 2207 } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) { 2208 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status); 2209 if(U_SUCCESS(*status)) { 2210 if(src->removeSet == NULL) { 2211 src->removeSet = newSet; 2212 } else { 2213 uset_addAll(src->removeSet, newSet); 2214 uset_close(newSet); 2215 } 2216 } else { 2217 return; 2218 } 2219 } else if(optionNumber == OPTION_IMPORT){ 2220 // [import <collation-name>] 2221 2222 // Find the address of the closing ]. 2223 UChar* import_end = u_strchr(setStart, 0x005D); 2224 int32_t optionEndOffset = (int32_t)(import_end + 1 - rules); 2225 // Ignore trailing whitespace. 2226 while(uprv_isRuleWhiteSpace(*(import_end-1))) { 2227 --import_end; 2228 } 2229 2230 int32_t optionLength = (int32_t)(import_end - setStart); 2231 char option[50]; 2232 if(optionLength >= (int32_t)sizeof(option)) { 2233 *status = U_ILLEGAL_ARGUMENT_ERROR; 2234 return; 2235 } 2236 u_UCharsToChars(setStart, option, optionLength); 2237 option[optionLength] = 0; 2238 2239 *status = U_ZERO_ERROR; 2240 char locale[50]; 2241 int32_t templ; 2242 uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &templ, status); 2243 if(U_FAILURE(*status)) { 2244 *status = U_ILLEGAL_ARGUMENT_ERROR; 2245 return; 2246 } 2247 2248 char type[50]; 2249 if (uloc_getKeywordValue(locale, "collation", type, (int32_t)sizeof(type), status) <= 0 || 2250 U_FAILURE(*status) 2251 ) { 2252 *status = U_ZERO_ERROR; 2253 uprv_strcpy(type, "standard"); 2254 } 2255 2256 // TODO: Use public functions when available, see ticket #8134. 2257 char *keywords = (char *)locale_getKeywordsStart(locale); 2258 if(keywords != NULL) { 2259 *keywords = 0; 2260 } 2261 2262 int32_t importRulesLength = 0; 2263 const UChar* importRules = importFunc(context, locale, type, &importRulesLength, status); 2264 2265 #ifdef DEBUG_FOR_COLL_RULES 2266 std::string s; 2267 UnicodeString(importRules).toUTF8String(s); 2268 std::cout << "Import rules = " << s << std::endl; 2269 #endif 2270 2271 // Add the length of the imported rules to length of the original rules, 2272 // and subtract the length of the import option. 2273 uint32_t newRulesLength = rulesLength + importRulesLength - (optionEndOffset - i); 2274 2275 UChar* newRules = (UChar*)uprv_malloc(newRulesLength*sizeof(UChar)); 2276 2277 #ifdef DEBUG_FOR_COLL_RULES 2278 std::string s1; 2279 UnicodeString(rules).toUTF8String(s1); 2280 std::cout << "Original rules = " << s1 << std::endl; 2281 #endif 2282 2283 2284 // Copy the section of the original rules leading up to the import 2285 uprv_memcpy(newRules, rules, i*sizeof(UChar)); 2286 // Copy the imported rules 2287 uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UChar)); 2288 // Copy the rest of the original rules (minus the import option itself) 2289 uprv_memcpy(newRules+i+importRulesLength, 2290 rules+optionEndOffset, 2291 (rulesLength-optionEndOffset)*sizeof(UChar)); 2292 2293 #ifdef DEBUG_FOR_COLL_RULES 2294 std::string s2; 2295 UnicodeString(newRules).toUTF8String(s2); 2296 std::cout << "Resulting rules = " << s2 << std::endl; 2297 #endif 2298 2299 if(needToDeallocRules){ 2300 // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free 2301 uprv_free((void*)rules); 2302 } 2303 needToDeallocRules = true; 2304 rules = newRules; 2305 rulesLength = newRulesLength; 2306 2307 estimatedSize += importRulesLength*2; 2308 2309 // First character of the new rules needs to be processed 2310 i--; 2311 } 2312 } 2313 //openBrace++; 2314 i++; 2315 } 2316 2317 src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar)); 2318 /* test for NULL */ 2319 if (src->source == NULL) { 2320 *status = U_MEMORY_ALLOCATION_ERROR; 2321 return; 2322 } 2323 uprv_memset(src->source, 0, estimatedSize*sizeof(UChar)); 2324 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status); 2325 if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) { 2326 *status = U_ZERO_ERROR; 2327 src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar)); 2328 /* test for NULL */ 2329 if (src->source == NULL) { 2330 *status = U_MEMORY_ALLOCATION_ERROR; 2331 return; 2332 } 2333 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status); 2334 } 2335 if(needToDeallocRules){ 2336 // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free 2337 uprv_free((void*)rules); 2338 } 2339 2340 2341 src->current = src->source; 2342 src->end = src->source+nSize; 2343 src->sourceCurrent = src->source; 2344 src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly 2345 src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE; 2346 src->varTop = NULL; 2347 src->UCA = UCA; 2348 src->invUCA = ucol_initInverseUCA(status); 2349 src->parsedToken.charsLen = 0; 2350 src->parsedToken.charsOffset = 0; 2351 src->parsedToken.extensionLen = 0; 2352 src->parsedToken.extensionOffset = 0; 2353 src->parsedToken.prefixLen = 0; 2354 src->parsedToken.prefixOffset = 0; 2355 src->parsedToken.flags = 0; 2356 src->parsedToken.strength = UCOL_TOK_UNSET; 2357 src->buildCCTabFlag = FALSE; 2358 src->isStarred = FALSE; 2359 src->inRange = FALSE; 2360 src->lastRangeCp = 0; 2361 src->previousCp = 0; 2362 2363 if(U_FAILURE(*status)) { 2364 return; 2365 } 2366 src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status); 2367 if(U_FAILURE(*status)) { 2368 return; 2369 } 2370 uhash_setValueDeleter(src->tailored, uhash_freeBlock); 2371 2372 src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet)); 2373 /* test for NULL */ 2374 if (src->opts == NULL) { 2375 *status = U_MEMORY_ALLOCATION_ERROR; 2376 return; 2377 } 2378 2379 uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet)); 2380 2381 src->lh = 0; 2382 src->listCapacity = 1024; 2383 src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader)); 2384 //Test for NULL 2385 if (src->lh == NULL) { 2386 *status = U_MEMORY_ALLOCATION_ERROR; 2387 return; 2388 } 2389 uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader)); 2390 src->resultLen = 0; 2391 2392 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); 2393 2394 // UCOL_RESET_TOP_VALUE 2395 setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT); 2396 // UCOL_FIRST_PRIMARY_IGNORABLE 2397 setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0); 2398 // UCOL_LAST_PRIMARY_IGNORABLE 2399 setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0); 2400 // UCOL_FIRST_SECONDARY_IGNORABLE 2401 setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0); 2402 // UCOL_LAST_SECONDARY_IGNORABLE 2403 setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0); 2404 // UCOL_FIRST_TERTIARY_IGNORABLE 2405 setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0); 2406 // UCOL_LAST_TERTIARY_IGNORABLE 2407 setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0); 2408 // UCOL_FIRST_VARIABLE 2409 setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0); 2410 // UCOL_LAST_VARIABLE 2411 setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0); 2412 // UCOL_FIRST_NON_VARIABLE 2413 setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0); 2414 // UCOL_LAST_NON_VARIABLE 2415 setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT); 2416 // UCOL_FIRST_IMPLICIT 2417 setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0); 2418 // UCOL_LAST_IMPLICIT 2419 setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING); 2420 // UCOL_FIRST_TRAILING 2421 setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0); 2422 // UCOL_LAST_TRAILING 2423 setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0); 2424 ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24); 2425 } 2426 2427 2428 void ucol_tok_closeTokenList(UColTokenParser *src) { 2429 if(src->copySet != NULL) { 2430 uset_close(src->copySet); 2431 } 2432 if(src->removeSet != NULL) { 2433 uset_close(src->removeSet); 2434 } 2435 if(src->tailored != NULL) { 2436 uhash_close(src->tailored); 2437 } 2438 if(src->lh != NULL) { 2439 uprv_free(src->lh); 2440 } 2441 if(src->source != NULL) { 2442 uprv_free(src->source); 2443 } 2444 if(src->opts != NULL) { 2445 uprv_free(src->opts); 2446 } 2447 if (src->reorderCodes != NULL) { 2448 uprv_free(src->reorderCodes); 2449 } 2450 } 2451 2452 #endif /* #if !UCONFIG_NO_COLLATION */ 2453