1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2001-2012, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: ucol_tok.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created 02/22/2001 14 * created by: Vladimir Weinstein 15 * 16 * This module reads a tailoring rule string and produces a list of 17 * tokens that will be turned into collation elements 18 * 19 */ 20 21 #include "unicode/utypes.h" 22 23 #if !UCONFIG_NO_COLLATION 24 25 #include "unicode/uscript.h" 26 #include "unicode/ustring.h" 27 #include "unicode/uchar.h" 28 #include "unicode/uniset.h" 29 30 #include "cmemory.h" 31 #include "cstring.h" 32 #include "patternprops.h" 33 #include "ucol_bld.h" 34 #include "ucol_tok.h" 35 #include "ulocimp.h" 36 #include "uresimp.h" 37 38 // Define this only for debugging. 39 // #define DEBUG_FOR_COLL_RULES 1 40 41 #ifdef DEBUG_FOR_COLL_RULES 42 #include <iostream> 43 #endif 44 45 U_NAMESPACE_USE 46 47 U_CDECL_BEGIN 48 static int32_t U_CALLCONV 49 uhash_hashTokens(const UHashTok k) 50 { 51 int32_t hash = 0; 52 //uint32_t key = (uint32_t)k.integer; 53 UColToken *key = (UColToken *)k.pointer; 54 if (key != 0) { 55 int32_t len = (key->source & 0xFF000000)>>24; 56 int32_t inc = ((len - 32) / 32) + 1; 57 58 const UChar *p = (key->source & 0x00FFFFFF) + *(key->rulesToParseHdl); 59 const UChar *limit = p + len; 60 61 while (p<limit) { 62 hash = (hash * 37) + *p; 63 p += inc; 64 } 65 } 66 return hash; 67 } 68 69 static UBool U_CALLCONV 70 uhash_compareTokens(const UHashTok key1, const UHashTok key2) 71 { 72 //uint32_t p1 = (uint32_t) key1.integer; 73 //uint32_t p2 = (uint32_t) key2.integer; 74 UColToken *p1 = (UColToken *)key1.pointer; 75 UColToken *p2 = (UColToken *)key2.pointer; 76 const UChar *s1 = (p1->source & 0x00FFFFFF) + *(p1->rulesToParseHdl); 77 const UChar *s2 = (p2->source & 0x00FFFFFF) + *(p2->rulesToParseHdl); 78 uint32_t s1L = ((p1->source & 0xFF000000) >> 24); 79 uint32_t s2L = ((p2->source & 0xFF000000) >> 24); 80 const UChar *end = s1+s1L-1; 81 82 if (p1 == p2) { 83 return TRUE; 84 } 85 if (p1->source == 0 || p2->source == 0) { 86 return FALSE; 87 } 88 if(s1L != s2L) { 89 return FALSE; 90 } 91 if(p1->source == p2->source) { 92 return TRUE; 93 } 94 while((s1 < end) && *s1 == *s2) { 95 ++s1; 96 ++s2; 97 } 98 if(*s1 == *s2) { 99 return TRUE; 100 } else { 101 return FALSE; 102 } 103 } 104 U_CDECL_END 105 106 /* 107 * Debug messages used to pinpoint where a format error occurred. 108 * A better way is to include context-sensitive information in syntaxError() function. 109 * 110 * To turn this debugging on, either uncomment the following line, or define use -DDEBUG_FOR_FORMAT_ERROR 111 * in the compile line. 112 */ 113 /* #define DEBUG_FOR_FORMAT_ERROR 1 */ 114 115 #ifdef DEBUG_FOR_FORMAT_ERROR 116 #define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);} 117 #else 118 #define DBG_FORMAT_ERROR 119 #endif 120 121 122 /* 123 * Controls debug messages so that the output can be compared before and after a 124 * big change. Prints the information of every code point that comes out of the 125 * collation parser and its strength into a file. When a big change in format 126 * happens, the files before and after the change should be identical. 127 * 128 * To turn this debugging on, either uncomment the following line, or define use -DDEBUG_FOR_CODE_POINTS 129 * in the compile line. 130 */ 131 // #define DEBUG_FOR_CODE_POINTS 1 132 133 #ifdef DEBUG_FOR_CODE_POINTS 134 FILE* dfcp_fp = NULL; 135 #endif 136 137 138 typedef struct { 139 uint32_t startCE; 140 uint32_t startContCE; 141 uint32_t limitCE; 142 uint32_t limitContCE; 143 } indirectBoundaries; 144 145 /* these values are used for finding CE values for indirect positioning. */ 146 /* Indirect positioning is a mechanism for allowing resets on symbolic */ 147 /* values. It only works for resets and you cannot tailor indirect names */ 148 /* An indirect name can define either an anchor point or a range. An */ 149 /* anchor point behaves in exactly the same way as a code point in reset */ 150 /* would, except that it cannot be tailored. A range (we currently only */ 151 /* know for the [top] range will explicitly set the upper bound for */ 152 /* generated CEs, thus allowing for better control over how many CEs can */ 153 /* be squeezed between in the range without performance penalty. */ 154 /* In that respect, we use [top] for tailoring of locales that use CJK */ 155 /* characters. Other indirect values are currently a pure convenience, */ 156 /* they can be used to assure that the CEs will be always positioned in */ 157 /* the same place relative to a point with known properties (e.g. first */ 158 /* primary ignorable). */ 159 static indirectBoundaries ucolIndirectBoundaries[15]; 160 /* 161 static indirectBoundaries ucolIndirectBoundaries[11] = { 162 { UCOL_RESET_TOP_VALUE, 0, 163 UCOL_NEXT_TOP_VALUE, 0 }, 164 { UCOL_FIRST_PRIMARY_IGNORABLE, 0, 165 0, 0 }, 166 { UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT, 167 0, 0 }, 168 { UCOL_FIRST_SECONDARY_IGNORABLE, 0, 169 0, 0 }, 170 { UCOL_LAST_SECONDARY_IGNORABLE, 0, 171 0, 0 }, 172 { UCOL_FIRST_TERTIARY_IGNORABLE, 0, 173 0, 0 }, 174 { UCOL_LAST_TERTIARY_IGNORABLE, 0, 175 0, 0 }, 176 { UCOL_FIRST_VARIABLE, 0, 177 0, 0 }, 178 { UCOL_LAST_VARIABLE, 0, 179 0, 0 }, 180 { UCOL_FIRST_NON_VARIABLE, 0, 181 0, 0 }, 182 { UCOL_LAST_NON_VARIABLE, 0, 183 0, 0 }, 184 }; 185 */ 186 187 static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) { 188 189 // Set values for the top - TODO: once we have values for all the indirects, we are going 190 // to initalize here. 191 ucolIndirectBoundaries[indexR].startCE = start[0]; 192 ucolIndirectBoundaries[indexR].startContCE = start[1]; 193 if(end) { 194 ucolIndirectBoundaries[indexR].limitCE = end[0]; 195 ucolIndirectBoundaries[indexR].limitContCE = end[1]; 196 } else { 197 ucolIndirectBoundaries[indexR].limitCE = 0; 198 ucolIndirectBoundaries[indexR].limitContCE = 0; 199 } 200 } 201 202 203 static inline 204 void syntaxError(const UChar* rules, 205 int32_t pos, 206 int32_t rulesLen, 207 UParseError* parseError) 208 { 209 parseError->offset = pos; 210 parseError->line = 0 ; /* we are not using line numbers */ 211 212 // for pre-context 213 int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1)); 214 int32_t stop = pos; 215 216 u_memcpy(parseError->preContext,rules+start,stop-start); 217 //null terminate the buffer 218 parseError->preContext[stop-start] = 0; 219 220 //for post-context 221 start = pos+1; 222 stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) : 223 rulesLen; 224 225 if(start < stop) { 226 u_memcpy(parseError->postContext,rules+start,stop-start); 227 //null terminate the buffer 228 parseError->postContext[stop-start]= 0; 229 } else { 230 parseError->postContext[0] = 0; 231 } 232 } 233 234 static 235 void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) { 236 switch(attrib) { 237 case UCOL_HIRAGANA_QUATERNARY_MODE: 238 opts->hiraganaQ = value; 239 break; 240 case UCOL_FRENCH_COLLATION: 241 opts->frenchCollation = value; 242 break; 243 case UCOL_ALTERNATE_HANDLING: 244 opts->alternateHandling = value; 245 break; 246 case UCOL_CASE_FIRST: 247 opts->caseFirst = value; 248 break; 249 case UCOL_CASE_LEVEL: 250 opts->caseLevel = value; 251 break; 252 case UCOL_NORMALIZATION_MODE: 253 opts->normalizationMode = value; 254 break; 255 case UCOL_STRENGTH: 256 opts->strength = value; 257 break; 258 case UCOL_NUMERIC_COLLATION: 259 opts->numericCollation = value; 260 break; 261 case UCOL_ATTRIBUTE_COUNT: 262 default: 263 break; 264 } 265 } 266 267 #define UTOK_OPTION_COUNT 22 268 269 static UBool didInit = FALSE; 270 /* we can be strict, or we can be lenient */ 271 /* I'd surely be lenient with the option arguments */ 272 /* maybe even with options */ 273 U_STRING_DECL(suboption_00, "non-ignorable", 13); 274 U_STRING_DECL(suboption_01, "shifted", 7); 275 276 U_STRING_DECL(suboption_02, "lower", 5); 277 U_STRING_DECL(suboption_03, "upper", 5); 278 U_STRING_DECL(suboption_04, "off", 3); 279 U_STRING_DECL(suboption_05, "on", 2); 280 U_STRING_DECL(suboption_06, "1", 1); 281 U_STRING_DECL(suboption_07, "2", 1); 282 U_STRING_DECL(suboption_08, "3", 1); 283 U_STRING_DECL(suboption_09, "4", 1); 284 U_STRING_DECL(suboption_10, "I", 1); 285 286 U_STRING_DECL(suboption_11, "primary", 7); 287 U_STRING_DECL(suboption_12, "secondary", 9); 288 U_STRING_DECL(suboption_13, "tertiary", 8); 289 U_STRING_DECL(suboption_14, "variable", 8); 290 U_STRING_DECL(suboption_15, "regular", 7); 291 U_STRING_DECL(suboption_16, "implicit", 8); 292 U_STRING_DECL(suboption_17, "trailing", 8); 293 294 295 U_STRING_DECL(option_00, "undefined", 9); 296 U_STRING_DECL(option_01, "rearrange", 9); 297 U_STRING_DECL(option_02, "alternate", 9); 298 U_STRING_DECL(option_03, "backwards", 9); 299 U_STRING_DECL(option_04, "variable top", 12); 300 U_STRING_DECL(option_05, "top", 3); 301 U_STRING_DECL(option_06, "normalization", 13); 302 U_STRING_DECL(option_07, "caseLevel", 9); 303 U_STRING_DECL(option_08, "caseFirst", 9); 304 U_STRING_DECL(option_09, "scriptOrder", 11); 305 U_STRING_DECL(option_10, "charsetname", 11); 306 U_STRING_DECL(option_11, "charset", 7); 307 U_STRING_DECL(option_12, "before", 6); 308 U_STRING_DECL(option_13, "hiraganaQ", 9); 309 U_STRING_DECL(option_14, "strength", 8); 310 U_STRING_DECL(option_15, "first", 5); 311 U_STRING_DECL(option_16, "last", 4); 312 U_STRING_DECL(option_17, "optimize", 8); 313 U_STRING_DECL(option_18, "suppressContractions", 20); 314 U_STRING_DECL(option_19, "numericOrdering", 15); 315 U_STRING_DECL(option_20, "import", 6); 316 U_STRING_DECL(option_21, "reorder", 7); 317 318 /* 319 [last variable] last variable value 320 [last primary ignorable] largest CE for primary ignorable 321 [last secondary ignorable] largest CE for secondary ignorable 322 [last tertiary ignorable] largest CE for tertiary ignorable 323 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8) 324 */ 325 326 327 static const ucolTokSuboption alternateSub[2] = { 328 {suboption_00, 13, UCOL_NON_IGNORABLE}, 329 {suboption_01, 7, UCOL_SHIFTED} 330 }; 331 332 static const ucolTokSuboption caseFirstSub[3] = { 333 {suboption_02, 5, UCOL_LOWER_FIRST}, 334 {suboption_03, 5, UCOL_UPPER_FIRST}, 335 {suboption_04, 3, UCOL_OFF}, 336 }; 337 338 static const ucolTokSuboption onOffSub[2] = { 339 {suboption_04, 3, UCOL_OFF}, 340 {suboption_05, 2, UCOL_ON} 341 }; 342 343 static const ucolTokSuboption frenchSub[1] = { 344 {suboption_07, 1, UCOL_ON} 345 }; 346 347 static const ucolTokSuboption beforeSub[3] = { 348 {suboption_06, 1, UCOL_PRIMARY}, 349 {suboption_07, 1, UCOL_SECONDARY}, 350 {suboption_08, 1, UCOL_TERTIARY} 351 }; 352 353 static const ucolTokSuboption strengthSub[5] = { 354 {suboption_06, 1, UCOL_PRIMARY}, 355 {suboption_07, 1, UCOL_SECONDARY}, 356 {suboption_08, 1, UCOL_TERTIARY}, 357 {suboption_09, 1, UCOL_QUATERNARY}, 358 {suboption_10, 1, UCOL_IDENTICAL}, 359 }; 360 361 static const ucolTokSuboption firstLastSub[7] = { 362 {suboption_11, 7, UCOL_PRIMARY}, 363 {suboption_12, 9, UCOL_PRIMARY}, 364 {suboption_13, 8, UCOL_PRIMARY}, 365 {suboption_14, 8, UCOL_PRIMARY}, 366 {suboption_15, 7, UCOL_PRIMARY}, 367 {suboption_16, 8, UCOL_PRIMARY}, 368 {suboption_17, 8, UCOL_PRIMARY}, 369 }; 370 371 enum OptionNumber { 372 OPTION_ALTERNATE_HANDLING = 0, 373 OPTION_FRENCH_COLLATION, 374 OPTION_CASE_LEVEL, 375 OPTION_CASE_FIRST, 376 OPTION_NORMALIZATION_MODE, 377 OPTION_HIRAGANA_QUATERNARY, 378 OPTION_STRENGTH, 379 OPTION_NUMERIC_COLLATION, 380 OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION, 381 OPTION_VARIABLE_TOP, 382 OPTION_REARRANGE, 383 OPTION_BEFORE, 384 OPTION_TOP, 385 OPTION_FIRST, 386 OPTION_LAST, 387 OPTION_OPTIMIZE, 388 OPTION_SUPPRESS_CONTRACTIONS, 389 OPTION_UNDEFINED, 390 OPTION_SCRIPT_ORDER, 391 OPTION_CHARSET_NAME, 392 OPTION_CHARSET, 393 OPTION_IMPORT, 394 OPTION_SCRIPTREORDER 395 } ; 396 397 static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = { 398 /*00*/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */ 399 /*01*/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards" */ 400 /*02*/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /*"caseLevel" */ 401 /*03*/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst" */ 402 /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */ 403 /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */ 404 /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */ 405 /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /*"numericOrdering"*/ 406 /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */ 407 /*09*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */ 408 /*10*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */ 409 /*11*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */ 410 /*12*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */ 411 /*13*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */ 412 /*14*/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize" */ 413 /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions" */ 414 /*16*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */ 415 /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */ 416 /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */ 417 /*19*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charset" */ 418 /*20*/ {option_20, 6, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"import" */ 419 /*21*/ {option_21, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"reorder" */ 420 }; 421 422 static 423 int32_t u_strncmpNoCase(const UChar *s1, 424 const UChar *s2, 425 int32_t n) 426 { 427 if(n > 0) { 428 int32_t rc; 429 for(;;) { 430 rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2); 431 if(rc != 0 || *s1 == 0 || --n == 0) { 432 return rc; 433 } 434 ++s1; 435 ++s2; 436 } 437 } 438 return 0; 439 } 440 441 static 442 void ucol_uprv_tok_initData() { 443 if(!didInit) { 444 U_STRING_INIT(suboption_00, "non-ignorable", 13); 445 U_STRING_INIT(suboption_01, "shifted", 7); 446 447 U_STRING_INIT(suboption_02, "lower", 5); 448 U_STRING_INIT(suboption_03, "upper", 5); 449 U_STRING_INIT(suboption_04, "off", 3); 450 U_STRING_INIT(suboption_05, "on", 2); 451 452 U_STRING_INIT(suboption_06, "1", 1); 453 U_STRING_INIT(suboption_07, "2", 1); 454 U_STRING_INIT(suboption_08, "3", 1); 455 U_STRING_INIT(suboption_09, "4", 1); 456 U_STRING_INIT(suboption_10, "I", 1); 457 458 U_STRING_INIT(suboption_11, "primary", 7); 459 U_STRING_INIT(suboption_12, "secondary", 9); 460 U_STRING_INIT(suboption_13, "tertiary", 8); 461 U_STRING_INIT(suboption_14, "variable", 8); 462 U_STRING_INIT(suboption_15, "regular", 7); 463 U_STRING_INIT(suboption_16, "implicit", 8); 464 U_STRING_INIT(suboption_17, "trailing", 8); 465 466 467 U_STRING_INIT(option_00, "undefined", 9); 468 U_STRING_INIT(option_01, "rearrange", 9); 469 U_STRING_INIT(option_02, "alternate", 9); 470 U_STRING_INIT(option_03, "backwards", 9); 471 U_STRING_INIT(option_04, "variable top", 12); 472 U_STRING_INIT(option_05, "top", 3); 473 U_STRING_INIT(option_06, "normalization", 13); 474 U_STRING_INIT(option_07, "caseLevel", 9); 475 U_STRING_INIT(option_08, "caseFirst", 9); 476 U_STRING_INIT(option_09, "scriptOrder", 11); 477 U_STRING_INIT(option_10, "charsetname", 11); 478 U_STRING_INIT(option_11, "charset", 7); 479 U_STRING_INIT(option_12, "before", 6); 480 U_STRING_INIT(option_13, "hiraganaQ", 9); 481 U_STRING_INIT(option_14, "strength", 8); 482 U_STRING_INIT(option_15, "first", 5); 483 U_STRING_INIT(option_16, "last", 4); 484 U_STRING_INIT(option_17, "optimize", 8); 485 U_STRING_INIT(option_18, "suppressContractions", 20); 486 U_STRING_INIT(option_19, "numericOrdering", 15); 487 U_STRING_INIT(option_20, "import ", 6); 488 U_STRING_INIT(option_21, "reorder", 7); 489 didInit = TRUE; 490 } 491 } 492 493 494 // This function reads basic options to set in the runtime collator 495 // used by data driven tests. Should not support build time options 496 U_CAPI const UChar * U_EXPORT2 497 ucol_tok_getNextArgument(const UChar *start, const UChar *end, 498 UColAttribute *attrib, UColAttributeValue *value, 499 UErrorCode *status) 500 { 501 uint32_t i = 0; 502 int32_t j=0; 503 UBool foundOption = FALSE; 504 const UChar *optionArg = NULL; 505 506 ucol_uprv_tok_initData(); 507 508 while(start < end && PatternProps::isWhiteSpace(*start)) { /* eat whitespace */ 509 start++; 510 } 511 if(start >= end) { 512 return NULL; 513 } 514 /* skip opening '[' */ 515 if(*start == 0x005b) { 516 start++; 517 } else { 518 *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '[' 519 return NULL; 520 } 521 522 while(i < UTOK_OPTION_COUNT) { 523 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) { 524 foundOption = TRUE; 525 if(end - start > rulesOptions[i].optionLen) { 526 optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */ 527 while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */ 528 optionArg++; 529 } 530 } 531 break; 532 } 533 i++; 534 } 535 536 if(!foundOption) { 537 *status = U_ILLEGAL_ARGUMENT_ERROR; 538 return NULL; 539 } 540 541 if(optionArg) { 542 for(j = 0; j<rulesOptions[i].subSize; j++) { 543 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { 544 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal); 545 *attrib = rulesOptions[i].attr; 546 *value = rulesOptions[i].subopts[j].attrVal; 547 optionArg += rulesOptions[i].subopts[j].subLen; 548 while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */ 549 optionArg++; 550 } 551 if(*optionArg == 0x005d) { 552 optionArg++; 553 return optionArg; 554 } else { 555 *status = U_ILLEGAL_ARGUMENT_ERROR; 556 return NULL; 557 } 558 } 559 } 560 } 561 *status = U_ILLEGAL_ARGUMENT_ERROR; 562 return NULL; 563 } 564 565 static 566 USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) { 567 while(*start != 0x005b) { /* advance while we find the first '[' */ 568 start++; 569 } 570 // now we need to get a balanced set of '[]'. The problem is that a set can have 571 // many, and *end point to the first closing '[' 572 int32_t noOpenBraces = 1; 573 int32_t current = 1; // skip the opening brace 574 while(start+current < end && noOpenBraces != 0) { 575 if(start[current] == 0x005b) { 576 noOpenBraces++; 577 } else if(start[current] == 0x005D) { // closing brace 578 noOpenBraces--; 579 } 580 current++; 581 } 582 583 if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) { 584 *status = U_ILLEGAL_ARGUMENT_ERROR; 585 return NULL; 586 } 587 return uset_openPattern(start, current, status); 588 } 589 590 /** 591 * Reads an option and matches the option name with the predefined options. (Case-insensitive.) 592 * @param start Pointer to the start UChar. 593 * @param end Pointer to the last valid pointer beyond which the option will not extend. 594 * @param optionArg Address of the pointer at which the options start (after the option name) 595 * @return The index of the option, or -1 if the option is not valid. 596 */ 597 static 598 int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) { 599 int32_t i = 0; 600 ucol_uprv_tok_initData(); 601 602 while(PatternProps::isWhiteSpace(*start)) { /* eat whitespace */ 603 start++; 604 } 605 while(i < UTOK_OPTION_COUNT) { 606 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) { 607 if(end - start > rulesOptions[i].optionLen) { 608 *optionArg = start+rulesOptions[i].optionLen; /* End of option name; start of the options */ 609 while(PatternProps::isWhiteSpace(**optionArg)) { /* eat whitespace */ 610 (*optionArg)++; 611 } 612 } 613 break; 614 } 615 i++; 616 } 617 if(i == UTOK_OPTION_COUNT) { 618 i = -1; // didn't find an option 619 } 620 return i; 621 } 622 623 624 static 625 void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status) { 626 int32_t codeCount = 0; 627 int32_t codeIndex = 0; 628 char conversion[64]; 629 int32_t tokenLength = 0; 630 const UChar* space; 631 632 const UChar* current = src->current; 633 const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current); 634 635 // eat leading whitespace 636 while(current < end && u_isWhitespace(*current)) { 637 current++; 638 } 639 640 while(current < end) { 641 space = u_memchr(current, 0x0020, end - current); 642 space = space == 0 ? end : space; 643 tokenLength = space - current; 644 if (tokenLength < 4) { 645 *status = U_INVALID_FORMAT_ERROR; 646 return; 647 } 648 codeCount++; 649 current += tokenLength; 650 while(current < end && u_isWhitespace(*current)) { /* eat whitespace */ 651 ++current; 652 } 653 } 654 655 if (codeCount == 0) { 656 *status = U_INVALID_FORMAT_ERROR; 657 } 658 659 src->reorderCodesLength = codeCount; 660 src->reorderCodes = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t)); 661 current = src->current; 662 663 // eat leading whitespace 664 while(current < end && u_isWhitespace(*current)) { 665 current++; 666 } 667 668 while(current < end) { 669 space = u_memchr(current, 0x0020, end - current); 670 space = space == 0 ? end : space; 671 tokenLength = space - current; 672 if (tokenLength < 4) { 673 *status = U_ILLEGAL_ARGUMENT_ERROR; 674 return; 675 } else { 676 u_UCharsToChars(current, conversion, tokenLength); 677 conversion[tokenLength] = '\0'; 678 src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion); 679 if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) { 680 src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion); 681 } 682 if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) { 683 *status = U_ILLEGAL_ARGUMENT_ERROR; 684 } 685 } 686 codeIndex++; 687 current += tokenLength; 688 while(current < end && u_isWhitespace(*current)) { /* eat whitespace */ 689 ++current; 690 } 691 } 692 } 693 694 // reads and conforms to various options in rules 695 // end is the position of the first closing ']' 696 // However, some of the options take an UnicodeSet definition 697 // which needs to duplicate the closing ']' 698 // for example: '[copy [\uAC00-\uD7FF]]' 699 // These options will move end to the second ']' and the 700 // caller will set the current to it. 701 static 702 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) { 703 const UChar* start = src->current; 704 int32_t i = 0; 705 int32_t j=0; 706 const UChar *optionArg = NULL; 707 708 uint8_t result = 0; 709 710 start++; /*skip opening '['*/ 711 i = ucol_uprv_tok_readOption(start, src->end, &optionArg); 712 if(optionArg) { 713 src->current = optionArg; 714 } 715 716 if(i < 0) { 717 *status = U_ILLEGAL_ARGUMENT_ERROR; 718 } else { 719 int32_t noOpenBraces = 1; 720 switch(i) { 721 case OPTION_ALTERNATE_HANDLING: 722 case OPTION_FRENCH_COLLATION: 723 case OPTION_CASE_LEVEL: 724 case OPTION_CASE_FIRST: 725 case OPTION_NORMALIZATION_MODE: 726 case OPTION_HIRAGANA_QUATERNARY: 727 case OPTION_STRENGTH: 728 case OPTION_NUMERIC_COLLATION: 729 if(optionArg) { 730 for(j = 0; j<rulesOptions[i].subSize; j++) { 731 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { 732 ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal); 733 result = UCOL_TOK_SUCCESS; 734 } 735 } 736 } 737 if(result == 0) { 738 *status = U_ILLEGAL_ARGUMENT_ERROR; 739 } 740 break; 741 case OPTION_VARIABLE_TOP: 742 result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP; 743 break; 744 case OPTION_REARRANGE: 745 result = UCOL_TOK_SUCCESS; 746 break; 747 case OPTION_BEFORE: 748 if(optionArg) { 749 for(j = 0; j<rulesOptions[i].subSize; j++) { 750 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { 751 result = UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attrVal + 1); 752 } 753 } 754 } 755 if(result == 0) { 756 *status = U_ILLEGAL_ARGUMENT_ERROR; 757 } 758 break; 759 case OPTION_TOP: /* we are going to have an array with structures of limit CEs */ 760 /* index to this array will be src->parsedToken.indirectIndex*/ 761 src->parsedToken.indirectIndex = 0; 762 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP; 763 break; 764 case OPTION_FIRST: 765 case OPTION_LAST: /* first, last */ 766 for(j = 0; j<rulesOptions[i].subSize; j++) { 767 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { 768 // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first 769 // element of indirect boundaries is reserved for top. 770 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2); 771 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;; 772 } 773 } 774 if(result == 0) { 775 *status = U_ILLEGAL_ARGUMENT_ERROR; 776 } 777 break; 778 case OPTION_OPTIMIZE: 779 case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before normalization 780 // we need to move end here 781 src->current++; // skip opening brace 782 while(src->current < src->end && noOpenBraces != 0) { 783 if(*src->current == 0x005b) { 784 noOpenBraces++; 785 } else if(*src->current == 0x005D) { // closing brace 786 noOpenBraces--; 787 } 788 src->current++; 789 } 790 result = UCOL_TOK_SUCCESS; 791 break; 792 case OPTION_SCRIPTREORDER: 793 ucol_tok_parseScriptReorder(src, status); 794 break; 795 default: 796 *status = U_UNSUPPORTED_ERROR; 797 break; 798 } 799 } 800 src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current)); 801 return result; 802 } 803 804 805 inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) { 806 if (stuff == NULL || len <= 0) { 807 return; 808 } 809 UnicodeString tempStuff(FALSE, stuff, len); 810 if(src->extraCurrent+len >= src->extraEnd) { 811 /* reallocate */ 812 if (stuff >= src->source && stuff <= src->end) { 813 // Copy the "stuff" contents into tempStuff's own buffer. 814 // UnicodeString is copy-on-write. 815 if (len > 0) { 816 tempStuff.setCharAt(0, tempStuff[0]); 817 } else { 818 tempStuff.remove(); 819 } 820 } 821 UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar)); 822 if(newSrc != NULL) { 823 src->current = newSrc + (src->current - src->source); 824 src->extraCurrent = newSrc + (src->extraCurrent - src->source); 825 src->end = newSrc + (src->end - src->source); 826 src->extraEnd = newSrc + (src->extraEnd-src->source)*2; 827 src->sourceCurrent = newSrc + (src->sourceCurrent-src->source); 828 src->source = newSrc; 829 } else { 830 *status = U_MEMORY_ALLOCATION_ERROR; 831 return; 832 } 833 } 834 if(len == 1) { 835 *src->extraCurrent++ = tempStuff[0]; 836 } else { 837 u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len); 838 src->extraCurrent += len; 839 } 840 } 841 842 inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) { 843 /* 844 top = TRUE; 845 */ 846 UChar buff[5]; 847 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); 848 buff[0] = 0xFFFE; 849 buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16); 850 buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF); 851 if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) { 852 src->parsedToken.charsLen = 3; 853 ucol_tok_addToExtraCurrent(src, buff, 3, status); 854 } else { 855 buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16); 856 buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF); 857 src->parsedToken.charsLen = 5; 858 ucol_tok_addToExtraCurrent(src, buff, 5, status); 859 } 860 return TRUE; 861 } 862 863 static UBool isCharNewLine(UChar c){ 864 switch(c){ 865 case 0x000A: /* LF */ 866 case 0x000D: /* CR */ 867 case 0x000C: /* FF */ 868 case 0x0085: /* NEL */ 869 case 0x2028: /* LS */ 870 case 0x2029: /* PS */ 871 return TRUE; 872 default: 873 return FALSE; 874 } 875 } 876 877 /* 878 * This function is called several times when a range is processed. Each time, the next code point 879 * is processed. 880 * The following variables must be set before calling this function: 881 * src->currentRangeCp: The current code point to process. 882 * src->lastRangeCp: The last code point in the range. 883 * Pre-requisite: src->currentRangeCp <= src->lastRangeCp. 884 */ 885 static const UChar* 886 ucol_tok_processNextCodePointInRange(UColTokenParser *src, 887 UErrorCode *status) 888 { 889 // Append current code point to source 890 UChar buff[U16_MAX_LENGTH]; 891 uint32_t i = 0; 892 893 uint32_t nChars = U16_LENGTH(src->currentRangeCp); 894 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); 895 src->parsedToken.charsLen = nChars; 896 897 U16_APPEND_UNSAFE(buff, i, src->currentRangeCp); 898 ucol_tok_addToExtraCurrent(src, buff, nChars, status); 899 900 ++src->currentRangeCp; 901 if (src->currentRangeCp > src->lastRangeCp) { 902 src->inRange = FALSE; 903 904 if (src->currentStarredCharIndex > src->lastStarredCharIndex) { 905 src->isStarred = FALSE; 906 } 907 } else { 908 src->previousCp = src->currentRangeCp; 909 } 910 return src->current; 911 } 912 913 /* 914 * This function is called several times when a starred list is processed. Each time, the next code point 915 * in the list is processed. 916 * The following variables must be set before calling this function: 917 * src->currentStarredCharIndex: Index (in src->source) of the first char of the current code point. 918 * src->lastStarredCharIndex: Index to the last character in the list. 919 * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex. 920 */ 921 static const UChar* 922 ucol_tok_processNextTokenInStarredList(UColTokenParser *src) 923 { 924 // Extract the characters corresponding to the next code point. 925 UChar32 cp; 926 src->parsedToken.charsOffset = src->currentStarredCharIndex; 927 int32_t prev = src->currentStarredCharIndex; 928 U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src->source), cp); 929 src->parsedToken.charsLen = src->currentStarredCharIndex - prev; 930 931 // When we are done parsing the starred string, turn the flag off so that 932 // the normal processing is restored. 933 if (src->currentStarredCharIndex > src->lastStarredCharIndex) { 934 src->isStarred = FALSE; 935 } 936 src->previousCp = cp; 937 return src->current; 938 } 939 940 /* 941 * Partially parses the next token, keeps the indices in src->parsedToken, and updates the counters. 942 * 943 * This routine parses and separates almost all tokens. The following are the syntax characters recognized. 944 * # : Comment character 945 * & : Reset operator 946 * = : Equality 947 * < : Primary collation 948 * << : Secondary collation 949 * <<< : Tertiary collation 950 * ; : Secondary collation 951 * , : Tertiary collation 952 * / : Expansions 953 * | : Prefix 954 * - : Range 955 956 * ! : Java Thai modifier, ignored 957 * @ : French only 958 959 * [] : Options 960 * '' : Quotes 961 * 962 * Along with operators =, <, <<, <<<, the operator * is supported to indicate a list. For example, &a<*bcdexyz 963 * is equivalent to &a<b<c<d<e<x<y<z. In lists, ranges also can be given, so &a*b-ex-z is equivalent to the above. 964 * This function do not separate the tokens in a list. Instead, &a<*b-ex-z is parsed as three tokens - "&a", 965 * "<*b", "-ex", "-z". The strength (< in this case), whether in a list, whether in a range and the previous 966 * character returned as cached so that the calling program can do further splitting. 967 */ 968 static const UChar* 969 ucol_tok_parseNextTokenInternal(UColTokenParser *src, 970 UBool startOfRules, 971 UParseError *parseError, 972 UErrorCode *status) 973 { 974 UBool variableTop = FALSE; 975 UBool top = FALSE; 976 UBool inChars = TRUE; 977 UBool inQuote = FALSE; 978 UBool wasInQuote = FALSE; 979 uint8_t before = 0; 980 UBool isEscaped = FALSE; 981 982 // TODO: replace these variables with src->parsedToken counterparts 983 // no need to use them anymore since we have src->parsedToken. 984 // Ideally, token parser would be a nice class... Once, when I have 985 // more time (around 2020 probably). 986 uint32_t newExtensionLen = 0; 987 uint32_t extensionOffset = 0; 988 uint32_t newStrength = UCOL_TOK_UNSET; 989 UChar buff[10]; 990 991 src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0; 992 src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0; 993 src->parsedToken.indirectIndex = 0; 994 995 while (src->current < src->end) { 996 UChar ch = *(src->current); 997 998 if (inQuote) { 999 if (ch == 0x0027/*'\''*/) { 1000 inQuote = FALSE; 1001 } else { 1002 if ((src->parsedToken.charsLen == 0) || inChars) { 1003 if(src->parsedToken.charsLen == 0) { 1004 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); 1005 } 1006 src->parsedToken.charsLen++; 1007 } else { 1008 if(newExtensionLen == 0) { 1009 extensionOffset = (uint32_t)(src->extraCurrent - src->source); 1010 } 1011 newExtensionLen++; 1012 } 1013 } 1014 }else if(isEscaped){ 1015 isEscaped =FALSE; 1016 if (newStrength == UCOL_TOK_UNSET) { 1017 *status = U_INVALID_FORMAT_ERROR; 1018 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1019 DBG_FORMAT_ERROR 1020 return NULL; 1021 // enabling rules to start with non-tokens a < b 1022 // newStrength = UCOL_TOK_RESET; 1023 } 1024 if(ch != 0x0000 && src->current != src->end) { 1025 if (inChars) { 1026 if(src->parsedToken.charsLen == 0) { 1027 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source); 1028 } 1029 src->parsedToken.charsLen++; 1030 } else { 1031 if(newExtensionLen == 0) { 1032 extensionOffset = (uint32_t)(src->current - src->source); 1033 } 1034 newExtensionLen++; 1035 } 1036 } 1037 }else { 1038 if(!PatternProps::isWhiteSpace(ch)) { 1039 /* Sets the strength for this entry */ 1040 switch (ch) { 1041 case 0x003D/*'='*/ : 1042 if (newStrength != UCOL_TOK_UNSET) { 1043 goto EndOfLoop; 1044 } 1045 1046 /* if we start with strength, we'll reset to top */ 1047 if(startOfRules == TRUE) { 1048 src->parsedToken.indirectIndex = 5; 1049 top = ucol_tok_doSetTop(src, status); 1050 newStrength = UCOL_TOK_RESET; 1051 goto EndOfLoop; 1052 } 1053 newStrength = UCOL_IDENTICAL; 1054 if(*(src->current+1) == 0x002A) {/*'*'*/ 1055 src->current++; 1056 src->isStarred = TRUE; 1057 } 1058 break; 1059 1060 case 0x002C/*','*/: 1061 if (newStrength != UCOL_TOK_UNSET) { 1062 goto EndOfLoop; 1063 } 1064 1065 /* if we start with strength, we'll reset to top */ 1066 if(startOfRules == TRUE) { 1067 src->parsedToken.indirectIndex = 5; 1068 top = ucol_tok_doSetTop(src, status); 1069 newStrength = UCOL_TOK_RESET; 1070 goto EndOfLoop; 1071 } 1072 newStrength = UCOL_TERTIARY; 1073 break; 1074 1075 case 0x003B/*';'*/: 1076 if (newStrength != UCOL_TOK_UNSET) { 1077 goto EndOfLoop; 1078 } 1079 1080 /* if we start with strength, we'll reset to top */ 1081 if(startOfRules == TRUE) { 1082 src->parsedToken.indirectIndex = 5; 1083 top = ucol_tok_doSetTop(src, status); 1084 newStrength = UCOL_TOK_RESET; 1085 goto EndOfLoop; 1086 } 1087 newStrength = UCOL_SECONDARY; 1088 break; 1089 1090 case 0x003C/*'<'*/: 1091 if (newStrength != UCOL_TOK_UNSET) { 1092 goto EndOfLoop; 1093 } 1094 1095 /* if we start with strength, we'll reset to top */ 1096 if(startOfRules == TRUE) { 1097 src->parsedToken.indirectIndex = 5; 1098 top = ucol_tok_doSetTop(src, status); 1099 newStrength = UCOL_TOK_RESET; 1100 goto EndOfLoop; 1101 } 1102 /* before this, do a scan to verify whether this is */ 1103 /* another strength */ 1104 if(*(src->current+1) == 0x003C) { 1105 src->current++; 1106 if(*(src->current+1) == 0x003C) { 1107 src->current++; /* three in a row! */ 1108 newStrength = UCOL_TERTIARY; 1109 } else { /* two in a row */ 1110 newStrength = UCOL_SECONDARY; 1111 } 1112 } else { /* just one */ 1113 newStrength = UCOL_PRIMARY; 1114 } 1115 if(*(src->current+1) == 0x002A) {/*'*'*/ 1116 src->current++; 1117 src->isStarred = TRUE; 1118 } 1119 break; 1120 1121 case 0x0026/*'&'*/: 1122 if (newStrength != UCOL_TOK_UNSET) { 1123 /**/ 1124 goto EndOfLoop; 1125 } 1126 1127 newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */ 1128 break; 1129 1130 case 0x005b/*'['*/: 1131 /* options - read an option, analyze it */ 1132 if(u_strchr(src->current, 0x005d /*']'*/) != NULL) { 1133 uint8_t result = ucol_uprv_tok_readAndSetOption(src, status); 1134 if(U_SUCCESS(*status)) { 1135 if(result & UCOL_TOK_TOP) { 1136 if(newStrength == UCOL_TOK_RESET) { 1137 top = ucol_tok_doSetTop(src, status); 1138 if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b' 1139 src->parsedToken.charsLen+=2; 1140 buff[0] = 0x002d; 1141 buff[1] = before; 1142 ucol_tok_addToExtraCurrent(src, buff, 2, status); 1143 } 1144 1145 src->current++; 1146 goto EndOfLoop; 1147 } else { 1148 *status = U_INVALID_FORMAT_ERROR; 1149 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1150 DBG_FORMAT_ERROR 1151 } 1152 } else if(result & UCOL_TOK_VARIABLE_TOP) { 1153 if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) { 1154 variableTop = TRUE; 1155 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); 1156 src->parsedToken.charsLen = 1; 1157 buff[0] = 0xFFFF; 1158 ucol_tok_addToExtraCurrent(src, buff, 1, status); 1159 src->current++; 1160 goto EndOfLoop; 1161 } else { 1162 *status = U_INVALID_FORMAT_ERROR; 1163 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1164 DBG_FORMAT_ERROR 1165 } 1166 } else if (result & UCOL_TOK_BEFORE){ 1167 if(newStrength == UCOL_TOK_RESET) { 1168 before = result & UCOL_TOK_BEFORE; 1169 } else { 1170 *status = U_INVALID_FORMAT_ERROR; 1171 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1172 DBG_FORMAT_ERROR 1173 } 1174 } 1175 } else { 1176 *status = U_INVALID_FORMAT_ERROR; 1177 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1178 DBG_FORMAT_ERROR 1179 return NULL; 1180 } 1181 } 1182 break; 1183 case 0x0021/*! skip java thai modifier reordering*/: 1184 break; 1185 case 0x002F/*'/'*/: 1186 wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */ 1187 inChars = FALSE; /* we're now processing expansion */ 1188 break; 1189 case 0x005C /* back slash for escaped chars */: 1190 isEscaped = TRUE; 1191 break; 1192 /* found a quote, we're gonna start copying */ 1193 case 0x0027/*'\''*/: 1194 if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */ 1195 *status = U_INVALID_FORMAT_ERROR; 1196 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1197 DBG_FORMAT_ERROR 1198 return NULL; 1199 // enabling rules to start with a non-token character a < b 1200 // newStrength = UCOL_TOK_RESET; 1201 } 1202 1203 inQuote = TRUE; 1204 1205 if(inChars) { /* we're doing characters */ 1206 if(wasInQuote == FALSE) { 1207 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); 1208 } 1209 if (src->parsedToken.charsLen != 0) { 1210 ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status); 1211 } 1212 src->parsedToken.charsLen++; 1213 } else { /* we're doing an expansion */ 1214 if(wasInQuote == FALSE) { 1215 extensionOffset = (uint32_t)(src->extraCurrent - src->source); 1216 } 1217 if (newExtensionLen != 0) { 1218 ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status); 1219 } 1220 newExtensionLen++; 1221 } 1222 1223 wasInQuote = TRUE; 1224 1225 ch = *(++(src->current)); 1226 if(ch == 0x0027) { /* copy the double quote */ 1227 ucol_tok_addToExtraCurrent(src, &ch, 1, status); 1228 inQuote = FALSE; 1229 } 1230 break; 1231 1232 /* '@' is french only if the strength is not currently set */ 1233 /* if it is, it's just a regular character in collation rules */ 1234 case 0x0040/*'@'*/: 1235 if (newStrength == UCOL_TOK_UNSET) { 1236 src->opts->frenchCollation = UCOL_ON; 1237 break; 1238 } 1239 1240 case 0x007C /*|*/: /* this means we have actually been reading prefix part */ 1241 // we want to store read characters to the prefix part and continue reading 1242 // the characters (proper way would be to restart reading the chars, but in 1243 // that case we would have to complicate the token hasher, which I do not 1244 // intend to play with. Instead, we will do prefixes when prefixes are due 1245 // (before adding the elements). 1246 src->parsedToken.prefixOffset = src->parsedToken.charsOffset; 1247 src->parsedToken.prefixLen = src->parsedToken.charsLen; 1248 1249 if(inChars) { /* we're doing characters */ 1250 if(wasInQuote == FALSE) { 1251 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); 1252 } 1253 if (src->parsedToken.charsLen != 0) { 1254 ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status); 1255 } 1256 src->parsedToken.charsLen++; 1257 } 1258 1259 wasInQuote = TRUE; 1260 1261 do { 1262 ch = *(++(src->current)); 1263 // skip whitespace between '|' and the character 1264 } while (PatternProps::isWhiteSpace(ch)); 1265 break; 1266 1267 //charsOffset = 0; 1268 //newCharsLen = 0; 1269 //break; // We want to store the whole prefix/character sequence. If we break 1270 // the '|' is going to get lost. 1271 1272 case 0x002D /*-*/: /* A range. */ 1273 if (newStrength != UCOL_TOK_UNSET) { 1274 // While processing the pending token, the isStarred field 1275 // is reset, so it needs to be saved for the next 1276 // invocation. 1277 src->savedIsStarred = src->isStarred; 1278 goto EndOfLoop; 1279 } 1280 src->isStarred = src->savedIsStarred; 1281 1282 // Ranges are valid only in starred tokens. 1283 if (!src->isStarred) { 1284 *status = U_INVALID_FORMAT_ERROR; 1285 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1286 DBG_FORMAT_ERROR 1287 return NULL; 1288 } 1289 newStrength = src->parsedToken.strength; 1290 src->inRange = TRUE; 1291 break; 1292 1293 case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */ 1294 do { 1295 ch = *(++(src->current)); 1296 } while (!isCharNewLine(ch)); 1297 1298 break; 1299 default: 1300 if (newStrength == UCOL_TOK_UNSET) { 1301 *status = U_INVALID_FORMAT_ERROR; 1302 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1303 DBG_FORMAT_ERROR 1304 return NULL; 1305 } 1306 1307 if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) { 1308 *status = U_INVALID_FORMAT_ERROR; 1309 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1310 DBG_FORMAT_ERROR 1311 return NULL; 1312 } 1313 1314 if(ch == 0x0000 && src->current+1 == src->end) { 1315 break; 1316 } 1317 1318 if (inChars) { 1319 if(src->parsedToken.charsLen == 0) { 1320 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source); 1321 } 1322 src->parsedToken.charsLen++; 1323 } else { 1324 if(newExtensionLen == 0) { 1325 extensionOffset = (uint32_t)(src->current - src->source); 1326 } 1327 newExtensionLen++; 1328 } 1329 1330 break; 1331 } 1332 } 1333 } 1334 1335 if(wasInQuote) { 1336 if(ch != 0x27) { 1337 if(inQuote || !PatternProps::isWhiteSpace(ch)) { 1338 ucol_tok_addToExtraCurrent(src, &ch, 1, status); 1339 } 1340 } 1341 } 1342 1343 src->current++; 1344 } 1345 1346 EndOfLoop: 1347 wasInQuote = FALSE; 1348 if (newStrength == UCOL_TOK_UNSET) { 1349 return NULL; 1350 } 1351 1352 if (src->parsedToken.charsLen == 0 && top == FALSE) { 1353 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1354 *status = U_INVALID_FORMAT_ERROR; 1355 DBG_FORMAT_ERROR 1356 return NULL; 1357 } 1358 1359 src->parsedToken.strength = newStrength; 1360 src->parsedToken.extensionOffset = extensionOffset; 1361 src->parsedToken.extensionLen = newExtensionLen; 1362 src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before; 1363 1364 return src->current; 1365 } 1366 1367 /* 1368 * Parses the next token, keeps the indices in src->parsedToken, and updates the counters. 1369 * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported. 1370 * 1371 * In addition to what ucol_tok_parseNextTokenInternal() does, this function does the following: 1372 * 1) ucol_tok_parseNextTokenInternal() returns a range as a single token. This function separates 1373 * it to separate tokens and returns one by one. In order to do that, the necessary states are 1374 * cached as member variables of the token parser. 1375 * 2) When encountering a range, ucol_tok_parseNextTokenInternal() processes characters up to the 1376 * starting character as a single list token (which is separated into individual characters here) 1377 * and as another list token starting with the last character in the range. Before expanding it 1378 * as a list of tokens, this function expands the range by filling the intermediate characters and 1379 * returns them one by one as separate tokens. 1380 * Necessary checks are done for invalid combinations. 1381 */ 1382 U_CAPI const UChar* U_EXPORT2 1383 ucol_tok_parseNextToken(UColTokenParser *src, 1384 UBool startOfRules, 1385 UParseError *parseError, 1386 UErrorCode *status) 1387 { 1388 const UChar *nextToken; 1389 1390 if (src->inRange) { 1391 // We are not done processing a range. Continue it. 1392 return ucol_tok_processNextCodePointInRange(src, status); 1393 } else if (src->isStarred) { 1394 // We are not done processing a starred token. Continue it. 1395 return ucol_tok_processNextTokenInStarredList(src); 1396 } 1397 1398 // Get the next token. 1399 nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, status); 1400 1401 if (nextToken == NULL) { 1402 return NULL; 1403 } 1404 1405 if (src->inRange) { 1406 // A new range has started. 1407 // Check whether it is a chain of ranges with more than one hyphen. 1408 if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) { 1409 *status = U_INVALID_FORMAT_ERROR; 1410 syntaxError(src->source,src->parsedToken.charsOffset-1, 1411 src->parsedToken.charsOffset+src->parsedToken.charsLen, parseError); 1412 DBG_FORMAT_ERROR 1413 return NULL; 1414 } 1415 1416 // The current token indicates the second code point of the range. 1417 // Process just that, and then proceed with the star. 1418 src->currentStarredCharIndex = src->parsedToken.charsOffset; 1419 U16_NEXT(src->source, src->currentStarredCharIndex, 1420 (uint32_t)(src->end - src->source), src->lastRangeCp); 1421 if (src->lastRangeCp <= src->previousCp) { 1422 *status = U_INVALID_FORMAT_ERROR; 1423 syntaxError(src->source,src->parsedToken.charsOffset-1, 1424 src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError); 1425 DBG_FORMAT_ERROR 1426 return NULL; 1427 } 1428 1429 // Set current range code point to process the range loop 1430 src->currentRangeCp = src->previousCp + 1; 1431 1432 src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1; 1433 1434 return ucol_tok_processNextCodePointInRange(src, status); 1435 } else if (src->isStarred) { 1436 // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that 1437 // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be 1438 // separated into several tokens and returned. 1439 src->currentStarredCharIndex = src->parsedToken.charsOffset; 1440 src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1; 1441 1442 return ucol_tok_processNextTokenInStarredList(src); 1443 } else { 1444 // Set previous codepoint 1445 U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end - src->source), src->previousCp); 1446 } 1447 return nextToken; 1448 } 1449 1450 1451 /* 1452 Processing Description 1453 1 Build a ListList. Each list has a header, which contains two lists (positive 1454 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and 1455 reset may be null. 1456 2 As you process, you keep a LAST pointer that points to the last token you 1457 handled. 1458 1459 */ 1460 1461 static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext, 1462 UParseError *parseError, UErrorCode *status) 1463 { 1464 if(src->resultLen == src->listCapacity) { 1465 // Unfortunately, this won't work, as we store addresses of lhs in token 1466 src->listCapacity *= 2; 1467 src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader)); 1468 if(src->lh == NULL) { 1469 *status = U_MEMORY_ALLOCATION_ERROR; 1470 return NULL; 1471 } 1472 } 1473 /* do the reset thing */ 1474 UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); 1475 /* test for NULL */ 1476 if (sourceToken == NULL) { 1477 *status = U_MEMORY_ALLOCATION_ERROR; 1478 return NULL; 1479 } 1480 sourceToken->rulesToParseHdl = &(src->source); 1481 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; 1482 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset; 1483 1484 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset); 1485 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset); 1486 1487 // keep the flags around so that we know about before 1488 sourceToken->flags = src->parsedToken.flags; 1489 1490 if(src->parsedToken.prefixOffset != 0) { 1491 // this is a syntax error 1492 *status = U_INVALID_FORMAT_ERROR; 1493 syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError); 1494 DBG_FORMAT_ERROR 1495 uprv_free(sourceToken); 1496 return 0; 1497 } else { 1498 sourceToken->prefix = 0; 1499 } 1500 1501 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */ 1502 sourceToken->strength = UCOL_TOK_RESET; 1503 sourceToken->next = NULL; 1504 sourceToken->previous = NULL; 1505 sourceToken->noOfCEs = 0; 1506 sourceToken->noOfExpCEs = 0; 1507 sourceToken->listHeader = &src->lh[src->resultLen]; 1508 1509 src->lh[src->resultLen].first = NULL; 1510 src->lh[src->resultLen].last = NULL; 1511 src->lh[src->resultLen].first = NULL; 1512 src->lh[src->resultLen].last = NULL; 1513 1514 src->lh[src->resultLen].reset = sourceToken; 1515 1516 /* 1517 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ... 1518 First convert all expansions into normal form. Examples: 1519 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * 1520 d * ... into &x * c/y * d * ... 1521 Note: reset values can never have expansions, although they can cause the 1522 very next item to have one. They may be contractions, if they are found 1523 earlier in the list. 1524 */ 1525 *expandNext = 0; 1526 if(expand != NULL) { 1527 /* check to see if there is an expansion */ 1528 if(src->parsedToken.charsLen > 1) { 1529 uint32_t resetCharsOffset; 1530 resetCharsOffset = (uint32_t)(expand - src->source); 1531 sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset; 1532 *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset); 1533 } 1534 } 1535 1536 src->resultLen++; 1537 1538 uhash_put(src->tailored, sourceToken, sourceToken, status); 1539 1540 return sourceToken; 1541 } 1542 1543 static 1544 inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) { 1545 if(U_FAILURE(*status)) { 1546 return NULL; 1547 } 1548 /* this is a virgin before - we need to fish the anchor from the UCA */ 1549 collIterate s; 1550 uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND; 1551 uint32_t CE, SecondCE; 1552 // uint32_t invPos; 1553 if(sourceToken != NULL) { 1554 uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status); 1555 } else { 1556 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status); 1557 } 1558 if(U_FAILURE(*status)) { 1559 return NULL; 1560 } 1561 1562 baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F; 1563 baseContCE = ucol_getNextCE(src->UCA, &s, status); 1564 if(baseContCE == UCOL_NO_MORE_CES) { 1565 baseContCE = 0; 1566 } 1567 1568 1569 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); 1570 uint32_t ch = 0; 1571 uint32_t expandNext = 0; 1572 UColToken key; 1573 1574 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ 1575 uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16); 1576 uint32_t raw = uprv_uca_getRawFromImplicit(primary); 1577 ch = uprv_uca_getCodePointFromRaw(raw-1); 1578 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1); 1579 CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; 1580 SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER; 1581 1582 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); 1583 *src->extraCurrent++ = 0xFFFE; 1584 *src->extraCurrent++ = (UChar)ch; 1585 src->parsedToken.charsLen++; 1586 1587 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/; 1588 key.rulesToParseHdl = &(src->source); 1589 1590 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key); 1591 sourceToken = (UColToken *)uhash_get(src->tailored, &key); 1592 1593 if(sourceToken == NULL) { 1594 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F; 1595 if(isContinuation(SecondCE)) { 1596 src->lh[src->resultLen].baseContCE = SecondCE; 1597 } else { 1598 src->lh[src->resultLen].baseContCE = 0; 1599 } 1600 src->lh[src->resultLen].nextCE = 0; 1601 src->lh[src->resultLen].nextContCE = 0; 1602 src->lh[src->resultLen].previousCE = 0; 1603 src->lh[src->resultLen].previousContCE = 0; 1604 1605 src->lh[src->resultLen].indirect = FALSE; 1606 1607 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); 1608 } 1609 1610 } else { 1611 /* invPos = */ ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength); 1612 1613 // we got the previous CE. Now we need to see if the difference between 1614 // the two CEs is really of the requested strength. 1615 // if it's a bigger difference (we asked for secondary and got primary), we 1616 // need to modify the CE. 1617 if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) { 1618 // adjust the strength 1619 // now we are in the situation where our baseCE should actually be modified in 1620 // order to get the CE in the right position. 1621 if(strength == UCOL_SECONDARY) { 1622 CE = baseCE - 0x0200; 1623 } else { // strength == UCOL_TERTIARY 1624 CE = baseCE - 0x02; 1625 } 1626 if(baseContCE) { 1627 if(strength == UCOL_SECONDARY) { 1628 SecondCE = baseContCE - 0x0200; 1629 } else { // strength == UCOL_TERTIARY 1630 SecondCE = baseContCE - 0x02; 1631 } 1632 } 1633 } 1634 1635 #if 0 1636 // the code below relies on getting a code point from the inverse table, in order to be 1637 // able to merge the situations like &x < 9 &[before 1]a < d. This won't work: 1638 // 1. There are many code points that have the same CE 1639 // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken. 1640 // Also, in case when there is no equivalent strength before an element, we have to actually 1641 // construct one. For example, &[before 2]a << x won't result in x << a, because the element 1642 // before a is a primary difference. 1643 1644 //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 1645 1646 1647 ch = CETable[3*invPos+2]; 1648 1649 if((ch & UCOL_INV_SIZEMASK) != 0) { 1650 uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts); 1651 uint32_t offset = (ch & UCOL_INV_OFFSETMASK); 1652 ch = conts[offset]; 1653 } 1654 1655 *src->extraCurrent++ = (UChar)ch; 1656 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1); 1657 src->parsedToken.charsLen = 1; 1658 1659 // We got an UCA before. However, this might have been tailored. 1660 // example: 1661 // &\u30ca = \u306a 1662 // &[before 3]\u306a<<<\u306a|\u309d 1663 1664 1665 // uint32_t key = (*newCharsLen << 24) | *charsOffset; 1666 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/; 1667 key.rulesToParseHdl = &(src->source); 1668 1669 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key); 1670 sourceToken = (UColToken *)uhash_get(src->tailored, &key); 1671 #endif 1672 1673 // here is how it should be. The situation such as &[before 1]a < x, should be 1674 // resolved exactly as if we wrote &a > x. 1675 // therefore, I don't really care if the UCA value before a has been changed. 1676 // However, I do care if the strength between my element and the previous element 1677 // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll 1678 // have to construct the base CE. 1679 1680 1681 1682 // if we found a tailored thing, we have to use the UCA value and construct 1683 // a new reset token with constructed name 1684 //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) { 1685 // character to which we want to anchor is already tailored. 1686 // We need to construct a new token which will be the anchor 1687 // point 1688 //*(src->extraCurrent-1) = 0xFFFE; 1689 //*src->extraCurrent++ = (UChar)ch; 1690 // grab before 1691 src->parsedToken.charsOffset -= 10; 1692 src->parsedToken.charsLen += 10; 1693 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F; 1694 if(isContinuation(SecondCE)) { 1695 src->lh[src->resultLen].baseContCE = SecondCE; 1696 } else { 1697 src->lh[src->resultLen].baseContCE = 0; 1698 } 1699 src->lh[src->resultLen].nextCE = 0; 1700 src->lh[src->resultLen].nextContCE = 0; 1701 src->lh[src->resultLen].previousCE = 0; 1702 src->lh[src->resultLen].previousContCE = 0; 1703 1704 src->lh[src->resultLen].indirect = FALSE; 1705 1706 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); 1707 //} 1708 } 1709 1710 return sourceToken; 1711 1712 } 1713 1714 uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) { 1715 UColToken *lastToken = NULL; 1716 const UChar *parseEnd = NULL; 1717 uint32_t expandNext = 0; 1718 UBool variableTop = FALSE; 1719 UBool top = FALSE; 1720 uint16_t specs = 0; 1721 UColTokListHeader *ListList = NULL; 1722 1723 src->parsedToken.strength = UCOL_TOK_UNSET; 1724 1725 ListList = src->lh; 1726 1727 if(U_FAILURE(*status)) { 1728 return 0; 1729 } 1730 #ifdef DEBUG_FOR_CODE_POINTS 1731 char filename[35]; 1732 sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid()); 1733 dfcp_fp = fopen(filename, "a"); 1734 fprintf(stdout, "Output is in the file %s.\n", filename); 1735 #endif 1736 1737 #ifdef DEBUG_FOR_COLL_RULES 1738 std::string s3; 1739 UnicodeString(src->source).toUTF8String(s3); 1740 std::cout << "src->source = " << s3 << std::endl; 1741 #endif 1742 1743 while(src->current < src->end || src->isStarred) { 1744 src->parsedToken.prefixOffset = 0; 1745 1746 parseEnd = ucol_tok_parseNextToken(src, 1747 (UBool)(lastToken == NULL), 1748 parseError, 1749 status); 1750 1751 specs = src->parsedToken.flags; 1752 1753 1754 variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0); 1755 top = ((specs & UCOL_TOK_TOP) != 0); 1756 1757 if(U_SUCCESS(*status) && parseEnd != NULL) { 1758 UColToken *sourceToken = NULL; 1759 //uint32_t key = 0; 1760 uint32_t lastStrength = UCOL_TOK_UNSET; 1761 1762 if(lastToken != NULL ) { 1763 lastStrength = lastToken->strength; 1764 } 1765 1766 #ifdef DEBUG_FOR_CODE_POINTS 1767 UChar32 cp; 1768 U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->extraEnd - src->source), cp); 1769 fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsedToken.strength); 1770 #endif 1771 //key = newCharsLen << 24 | charsOffset; 1772 UColToken key; 1773 key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; 1774 key.rulesToParseHdl = &(src->source); 1775 1776 /* 4 Lookup each source in the CharsToToken map, and find a sourceToken */ 1777 sourceToken = (UColToken *)uhash_get(src->tailored, &key); 1778 1779 if(src->parsedToken.strength != UCOL_TOK_RESET) { 1780 if(lastToken == NULL) { /* this means that rules haven't started properly */ 1781 *status = U_INVALID_FORMAT_ERROR; 1782 syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError); 1783 DBG_FORMAT_ERROR 1784 return 0; 1785 } 1786 /* 6 Otherwise (when relation != reset) */ 1787 if(sourceToken == NULL) { 1788 /* If sourceToken is null, create new one, */ 1789 sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); 1790 /* test for NULL */ 1791 if (sourceToken == NULL) { 1792 *status = U_MEMORY_ALLOCATION_ERROR; 1793 return 0; 1794 } 1795 sourceToken->rulesToParseHdl = &(src->source); 1796 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; 1797 1798 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset); 1799 1800 sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset; 1801 sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset); 1802 1803 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */ 1804 sourceToken->next = NULL; 1805 sourceToken->previous = NULL; 1806 sourceToken->noOfCEs = 0; 1807 sourceToken->noOfExpCEs = 0; 1808 // keep the flags around so that we know about before 1809 sourceToken->flags = src->parsedToken.flags; 1810 uhash_put(src->tailored, sourceToken, sourceToken, status); 1811 if(U_FAILURE(*status)) { 1812 return 0; 1813 } 1814 } else { 1815 /* we could have fished out a reset here */ 1816 if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) { 1817 /* otherwise remove sourceToken from where it was. */ 1818 if(sourceToken->next != NULL) { 1819 if(sourceToken->next->strength > sourceToken->strength) { 1820 sourceToken->next->strength = sourceToken->strength; 1821 } 1822 sourceToken->next->previous = sourceToken->previous; 1823 } else { 1824 sourceToken->listHeader->last = sourceToken->previous; 1825 } 1826 1827 if(sourceToken->previous != NULL) { 1828 sourceToken->previous->next = sourceToken->next; 1829 } else { 1830 sourceToken->listHeader->first = sourceToken->next; 1831 } 1832 sourceToken->next = NULL; 1833 sourceToken->previous = NULL; 1834 } 1835 } 1836 1837 sourceToken->strength = src->parsedToken.strength; 1838 sourceToken->listHeader = lastToken->listHeader; 1839 1840 /* 1841 1. Find the strongest strength in each list, and set strongestP and strongestN 1842 accordingly in the headers. 1843 */ 1844 if(lastStrength == UCOL_TOK_RESET 1845 || sourceToken->listHeader->first == 0) { 1846 /* If LAST is a reset 1847 insert sourceToken in the list. */ 1848 if(sourceToken->listHeader->first == 0) { 1849 sourceToken->listHeader->first = sourceToken; 1850 sourceToken->listHeader->last = sourceToken; 1851 } else { /* we need to find a place for us */ 1852 /* and we'll get in front of the same strength */ 1853 if(sourceToken->listHeader->first->strength <= sourceToken->strength) { 1854 sourceToken->next = sourceToken->listHeader->first; 1855 sourceToken->next->previous = sourceToken; 1856 sourceToken->listHeader->first = sourceToken; 1857 sourceToken->previous = NULL; 1858 } else { 1859 lastToken = sourceToken->listHeader->first; 1860 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) { 1861 lastToken = lastToken->next; 1862 } 1863 if(lastToken->next != NULL) { 1864 lastToken->next->previous = sourceToken; 1865 } else { 1866 sourceToken->listHeader->last = sourceToken; 1867 } 1868 sourceToken->previous = lastToken; 1869 sourceToken->next = lastToken->next; 1870 lastToken->next = sourceToken; 1871 } 1872 } 1873 } else { 1874 /* Otherwise (when LAST is not a reset) 1875 if polarity (LAST) == polarity(relation), insert sourceToken after LAST, 1876 otherwise insert before. 1877 when inserting after or before, search to the next position with the same 1878 strength in that direction. (This is called postpone insertion). */ 1879 if(sourceToken != lastToken) { 1880 if(lastToken->polarity == sourceToken->polarity) { 1881 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) { 1882 lastToken = lastToken->next; 1883 } 1884 sourceToken->previous = lastToken; 1885 if(lastToken->next != NULL) { 1886 lastToken->next->previous = sourceToken; 1887 } else { 1888 sourceToken->listHeader->last = sourceToken; 1889 } 1890 1891 sourceToken->next = lastToken->next; 1892 lastToken->next = sourceToken; 1893 } else { 1894 while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) { 1895 lastToken = lastToken->previous; 1896 } 1897 sourceToken->next = lastToken; 1898 if(lastToken->previous != NULL) { 1899 lastToken->previous->next = sourceToken; 1900 } else { 1901 sourceToken->listHeader->first = sourceToken; 1902 } 1903 sourceToken->previous = lastToken->previous; 1904 lastToken->previous = sourceToken; 1905 } 1906 } else { /* repeated one thing twice in rules, stay with the stronger strength */ 1907 if(lastStrength < sourceToken->strength) { 1908 sourceToken->strength = lastStrength; 1909 } 1910 } 1911 } 1912 1913 /* if the token was a variable top, we're gonna put it in */ 1914 if(variableTop == TRUE && src->varTop == NULL) { 1915 variableTop = FALSE; 1916 src->varTop = sourceToken; 1917 } 1918 1919 // Treat the expansions. 1920 // There are two types of expansions: explicit (x / y) and reset based propagating expansions 1921 // (&abc * d * e <=> &ab * d / c * e / c) 1922 // if both of them are in effect for a token, they are combined. 1923 1924 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset; 1925 1926 if(expandNext != 0) { 1927 if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */ 1928 expandNext = 0; 1929 } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */ 1930 sourceToken->expansion = expandNext; 1931 } else { /* there is both explicit and implicit expansion. We need to make a combination */ 1932 uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar)); 1933 uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar)); 1934 sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source)); 1935 src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen; 1936 } 1937 } 1938 1939 // This is just for debugging purposes 1940 if(sourceToken->expansion != 0) { 1941 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset); 1942 } else { 1943 sourceToken->debugExpansion = 0; 1944 } 1945 // if the previous token was a reset before, the strength of this 1946 // token must match the strength of before. Otherwise we have an 1947 // undefined situation. 1948 // In other words, we currently have a cludge which we use to 1949 // represent &a >> x. This is written as &[before 2]a << x. 1950 if((lastToken->flags & UCOL_TOK_BEFORE) != 0) { 1951 uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1; 1952 if(beforeStrength != sourceToken->strength) { 1953 *status = U_INVALID_FORMAT_ERROR; 1954 syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError); 1955 DBG_FORMAT_ERROR 1956 return 0; 1957 } 1958 } 1959 } else { 1960 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) { 1961 /* if the previous token was also a reset, */ 1962 /*this means that we have two consecutive resets */ 1963 /* and we want to remove the previous one if empty*/ 1964 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) { 1965 src->resultLen--; 1966 } 1967 } 1968 1969 if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */ 1970 uint32_t searchCharsLen = src->parsedToken.charsLen; 1971 while(searchCharsLen > 1 && sourceToken == NULL) { 1972 searchCharsLen--; 1973 //key = searchCharsLen << 24 | charsOffset; 1974 UColToken key; 1975 key.source = searchCharsLen << 24 | src->parsedToken.charsOffset; 1976 key.rulesToParseHdl = &(src->source); 1977 sourceToken = (UColToken *)uhash_get(src->tailored, &key); 1978 } 1979 if(sourceToken != NULL) { 1980 expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen); 1981 } 1982 } 1983 1984 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */ 1985 if(top == FALSE) { /* there is no indirection */ 1986 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; 1987 if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) { 1988 /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */ 1989 while(sourceToken->strength > strength && sourceToken->previous != NULL) { 1990 sourceToken = sourceToken->previous; 1991 } 1992 /* here, either we hit the strength or NULL */ 1993 if(sourceToken->strength == strength) { 1994 if(sourceToken->previous != NULL) { 1995 sourceToken = sourceToken->previous; 1996 } else { /* start of list */ 1997 sourceToken = sourceToken->listHeader->reset; 1998 } 1999 } else { /* we hit NULL */ 2000 /* we should be doing the else part */ 2001 sourceToken = sourceToken->listHeader->reset; 2002 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status); 2003 } 2004 } else { 2005 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status); 2006 } 2007 } else { /* this is both before and indirection */ 2008 top = FALSE; 2009 ListList[src->resultLen].previousCE = 0; 2010 ListList[src->resultLen].previousContCE = 0; 2011 ListList[src->resultLen].indirect = TRUE; 2012 /* we need to do slightly more work. we need to get the baseCE using the */ 2013 /* inverse UCA & getPrevious. The next bound is not set, and will be decided */ 2014 /* in ucol_bld */ 2015 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; 2016 uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE; 2017 uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F; 2018 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; 2019 2020 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); 2021 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && 2022 (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ 2023 uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16); 2024 uint32_t raw = uprv_uca_getRawFromImplicit(primary); 2025 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1); 2026 CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; 2027 SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER; 2028 } else { 2029 /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/ 2030 ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength); 2031 } 2032 2033 ListList[src->resultLen].baseCE = CE; 2034 ListList[src->resultLen].baseContCE = SecondCE; 2035 ListList[src->resultLen].nextCE = 0; 2036 ListList[src->resultLen].nextContCE = 0; 2037 2038 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); 2039 } 2040 } 2041 2042 2043 /* 5 If the relation is a reset: 2044 If sourceToken is null 2045 Create new list, create new sourceToken, make the baseCE from source, put 2046 the sourceToken in ListHeader of the new list */ 2047 if(sourceToken == NULL) { 2048 /* 2049 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ... 2050 First convert all expansions into normal form. Examples: 2051 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * 2052 d * ... into &x * c/y * d * ... 2053 Note: reset values can never have expansions, although they can cause the 2054 very next item to have one. They may be contractions, if they are found 2055 earlier in the list. 2056 */ 2057 if(top == FALSE) { 2058 collIterate s; 2059 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; 2060 2061 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status); 2062 2063 CE = ucol_getNextCE(src->UCA, &s, status); 2064 const UChar *expand = s.pos; 2065 SecondCE = ucol_getNextCE(src->UCA, &s, status); 2066 2067 ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F; 2068 if(isContinuation(SecondCE)) { 2069 ListList[src->resultLen].baseContCE = SecondCE; 2070 } else { 2071 ListList[src->resultLen].baseContCE = 0; 2072 } 2073 ListList[src->resultLen].nextCE = 0; 2074 ListList[src->resultLen].nextContCE = 0; 2075 ListList[src->resultLen].previousCE = 0; 2076 ListList[src->resultLen].previousContCE = 0; 2077 ListList[src->resultLen].indirect = FALSE; 2078 sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status); 2079 } else { /* top == TRUE */ 2080 /* just use the supplied values */ 2081 top = FALSE; 2082 ListList[src->resultLen].previousCE = 0; 2083 ListList[src->resultLen].previousContCE = 0; 2084 ListList[src->resultLen].indirect = TRUE; 2085 ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE; 2086 ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE; 2087 ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE; 2088 ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE; 2089 2090 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); 2091 2092 } 2093 } else { /* reset to something already in rules */ 2094 top = FALSE; 2095 } 2096 } 2097 /* 7 After all this, set LAST to point to sourceToken, and goto step 3. */ 2098 lastToken = sourceToken; 2099 } else { 2100 if(U_FAILURE(*status)) { 2101 return 0; 2102 } 2103 } 2104 } 2105 #ifdef DEBUG_FOR_CODE_POINTS 2106 fclose(dfcp_fp); 2107 #endif 2108 2109 2110 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) { 2111 src->resultLen--; 2112 } 2113 return src->resultLen; 2114 } 2115 2116 const UChar* ucol_tok_getRulesFromBundle( 2117 void* /*context*/, 2118 const char* locale, 2119 const char* type, 2120 int32_t* pLength, 2121 UErrorCode* status) 2122 { 2123 const UChar* rules = NULL; 2124 UResourceBundle* bundle; 2125 UResourceBundle* collations; 2126 UResourceBundle* collation; 2127 2128 *pLength = 0; 2129 2130 bundle = ures_open(U_ICUDATA_COLL, locale, status); 2131 if(U_SUCCESS(*status)){ 2132 collations = ures_getByKey(bundle, "collations", NULL, status); 2133 if(U_SUCCESS(*status)){ 2134 collation = ures_getByKey(collations, type, NULL, status); 2135 if(U_SUCCESS(*status)){ 2136 rules = ures_getStringByKey(collation, "Sequence", pLength, status); 2137 if(U_FAILURE(*status)){ 2138 *pLength = 0; 2139 rules = NULL; 2140 } 2141 ures_close(collation); 2142 } 2143 ures_close(collations); 2144 } 2145 } 2146 2147 ures_close(bundle); 2148 2149 return rules; 2150 } 2151 2152 void ucol_tok_initTokenList( 2153 UColTokenParser *src, 2154 const UChar *rules, 2155 uint32_t rulesLength, 2156 const UCollator *UCA, 2157 GetCollationRulesFunction importFunc, 2158 void* context, 2159 UErrorCode *status) { 2160 U_NAMESPACE_USE 2161 2162 uint32_t nSize = 0; 2163 uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE); 2164 2165 bool needToDeallocRules = false; 2166 2167 if(U_FAILURE(*status)) { 2168 return; 2169 } 2170 2171 // set everything to zero, so that we can clean up gracefully 2172 uprv_memset(src, 0, sizeof(UColTokenParser)); 2173 2174 // first we need to find options that don't like to be normalized, 2175 // like copy and remove... 2176 //const UChar *openBrace = rules; 2177 int32_t optionNumber = -1; 2178 const UChar *setStart = NULL; 2179 uint32_t i = 0; 2180 while(i < rulesLength) { 2181 if(rules[i] == 0x005B) { // '[': start of an option 2182 /* Gets the following: 2183 optionNumber: The index of the option. 2184 setStart: The pointer at which the option arguments start. 2185 */ 2186 optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart); 2187 2188 if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */ 2189 // [optimize] 2190 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status); 2191 if(U_SUCCESS(*status)) { 2192 if(src->copySet == NULL) { 2193 src->copySet = newSet; 2194 } else { 2195 uset_addAll(src->copySet, newSet); 2196 uset_close(newSet); 2197 } 2198 } else { 2199 return; 2200 } 2201 } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) { 2202 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status); 2203 if(U_SUCCESS(*status)) { 2204 if(src->removeSet == NULL) { 2205 src->removeSet = newSet; 2206 } else { 2207 uset_addAll(src->removeSet, newSet); 2208 uset_close(newSet); 2209 } 2210 } else { 2211 return; 2212 } 2213 } else if(optionNumber == OPTION_IMPORT){ 2214 // [import <collation-name>] 2215 2216 // Find the address of the closing ]. 2217 UChar* import_end = u_strchr(setStart, 0x005D); 2218 int32_t optionEndOffset = (int32_t)(import_end + 1 - rules); 2219 // Ignore trailing whitespace. 2220 while(PatternProps::isWhiteSpace(*(import_end-1))) { 2221 --import_end; 2222 } 2223 2224 int32_t optionLength = (int32_t)(import_end - setStart); 2225 char option[50]; 2226 if(optionLength >= (int32_t)sizeof(option)) { 2227 *status = U_ILLEGAL_ARGUMENT_ERROR; 2228 return; 2229 } 2230 u_UCharsToChars(setStart, option, optionLength); 2231 option[optionLength] = 0; 2232 2233 *status = U_ZERO_ERROR; 2234 char locale[50]; 2235 int32_t templ; 2236 uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &templ, status); 2237 if(U_FAILURE(*status)) { 2238 *status = U_ILLEGAL_ARGUMENT_ERROR; 2239 return; 2240 } 2241 2242 char type[50]; 2243 if (uloc_getKeywordValue(locale, "collation", type, (int32_t)sizeof(type), status) <= 0 || 2244 U_FAILURE(*status) 2245 ) { 2246 *status = U_ZERO_ERROR; 2247 uprv_strcpy(type, "standard"); 2248 } 2249 2250 // TODO: Use public functions when available, see ticket #8134. 2251 char *keywords = (char *)locale_getKeywordsStart(locale); 2252 if(keywords != NULL) { 2253 *keywords = 0; 2254 } 2255 2256 int32_t importRulesLength = 0; 2257 const UChar* importRules = importFunc(context, locale, type, &importRulesLength, status); 2258 2259 #ifdef DEBUG_FOR_COLL_RULES 2260 std::string s; 2261 UnicodeString(importRules).toUTF8String(s); 2262 std::cout << "Import rules = " << s << std::endl; 2263 #endif 2264 2265 // Add the length of the imported rules to length of the original rules, 2266 // and subtract the length of the import option. 2267 uint32_t newRulesLength = rulesLength + importRulesLength - (optionEndOffset - i); 2268 2269 UChar* newRules = (UChar*)uprv_malloc(newRulesLength*sizeof(UChar)); 2270 2271 #ifdef DEBUG_FOR_COLL_RULES 2272 std::string s1; 2273 UnicodeString(rules).toUTF8String(s1); 2274 std::cout << "Original rules = " << s1 << std::endl; 2275 #endif 2276 2277 2278 // Copy the section of the original rules leading up to the import 2279 uprv_memcpy(newRules, rules, i*sizeof(UChar)); 2280 // Copy the imported rules 2281 uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UChar)); 2282 // Copy the rest of the original rules (minus the import option itself) 2283 uprv_memcpy(newRules+i+importRulesLength, 2284 rules+optionEndOffset, 2285 (rulesLength-optionEndOffset)*sizeof(UChar)); 2286 2287 #ifdef DEBUG_FOR_COLL_RULES 2288 std::string s2; 2289 UnicodeString(newRules).toUTF8String(s2); 2290 std::cout << "Resulting rules = " << s2 << std::endl; 2291 #endif 2292 2293 if(needToDeallocRules){ 2294 // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free 2295 uprv_free((void*)rules); 2296 } 2297 needToDeallocRules = true; 2298 rules = newRules; 2299 rulesLength = newRulesLength; 2300 2301 estimatedSize += importRulesLength*2; 2302 2303 // First character of the new rules needs to be processed 2304 i--; 2305 } 2306 } 2307 //openBrace++; 2308 i++; 2309 } 2310 2311 src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar)); 2312 /* test for NULL */ 2313 if (src->source == NULL) { 2314 *status = U_MEMORY_ALLOCATION_ERROR; 2315 return; 2316 } 2317 uprv_memset(src->source, 0, estimatedSize*sizeof(UChar)); 2318 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status); 2319 if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) { 2320 *status = U_ZERO_ERROR; 2321 src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar)); 2322 /* test for NULL */ 2323 if (src->source == NULL) { 2324 *status = U_MEMORY_ALLOCATION_ERROR; 2325 return; 2326 } 2327 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status); 2328 } 2329 if(needToDeallocRules){ 2330 // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free 2331 uprv_free((void*)rules); 2332 } 2333 2334 2335 src->current = src->source; 2336 src->end = src->source+nSize; 2337 src->sourceCurrent = src->source; 2338 src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly 2339 src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE; 2340 src->varTop = NULL; 2341 src->UCA = UCA; 2342 src->invUCA = ucol_initInverseUCA(status); 2343 src->parsedToken.charsLen = 0; 2344 src->parsedToken.charsOffset = 0; 2345 src->parsedToken.extensionLen = 0; 2346 src->parsedToken.extensionOffset = 0; 2347 src->parsedToken.prefixLen = 0; 2348 src->parsedToken.prefixOffset = 0; 2349 src->parsedToken.flags = 0; 2350 src->parsedToken.strength = UCOL_TOK_UNSET; 2351 src->buildCCTabFlag = FALSE; 2352 src->isStarred = FALSE; 2353 src->inRange = FALSE; 2354 src->lastRangeCp = 0; 2355 src->previousCp = 0; 2356 2357 if(U_FAILURE(*status)) { 2358 return; 2359 } 2360 src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status); 2361 if(U_FAILURE(*status)) { 2362 return; 2363 } 2364 uhash_setValueDeleter(src->tailored, uprv_free); 2365 2366 src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet)); 2367 /* test for NULL */ 2368 if (src->opts == NULL) { 2369 *status = U_MEMORY_ALLOCATION_ERROR; 2370 return; 2371 } 2372 2373 uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet)); 2374 2375 src->lh = 0; 2376 src->listCapacity = 1024; 2377 src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader)); 2378 //Test for NULL 2379 if (src->lh == NULL) { 2380 *status = U_MEMORY_ALLOCATION_ERROR; 2381 return; 2382 } 2383 uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader)); 2384 src->resultLen = 0; 2385 2386 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); 2387 2388 // UCOL_RESET_TOP_VALUE 2389 setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT); 2390 // UCOL_FIRST_PRIMARY_IGNORABLE 2391 setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0); 2392 // UCOL_LAST_PRIMARY_IGNORABLE 2393 setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0); 2394 // UCOL_FIRST_SECONDARY_IGNORABLE 2395 setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0); 2396 // UCOL_LAST_SECONDARY_IGNORABLE 2397 setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0); 2398 // UCOL_FIRST_TERTIARY_IGNORABLE 2399 setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0); 2400 // UCOL_LAST_TERTIARY_IGNORABLE 2401 setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0); 2402 // UCOL_FIRST_VARIABLE 2403 setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0); 2404 // UCOL_LAST_VARIABLE 2405 setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0); 2406 // UCOL_FIRST_NON_VARIABLE 2407 setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0); 2408 // UCOL_LAST_NON_VARIABLE 2409 setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT); 2410 // UCOL_FIRST_IMPLICIT 2411 setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0); 2412 // UCOL_LAST_IMPLICIT 2413 setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING); 2414 // UCOL_FIRST_TRAILING 2415 setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0); 2416 // UCOL_LAST_TRAILING 2417 setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0); 2418 ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24); 2419 } 2420 2421 2422 void ucol_tok_closeTokenList(UColTokenParser *src) { 2423 if(src->copySet != NULL) { 2424 uset_close(src->copySet); 2425 } 2426 if(src->removeSet != NULL) { 2427 uset_close(src->removeSet); 2428 } 2429 if(src->tailored != NULL) { 2430 uhash_close(src->tailored); 2431 } 2432 if(src->lh != NULL) { 2433 uprv_free(src->lh); 2434 } 2435 if(src->source != NULL) { 2436 uprv_free(src->source); 2437 } 2438 if(src->opts != NULL) { 2439 uprv_free(src->opts); 2440 } 2441 if (src->reorderCodes != NULL) { 2442 uprv_free(src->reorderCodes); 2443 } 2444 } 2445 2446 #endif /* #if !UCONFIG_NO_COLLATION */ 2447