1 /******************************************************************** 2 * Copyright (c) 2001-2008 International Business Machines 3 * Corporation and others. All Rights Reserved. 4 ******************************************************************** 5 * File USRCHDAT.H 6 * Modification History: 7 * Name date Description 8 * synwee July 31 2001 creation 9 ********************************************************************/ 10 11 12 /* 13 Note: This file is included by other C and C++ files. This file should not be directly compiled. 14 */ 15 #ifndef USRCHDAT_C 16 #define USRCHDAT_C 17 18 #include "unicode/ucol.h" 19 20 #if !UCONFIG_NO_COLLATION 21 22 /* Set to 1 if matches must be on grapheme boundaries */ 23 #define GRAPHEME_BOUNDARIES 1 24 25 U_CDECL_BEGIN 26 struct SearchData { 27 const char *text; 28 const char *pattern; 29 const char *collator; 30 UCollationStrength strength; 31 const char *breaker; 32 int8_t offset[32]; 33 uint8_t size[32]; 34 }; 35 U_CDECL_END 36 37 typedef struct SearchData SearchData; 38 39 static const char *TESTCOLLATORRULE = "& o,O ; p,P"; 40 41 static const char *EXTRACOLLATIONRULE = " & ae ; \\u00e4 & AE ; \\u00c4 & oe ; \\u00f6 & OE ; \\u00d6 & ue ; \\u00fc & UE ; \\u00dc"; 42 43 static const SearchData BASIC[] = { 44 {"xxxxxxxxxxxxxxxxxxxx", "fisher", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 45 {"silly spring string", "string", NULL, UCOL_TERTIARY, NULL, {13, -1}, 46 {6}}, 47 {"silly spring string string", "string", NULL, UCOL_TERTIARY, NULL, 48 {13, 20, -1}, {6, 6}}, 49 {"silly string spring string", "string", NULL, UCOL_TERTIARY, NULL, 50 {6, 20, -1}, {6, 6}}, 51 {"string spring string", "string", NULL, UCOL_TERTIARY, NULL, {0, 14, -1}, 52 {6, 6}}, 53 {"Scott Ganyo", "c", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}}, 54 {"Scott Ganyo", " ", NULL, UCOL_TERTIARY, NULL, {5, -1}, {1}}, 55 {"\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 56 {"a\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 57 58 #if GRAPHEME_BOUNDARIES 59 {"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 60 {"a\\u0300b", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 61 #else 62 {"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}}, 63 {"a\\u0300b", "\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}}, 64 #endif 65 66 {"\\u00c9", "e", NULL, UCOL_PRIMARY, NULL, {0, -1}, {1}}, 67 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 68 }; 69 70 static const SearchData BREAKITERATOREXACT[] = { 71 {"foxy fox", "fox", NULL, UCOL_TERTIARY, "characterbreaker", {0, 5, -1}, 72 {3, 3}}, 73 {"foxy fox", "fox", NULL, UCOL_TERTIARY, "wordbreaker", {5, -1}, {3}}, 74 {"This is a toe T\\u00F6ne", "toe", "de", UCOL_PRIMARY, 75 "characterbreaker", {10, 14, -1}, {3, 2}}, 76 {"This is a toe T\\u00F6ne", "toe", "de", UCOL_PRIMARY, "wordbreaker", 77 {10, -1}, {3}}, 78 {"Channel, another channel, more channels, and one last Channel", 79 "Channel", "es", UCOL_TERTIARY, "wordbreaker", {0, 54, -1}, {7, 7}}, 80 /* jitterbug 1745 */ 81 {"testing that \\u00e9 does not match e", "e", NULL, UCOL_TERTIARY, 82 "characterbreaker", {1, 17, 30, -1}, {1, 1, 1}}, 83 {"testing that string ab\\u00e9cd does not match e", "e", NULL, 84 UCOL_TERTIARY, "characterbreaker", {1, 28, 41, -1}, {1, 1, 1}}, 85 {"\\u00c9", "e", "fr", UCOL_PRIMARY, "characterbreaker", {0, -1}, {1}}, 86 #if 0 87 /* Problem reported by Dave Bertoni, same as ticket 4279? */ 88 {"\\u0043\\u004F\\u0302\\u0054\\u00C9", "\\u004F", NULL, UCOL_TERTIARY, "characterbreaker", {1, -1}, {2}}, 89 #endif 90 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 91 }; 92 93 static const SearchData STRENGTH[] = { 94 /*012345678901234567890123456789012345678901234567890123456789*/ 95 {"The quick brown fox jumps over the lazy foxes", "fox", "en", 96 UCOL_PRIMARY, NULL, {16, 40, -1}, {3, 3}}, 97 {"The quick brown fox jumps over the lazy foxes", "fox", "en", 98 UCOL_PRIMARY, "wordbreaker", {16, -1}, {3}}, 99 {"blackbirds Pat p\\u00E9ch\\u00E9 p\\u00EAche p\\u00E9cher p\\u00EAcher Tod T\\u00F6ne black Tofu blackbirds Ton PAT toehold blackbird black-bird pat toe big Toe", 100 "peche", "fr", UCOL_PRIMARY, NULL, {15, 21, 27, 34, -1}, {5, 5, 5, 5}}, 101 {"This is a toe T\\u00F6ne", "toe", "de", UCOL_PRIMARY, NULL, 102 {10, 14, -1}, {3, 2}}, 103 {"A channel, another CHANNEL, more Channels, and one last channel...", 104 "channel", "es", UCOL_PRIMARY, NULL, {2, 19, 33, 56, -1}, 105 {7, 7, 7, 7}}, 106 {"\\u00c0 should match but not A", "A\\u0300", "en", UCOL_IDENTICAL, 107 NULL, {0, -1}, {1, 0}}, 108 109 #if 0 110 /* Ticket 5382 */ 111 {"12\\u0171", "\\u0170", NULL, UCOL_SECONDARY, NULL, {2, -1}, {2}}, 112 #endif 113 114 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 115 }; 116 117 static const SearchData VARIABLE[] = { 118 /*012345678901234567890123456789012345678901234567890123456789*/ 119 {"blackbirds black blackbirds blackbird black-bird", 120 "blackbird", NULL, UCOL_TERTIARY, NULL, {0, 17, 28, 38, -1}, 121 {9, 9, 9, 10}}, 122 /* to see that it doesn't go into an infinite loop if the start of text 123 is a ignorable character */ 124 {" on", "go", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 125 {"abcdefghijklmnopqrstuvwxyz", " ", NULL, UCOL_PRIMARY, NULL, 126 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 127 20, 21, 22, 23, 24, 25, -1}, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, 129 /* testing tightest match */ 130 {" abc a bc ab c a bc ab c", "abc", NULL, UCOL_QUATERNARY, 131 NULL, {1, -1}, {3}}, 132 /*012345678901234567890123456789012345678901234567890123456789 */ 133 {" abc a bc ab c a bc ab c", "abc", NULL, UCOL_SECONDARY, 134 NULL, {1, 6, 13, 21, 31, -1}, {3, 4, 4, 5, 5}}, 135 /* totally ignorable text */ 136 {" ---------------", "abc", NULL, UCOL_SECONDARY, 137 NULL, {-1}, {0}}, 138 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 139 }; 140 141 static const SearchData NORMEXACT[] = { 142 {"a\\u0300\\u0325", "a\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {3}}, 143 144 #if GRAPHEME_BOUNDARIES 145 {"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 146 #else 147 {"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}}, 148 #endif 149 150 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 151 }; 152 153 static const SearchData NONNORMEXACT[] = { 154 {"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 155 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 156 }; 157 158 static const SearchData OVERLAP[] = { 159 {"abababab", "abab", NULL, UCOL_TERTIARY, NULL, {0, 2, 4, -1}, 160 {4, 4, 4}}, 161 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 162 }; 163 164 static const SearchData NONOVERLAP[] = { 165 {"abababab", "abab", NULL, UCOL_TERTIARY, NULL, {0, 4, -1}, {4, 4}}, 166 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 167 }; 168 169 static const SearchData COLLATOR[] = { 170 /* english */ 171 {"fox fpx", "fox", NULL, UCOL_TERTIARY, NULL, {0, -1}, {3}}, 172 /* tailored */ 173 {"fox fpx", "fox", NULL, UCOL_PRIMARY, NULL, {0, 4, -1}, {3, 3}}, 174 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 175 }; 176 177 static const SearchData PATTERN[] = { 178 {"The quick brown fox jumps over the lazy foxes", "the", NULL, 179 UCOL_PRIMARY, NULL, {0, 31, -1}, {3, 3}}, 180 {"The quick brown fox jumps over the lazy foxes", "fox", NULL, 181 UCOL_PRIMARY, NULL, {16, 40, -1}, {3, 3}}, 182 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 183 }; 184 185 static const SearchData TEXT[] = { 186 {"the foxy brown fox", "fox", NULL, UCOL_TERTIARY, NULL, {4, 15, -1}, 187 {3, 3}}, 188 {"the quick brown fox", "fox", NULL, UCOL_TERTIARY, NULL, {16, -1}, 189 {3}}, 190 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 191 }; 192 193 static const SearchData COMPOSITEBOUNDARIES[] = { 194 #if GRAPHEME_BOUNDARIES 195 {"\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 196 {"A\\u00C0C", "A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 197 {"\\u00C0A", "A", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}}, 198 {"B\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 199 {"\\u00C0B", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 200 {"\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 201 {"\\u0300\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 202 #else 203 {"\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 204 {"A\\u00C0C", "A", NULL, UCOL_TERTIARY, NULL, {0, 1, -1}, {1, 1}}, 205 {"\\u00C0A", "A", NULL, UCOL_TERTIARY, NULL, {0, 1, -1}, {1, 1}}, 206 {"B\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}}, 207 {"\\u00C0B", "A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 208 {"\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 209 {"\\u0300\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, 1, -1}, 210 {1, 1}}, 211 #endif 212 213 {"\\u00C0\\u0300", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 214 /* A + 030A + 0301 */ 215 {"\\u01FA", "\\u01FA", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 216 {"\\u01FA", "A\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 217 {"\\u01FA", "\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 218 {"\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 219 {"\\u01FA", "\\u030AA", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 220 {"\\u01FA", "\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 221 {"\\u01FA", "A\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 222 {"\\u01FA", "\\u0301A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 223 224 #if GRAPHEME_BOUNDARIES 225 {"\\u01FA", "\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 226 #else 227 {"\\u01FA", "\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 228 #endif 229 230 {"A\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 231 {"\\u01FAA", "\\u0301A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 232 {"\\u0F73", "\\u0F73", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 233 {"\\u0F73", "\\u0F71", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 234 {"\\u0F73", "\\u0F72", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 235 {"\\u0F73", "\\u0F71\\u0F72", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 236 {"A\\u0F73", "A\\u0F71", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 237 {"\\u0F73A", "\\u0F72A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 238 239 /* Ticket 5024 */ 240 {"a\\u00e1", "a\\u00e1", NULL, UCOL_SECONDARY, NULL, {0, -1}, {2}}, 241 242 /* Ticket 5420 */ 243 {"fu\\u00dfball", "fu\\u00df", NULL, UCOL_TERTIARY, NULL, {0, -1}, {3}}, 244 {"fu\\u00dfball", "fuss", NULL, UCOL_PRIMARY, NULL, {0, -1}, {3}}, 245 {"fu\\u00dfball", "uss", NULL, UCOL_PRIMARY, NULL, {1, -1}, {2}}, 246 247 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 248 }; 249 250 static const SearchData MATCH[] = { 251 {"a busy bee is a very busy beeee", "bee", NULL, UCOL_TERTIARY, NULL, 252 {7, 26, -1}, {3, 3}}, 253 /* 012345678901234567890123456789012345678901234567890 */ 254 {"a busy bee is a very busy beeee with no bee life", "bee", NULL, 255 UCOL_TERTIARY, NULL, {7, 26, 40, -1}, {3, 3, 3}}, 256 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 257 }; 258 259 static const SearchData SUPPLEMENTARY[] = { 260 /* 012345678901234567890123456789012345678901234567890012345678901234567890123456789012345678901234567890012345678901234567890123456789 */ 261 {"abc \\uD800\\uDC00 \\uD800\\uDC01 \\uD801\\uDC00 \\uD800\\uDC00abc abc\\uD800\\uDC00 \\uD800\\uD800\\uDC00 \\uD800\\uDC00\\uDC00", 262 "\\uD800\\uDC00", NULL, UCOL_TERTIARY, NULL, {4, 13, 22, 26, 29, -1}, 263 {2, 2, 2, 2, 2}}, 264 {"and\\uD834\\uDDB9this sentence", "\\uD834\\uDDB9", NULL, 265 UCOL_TERTIARY, NULL, {3, -1}, {2}}, 266 {"and \\uD834\\uDDB9 this sentence", " \\uD834\\uDDB9 ", NULL, 267 UCOL_TERTIARY, NULL, {3, -1}, {4}}, 268 {"and-\\uD834\\uDDB9-this sentence", "-\\uD834\\uDDB9-", NULL, 269 UCOL_TERTIARY, NULL, {3, -1}, {4}}, 270 {"and,\\uD834\\uDDB9,this sentence", ",\\uD834\\uDDB9,", NULL, 271 UCOL_TERTIARY, NULL, {3, -1}, {4}}, 272 {"and?\\uD834\\uDDB9?this sentence", "?\\uD834\\uDDB9?", NULL, 273 UCOL_TERTIARY, NULL, {3, -1}, {4}}, 274 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 275 }; 276 277 static const char *CONTRACTIONRULE = 278 "&z = ab/c < AB < X\\u0300 < ABC < X\\u0300\\u0315"; 279 280 static const SearchData CONTRACTION[] = { 281 /* common discontiguous */ 282 {"A\\u0300\\u0315", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 283 284 #if GRAPHEME_BOUNDARIES 285 {"A\\u0300\\u0315", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 286 #else 287 {"A\\u0300\\u0315", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}}, 288 #endif 289 290 /* contraction prefix */ 291 {"AB\\u0315C", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 292 293 #if GRAPHEME_BOUNDARIES 294 {"AB\\u0315C", "AB", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 295 {"AB\\u0315C", "\\u0315", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 296 #else 297 {"AB\\u0315C", "AB", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}}, 298 {"AB\\u0315C", "\\u0315", NULL, UCOL_TERTIARY, NULL, {2, -1}, {1}}, 299 #endif 300 301 /* discontiguous problem here for backwards iteration. 302 accents not found because discontiguous stores all information */ 303 {"X\\u0300\\u0319\\u0315", "\\u0319", NULL, UCOL_TERTIARY, NULL, {-1}, 304 {0}}, 305 /* ends not with a contraction character */ 306 {"X\\u0315\\u0300D", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {-1}, 307 {0}}, 308 {"X\\u0315\\u0300D", "X\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, 309 {0, -1}, {3}}, 310 {"X\\u0300\\u031A\\u0315D", "X\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, 311 {0}}, 312 /* blocked discontiguous */ 313 {"X\\u0300\\u031A\\u0315D", "\\u031A\\u0315D", NULL, UCOL_TERTIARY, NULL, 314 {-1}, {0}}, 315 316 #if GRAPHEME_BOUNDARIES 317 /* 318 * "ab" generates a contraction that's an expansion. The "z" matches the 319 * first CE of the expansion but the match fails because it ends in the 320 * middle of an expansion... 321 */ 322 {"ab", "z", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 323 #else 324 {"ab", "z", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}}, 325 #endif 326 327 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 328 }; 329 330 static const char *IGNORABLERULE = "&a = \\u0300"; 331 332 static const SearchData IGNORABLE[] = { 333 #if GRAPHEME_BOUNDARIES 334 /* 335 * This isn't much of a test when matches have to be on 336 * grapheme boundiaries. The match at 0 only works because 337 * it's at the start of the text. 338 */ 339 {"\\u0300\\u0315 \\u0300\\u0315 ", "\\u0300", NULL, UCOL_PRIMARY, NULL, 340 {0, -1}, {2}}, 341 #else 342 {"\\u0300\\u0315 \\u0300\\u0315 ", "\\u0300", NULL, UCOL_PRIMARY, NULL, 343 {0, 3, -1}, {2, 2}}, 344 #endif 345 346 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 347 }; 348 349 static const SearchData BASICCANONICAL[] = { 350 {"xxxxxxxxxxxxxxxxxxxx", "fisher", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 351 {"silly spring string", "string", NULL, UCOL_TERTIARY, NULL, {13, -1}, 352 {6}}, 353 {"silly spring string string", "string", NULL, UCOL_TERTIARY, NULL, 354 {13, 20, -1}, {6, 6}}, 355 {"silly string spring string", "string", NULL, UCOL_TERTIARY, NULL, 356 {6, 20, -1}, {6, 6}}, 357 {"string spring string", "string", NULL, UCOL_TERTIARY, NULL, {0, 14, -1}, 358 {6, 6}}, 359 {"Scott Ganyo", "c", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}}, 360 {"Scott Ganyo", " ", NULL, UCOL_TERTIARY, NULL, {5, -1}, {1}}, 361 362 #if GRAPHEME_BOUNDARIES 363 {"\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 364 {"a\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 365 {"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 366 {"a\\u0300b", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 367 {"a\\u0300\\u0325b", "\\u0300b", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 368 {"\\u0325\\u0300A\\u0325\\u0300", "\\u0300A\\u0300", NULL, UCOL_TERTIARY, 369 NULL, {-1}, {0}}, 370 {"\\u0325\\u0300A\\u0325\\u0300", "\\u0325A\\u0325", NULL, UCOL_TERTIARY, 371 NULL, {-1}, {0}}, 372 {"a\\u0300\\u0325b\\u0300\\u0325c \\u0325b\\u0300 \\u0300b\\u0325", 373 "\\u0300b\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 374 #else 375 {"\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}}, 376 {"a\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}}, 377 {"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {1, -1}, 378 {2}}, 379 {"a\\u0300b", "\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}}, 380 {"a\\u0300\\u0325b", "\\u0300b", NULL, UCOL_TERTIARY, NULL, {1, -1}, {3}}, 381 {"\\u0325\\u0300A\\u0325\\u0300", "\\u0300A\\u0300", NULL, UCOL_TERTIARY, 382 NULL, {0, -1}, {5}}, 383 {"\\u0325\\u0300A\\u0325\\u0300", "\\u0325A\\u0325", NULL, UCOL_TERTIARY, 384 NULL, {0, -1}, {5}}, 385 {"a\\u0300\\u0325b\\u0300\\u0325c \\u0325b\\u0300 \\u0300b\\u0325", 386 "\\u0300b\\u0325", NULL, UCOL_TERTIARY, NULL, {1, 12, -1}, {5, 3}}, 387 #endif 388 389 {"\\u00c4\\u0323", "A\\u0323\\u0308", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}}, 390 {"\\u0308\\u0323", "\\u0323\\u0308", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}}, 391 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 392 }; 393 394 395 static const SearchData NORMCANONICAL[] = { 396 #if GRAPHEME_BOUNDARIES 397 /* 398 * These tests don't really mean anything. With matches restricted to grapheme 399 * boundaries, isCanonicalMatch doesn't mean anything unless normalization is 400 * also turned on... 401 */ 402 {"\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 403 {"\\u0300\\u0325", "\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 404 {"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 405 {"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 406 {"a\\u0300\\u0325", "\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 407 {"a\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 408 #else 409 {"\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}}, 410 {"\\u0300\\u0325", "\\u0325", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}}, 411 {"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, 412 {2}}, 413 {"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {1, -1}, 414 {2}}, 415 {"a\\u0300\\u0325", "\\u0325", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}}, 416 {"a\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}}, 417 #endif 418 419 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 420 }; 421 422 static const SearchData BREAKITERATORCANONICAL[] = { 423 {"foxy fox", "fox", NULL, UCOL_TERTIARY, "characterbreaker", {0, 5, -1}, 424 {3, 3}}, 425 {"foxy fox", "fox", NULL, UCOL_TERTIARY, "wordbreaker", {5, -1}, {3}}, 426 {"This is a toe T\\u00F6ne", "toe", "de", UCOL_PRIMARY, 427 "characterbreaker", {10, 14, -1}, {3, 2}}, 428 {"This is a toe T\\u00F6ne", "toe", "de", UCOL_PRIMARY, "wordbreaker", 429 {10, -1}, {3}}, 430 {"Channel, another channel, more channels, and one last Channel", 431 "Channel", "es", UCOL_TERTIARY, "wordbreaker", {0, 54, -1}, {7, 7}}, 432 /* jitterbug 1745 */ 433 {"testing that \\u00e9 does not match e", "e", NULL, UCOL_TERTIARY, 434 "characterbreaker", {1, 17, 30, -1}, {1, 1, 1}}, 435 {"testing that string ab\\u00e9cd does not match e", "e", NULL, 436 UCOL_TERTIARY, "characterbreaker", {1, 28, 41, -1}, {1, 1, 1}}, 437 {"\\u00c9", "e", "fr", UCOL_PRIMARY, "characterbreaker", {0, -1}, {1}}, 438 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 439 }; 440 441 static const SearchData STRENGTHCANONICAL[] = { 442 /*012345678901234567890123456789012345678901234567890123456789 */ 443 {"The quick brown fox jumps over the lazy foxes", "fox", "en", 444 UCOL_PRIMARY, NULL, {16, 40, -1}, {3, 3}}, 445 {"The quick brown fox jumps over the lazy foxes", "fox", "en", 446 UCOL_PRIMARY, "wordbreaker", {16, -1}, {3}}, 447 {"blackbirds Pat p\\u00E9ch\\u00E9 p\\u00EAche p\\u00E9cher p\\u00EAcher Tod T\\u00F6ne black Tofu blackbirds Ton PAT toehold blackbird black-bird pat toe big Toe", 448 "peche", "fr", UCOL_PRIMARY, NULL, {15, 21, 27, 34, -1}, {5, 5, 5, 5}}, 449 {"This is a toe T\\u00F6ne", "toe", "de", UCOL_PRIMARY, NULL, 450 {10, 14, -1}, {3, 2}}, 451 {"A channel, another CHANNEL, more Channels, and one last channel...", 452 "channel", "es", UCOL_PRIMARY, NULL, {2, 19, 33, 56, -1}, 453 {7, 7, 7, 7}}, 454 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 455 }; 456 457 static const SearchData VARIABLECANONICAL[] = { 458 /*012345678901234567890123456789012345678901234567890123456789 */ 459 {"blackbirds black blackbirds blackbird black-bird", 460 "blackbird", NULL, UCOL_TERTIARY, NULL, {0, 17, 28, 38, -1}, 461 {9, 9, 9, 10}}, 462 /* to see that it doesn't go into an infinite loop if the start of text 463 is a ignorable character */ 464 {" on", "go", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 465 {"abcdefghijklmnopqrstuvwxyz", " ", NULL, UCOL_PRIMARY, NULL, 466 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 467 20, 21, 22, 23, 24, 25, -1}, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 468 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, 469 /* testing tightest match */ 470 {" abc a bc ab c a bc ab c", "abc", NULL, UCOL_QUATERNARY, 471 NULL, {1, -1}, {3}}, 472 /*012345678901234567890123456789012345678901234567890123456789 */ 473 {" abc a bc ab c a bc ab c", "abc", NULL, UCOL_SECONDARY, 474 NULL, {1, 6, 13, 21, 31, -1}, {3, 4, 4, 5, 5}}, 475 /* totally ignorable text */ 476 {" ---------------", "abc", NULL, UCOL_SECONDARY, 477 NULL, {-1}, {0}}, 478 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 479 }; 480 481 static const SearchData OVERLAPCANONICAL[] = { 482 {"abababab", "abab", NULL, UCOL_TERTIARY, NULL, {0, 2, 4, -1}, 483 {4, 4, 4}}, 484 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 485 }; 486 487 static const SearchData NONOVERLAPCANONICAL[] = { 488 {"abababab", "abab", NULL, UCOL_TERTIARY, NULL, {0, 4, -1}, {4, 4}}, 489 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 490 }; 491 492 static const SearchData COLLATORCANONICAL[] = { 493 /* english */ 494 {"fox fpx", "fox", NULL, UCOL_TERTIARY, NULL, {0, -1}, {3}}, 495 /* tailored */ 496 {"fox fpx", "fox", NULL, UCOL_PRIMARY, NULL, {0, 4, -1}, {3, 3}}, 497 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 498 }; 499 500 static const SearchData PATTERNCANONICAL[] = { 501 {"The quick brown fox jumps over the lazy foxes", "the", NULL, 502 UCOL_PRIMARY, NULL, {0, 31, -1}, {3, 3}}, 503 {"The quick brown fox jumps over the lazy foxes", "fox", NULL, 504 UCOL_PRIMARY, NULL, {16, 40, -1}, {3, 3}}, 505 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 506 }; 507 508 static const SearchData TEXTCANONICAL[] = { 509 {"the foxy brown fox", "fox", NULL, UCOL_TERTIARY, NULL, {4, 15, -1}, 510 {3, 3}}, 511 {"the quick brown fox", "fox", NULL, UCOL_TERTIARY, NULL, {16, -1}, 512 {3}}, 513 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 514 }; 515 516 static const SearchData COMPOSITEBOUNDARIESCANONICAL[] = { 517 #if GRAPHEME_BOUNDARIES 518 {"\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 519 {"A\\u00C0C", "A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 520 {"\\u00C0A", "A", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}}, 521 {"B\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 522 {"\\u00C0B", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 523 {"\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 524 525 /* first one matches only because it's at the start of the text */ 526 {"\\u0300\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 527 528 /* \\u0300 blocked by \\u0300 */ 529 {"\\u00C0\\u0300", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 530 #else 531 {"\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 532 {"A\\u00C0C", "A", NULL, UCOL_TERTIARY, NULL, {0, 1, -1}, {1, 1}}, 533 {"\\u00C0A", "A", NULL, UCOL_TERTIARY, NULL, {0, 1, -1}, {1, 1}}, 534 {"B\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}}, 535 {"\\u00C0B", "A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 536 {"\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 537 {"\\u0300\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, 1, -1}, 538 {1, 1}}, 539 /* \\u0300 blocked by \\u0300 */ 540 {"\\u00C0\\u0300", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}}, 541 #endif 542 543 /* A + 030A + 0301 */ 544 {"\\u01FA", "\\u01FA", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 545 {"\\u01FA", "A\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 546 547 #if GRAPHEME_BOUNDARIES 548 {"\\u01FA", "\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 549 {"\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 550 #else 551 {"\\u01FA", "\\u030A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 552 {"\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 553 #endif 554 555 {"\\u01FA", "\\u030AA", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 556 557 #if GRAPHEME_BOUNDARIES 558 {"\\u01FA", "\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 559 #else 560 {"\\u01FA", "\\u0301", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 561 #endif 562 563 /* blocked accent */ 564 {"\\u01FA", "A\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 565 {"\\u01FA", "\\u0301A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 566 567 #if GRAPHEME_BOUNDARIES 568 {"\\u01FA", "\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 569 {"A\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 570 {"\\u01FAA", "\\u0301A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 571 #else 572 {"\\u01FA", "\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 573 {"A\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}}, 574 {"\\u01FAA", "\\u0301A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}}, 575 #endif 576 577 {"\\u0F73", "\\u0F73", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 578 579 #if GRAPHEME_BOUNDARIES 580 {"\\u0F73", "\\u0F71", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 581 {"\\u0F73", "\\u0F72", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 582 #else 583 {"\\u0F73", "\\u0F71", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 584 {"\\u0F73", "\\u0F72", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 585 #endif 586 587 {"\\u0F73", "\\u0F71\\u0F72", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}}, 588 589 #if GRAPHEME_BOUNDARIES 590 {"A\\u0F73", "A\\u0F71", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 591 {"\\u0F73A", "\\u0F72A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 592 {"\\u01FA A\\u0301\\u030A A\\u030A\\u0301 A\\u030A \\u01FA", "A\\u030A", 593 NULL, UCOL_TERTIARY, NULL, {10, -1}, {2}}, 594 #else 595 {"A\\u0F73", "A\\u0F71", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}}, 596 {"\\u0F73A", "\\u0F72A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}}, 597 {"\\u01FA A\\u0301\\u030A A\\u030A\\u0301 A\\u030A \\u01FA", "A\\u030A", 598 NULL, UCOL_TERTIARY, NULL, {0, 6, 10, 13, -1}, {1, 3, 2, 1}}, 599 #endif 600 601 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 602 }; 603 604 static const SearchData MATCHCANONICAL[] = { 605 {"a busy bee is a very busy beeee", "bee", NULL, UCOL_TERTIARY, NULL, 606 {7, 26, -1}, {3, 3}}, 607 /*012345678901234567890123456789012345678901234567890 */ 608 {"a busy bee is a very busy beeee with no bee life", "bee", NULL, 609 UCOL_TERTIARY, NULL, {7, 26, 40, -1}, {3, 3, 3}}, 610 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 611 }; 612 613 static const SearchData SUPPLEMENTARYCANONICAL[] = { 614 /*012345678901234567890123456789012345678901234567890012345678901234567890123456789012345678901234567890012345678901234567890123456789 */ 615 {"abc \\uD800\\uDC00 \\uD800\\uDC01 \\uD801\\uDC00 \\uD800\\uDC00abc abc\\uD800\\uDC00 \\uD800\\uD800\\uDC00 \\uD800\\uDC00\\uDC00", 616 "\\uD800\\uDC00", NULL, UCOL_TERTIARY, NULL, {4, 13, 22, 26, 29, -1}, 617 {2, 2, 2, 2, 2}}, 618 {"and\\uD834\\uDDB9this sentence", "\\uD834\\uDDB9", NULL, 619 UCOL_TERTIARY, NULL, {3, -1}, {2}}, 620 {"and \\uD834\\uDDB9 this sentence", " \\uD834\\uDDB9 ", NULL, 621 UCOL_TERTIARY, NULL, {3, -1}, {4}}, 622 {"and-\\uD834\\uDDB9-this sentence", "-\\uD834\\uDDB9-", NULL, 623 UCOL_TERTIARY, NULL, {3, -1}, {4}}, 624 {"and,\\uD834\\uDDB9,this sentence", ",\\uD834\\uDDB9,", NULL, 625 UCOL_TERTIARY, NULL, {3, -1}, {4}}, 626 {"and?\\uD834\\uDDB9?this sentence", "?\\uD834\\uDDB9?", NULL, 627 UCOL_TERTIARY, NULL, {3, -1}, {4}}, 628 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 629 }; 630 631 static const SearchData CONTRACTIONCANONICAL[] = { 632 /* common discontiguous */ 633 #if GRAPHEME_BOUNDARIES 634 {"A\\u0300\\u0315", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 635 {"A\\u0300\\u0315", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 636 #else 637 {"A\\u0300\\u0315", "\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}}, 638 {"A\\u0300\\u0315", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}}, 639 #endif 640 641 /* contraction prefix */ 642 {"AB\\u0315C", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 643 644 #if GRAPHEME_BOUNDARIES 645 {"AB\\u0315C", "AB", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 646 {"AB\\u0315C", "\\u0315", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 647 #else 648 {"AB\\u0315C", "AB", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}}, 649 {"AB\\u0315C", "\\u0315", NULL, UCOL_TERTIARY, NULL, {2, -1}, {1}}, 650 #endif 651 652 /* discontiguous problem here for backwards iteration. 653 forwards gives 0, 4 but backwards give 1, 3 */ 654 /* {"X\\u0300\\u0319\\u0315", "\\u0319", NULL, UCOL_TERTIARY, NULL, {0, -1}, 655 {4}}, */ 656 657 /* ends not with a contraction character */ 658 {"X\\u0315\\u0300D", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 659 {"X\\u0315\\u0300D", "X\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {0, -1}, {3}}, 660 661 #if GRAPHEME_BOUNDARIES 662 {"X\\u0300\\u031A\\u0315D", "X\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 663 664 /* blocked discontiguous */ 665 {"X\\u0300\\u031A\\u0315D", "\\u031A\\u0315D", NULL, UCOL_TERTIARY, NULL, {-1}, {0}}, 666 667 /* 668 * "ab" generates a contraction that's an expansion. The "z" matches the 669 * first CE of the expansion but the match fails because it ends in the 670 * middle of an expansion... 671 */ 672 {"ab", "z", NULL, UCOL_TERTIARY, NULL, {-1}, {2}}, 673 #else 674 {"X\\u0300\\u031A\\u0315D", "X\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {4}}, 675 676 /* blocked discontiguous */ 677 {"X\\u0300\\u031A\\u0315D", "\\u031A\\u0315D", NULL, UCOL_TERTIARY, NULL, {1, -1}, {4}}, 678 679 {"ab", "z", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}}, 680 #endif 681 682 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 683 }; 684 685 static const SearchData DIACRITICMATCH[] = { 686 {"\\u03BA\\u03B1\\u03B9\\u0300\\u0020\\u03BA\\u03B1\\u1F76", "\\u03BA\\u03B1\\u03B9", NULL, UCOL_PRIMARY, NULL, {0, 5,-1}, {4, 3}}, 687 {"\\u0061\\u0061\\u00E1", "\\u0061\\u00E1", NULL, UCOL_SECONDARY, NULL, {1, -1}, {2}}, 688 {"\\u0020\\u00C2\\u0303\\u0020\\u0041\\u0061\\u1EAA\\u0041\\u0302\\u0303\\u00C2\\u0303\\u1EAB\\u0061\\u0302\\u0303\\u00E2\\u0303\\uD806\\uDC01\\u0300\\u0020", 689 "\\u00C2\\u0303", "LDE_AN_CX_EX_FX_HX_NX_S1", UCOL_PRIMARY, NULL, {1, 4, 5, 6, 7, 10, 12, 13, 16,-1}, {2, 1, 1, 1, 3, 2, 1, 3, 2}}, 690 {NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}} 691 }; 692 693 #endif /* #if !UCONFIG_NO_COLLATION */ 694 695 #endif 696