1 /*********************************************************************** 2 * 2016 and later: Unicode, Inc. and others. 3 * License & terms of use: http://www.unicode.org/copyright.html#License 4 *********************************************************************** 5 *********************************************************************** 6 * COPYRIGHT: 7 * Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved. 8 * 9 ***********************************************************************/ 10 /******************************************************************************** 11 * 12 * File CALLCOLL.C 13 * 14 * Modification History: 15 * Name Description 16 * Andy Heninger First Version 17 * 18 ********************************************************************************* 19 */ 20 21 // 22 // This program tests string collation and sort key generation performance. 23 // Three APIs can be teste: ICU C , Unix strcoll, strxfrm and Windows LCMapString 24 // A file of names is required as input, one per line. It must be in utf-8 or utf-16 format, 25 // and include a byte order mark. Either LE or BE format is OK. 26 // 27 28 const char gUsageString[] = 29 "usage: collperf options...\n" 30 "-help Display this message.\n" 31 "-file file_name utf-16 format file of names.\n" 32 "-locale name ICU locale to use. Default is en_US\n" 33 "-rules file_name Collation rules file (overrides locale)\n" 34 "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n" 35 " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n" 36 "-win Run test using Windows native services. (ICU is default)\n" 37 "-unix Run test using Unix strxfrm, strcoll services.\n" 38 "-uselen Use API with string lengths. Default is null-terminated strings\n" 39 "-usekeys Run tests using sortkeys rather than strcoll\n" 40 "-strcmp Run tests using u_strcmp rather than strcoll\n" 41 "-strcmpCPO Run tests using u_strcmpCodePointOrder rather than strcoll\n" 42 "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n" 43 "-iloop n Inner Loop Count. Default = 1. Number of calls to function\n" 44 " under test at each call point. For measuring test overhead.\n" 45 "-terse Terse numbers-only output. Intended for use by scripts.\n" 46 "-french French accent ordering\n" 47 "-frenchoff No French accent ordering (for use with French locales.)\n" 48 "-norm Normalizing mode on\n" 49 "-shifted Shifted mode\n" 50 "-lower Lower case first\n" 51 "-upper Upper case first\n" 52 "-case Enable separate case level\n" 53 "-level n Sort level, 1 to 5, for Primary, Secndary, Tertiary, Quaternary, Identical\n" 54 "-keyhist Produce a table sort key size vs. string length\n" 55 "-binsearch Binary Search timing test\n" 56 "-keygen Sort Key Generation timing test\n" 57 "-qsort Quicksort timing test\n" 58 "-iter Iteration Performance Test\n" 59 "-dump Display strings, sort keys and CEs.\n" 60 ; 61 62 63 64 #include <stdio.h> 65 #include <string.h> 66 #include <stdlib.h> 67 #include <math.h> 68 #include <locale.h> 69 #include <errno.h> 70 71 #include <unicode/utypes.h> 72 #include <unicode/ucol.h> 73 #include <unicode/ucoleitr.h> 74 #include <unicode/uloc.h> 75 #include <unicode/ustring.h> 76 #include <unicode/ures.h> 77 #include <unicode/uchar.h> 78 #include <unicode/ucnv.h> 79 #include <unicode/utf8.h> 80 81 #ifdef WIN32 82 #include <windows.h> 83 #else 84 // 85 // Stubs for Windows API functions when building on UNIXes. 86 // 87 typedef int DWORD; 88 inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;} 89 #include <sys/time.h> 90 unsigned long timeGetTime() { 91 struct timeval t; 92 gettimeofday(&t, 0); 93 unsigned long val = t.tv_sec * 1000; // Let it overflow. Who cares. 94 val += t.tv_usec / 1000; 95 return val; 96 } 97 inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;} 98 const int LCMAP_SORTKEY = 0; 99 #define MAKELCID(a,b) 0 100 const int SORT_DEFAULT = 0; 101 #endif 102 103 104 105 // 106 // Command line option variables 107 // These global variables are set according to the options specified 108 // on the command line by the user. 109 char * opt_fName = 0; 110 const char * opt_locale = "en_US"; 111 int opt_langid = 0; // Defaults to value corresponding to opt_locale. 112 char * opt_rules = 0; 113 UBool opt_help = FALSE; 114 int opt_loopCount = 1; 115 int opt_iLoopCount = 1; 116 UBool opt_terse = FALSE; 117 UBool opt_qsort = FALSE; 118 UBool opt_binsearch = FALSE; 119 UBool opt_icu = TRUE; 120 UBool opt_win = FALSE; // Run with Windows native functions. 121 UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions. 122 UBool opt_uselen = FALSE; 123 UBool opt_usekeys = FALSE; 124 UBool opt_strcmp = FALSE; 125 UBool opt_strcmpCPO = FALSE; 126 UBool opt_norm = FALSE; 127 UBool opt_keygen = FALSE; 128 UBool opt_french = FALSE; 129 UBool opt_frenchoff = FALSE; 130 UBool opt_shifted = FALSE; 131 UBool opt_lower = FALSE; 132 UBool opt_upper = FALSE; 133 UBool opt_case = FALSE; 134 int opt_level = 0; 135 UBool opt_keyhist = FALSE; 136 UBool opt_itertest = FALSE; 137 UBool opt_dump = FALSE; 138 139 140 141 // 142 // Definitions for the command line options 143 // 144 struct OptSpec { 145 const char *name; 146 enum {FLAG, NUM, STRING} type; 147 void *pVar; 148 }; 149 150 OptSpec opts[] = { 151 {"-file", OptSpec::STRING, &opt_fName}, 152 {"-locale", OptSpec::STRING, &opt_locale}, 153 {"-langid", OptSpec::NUM, &opt_langid}, 154 {"-rules", OptSpec::STRING, &opt_rules}, 155 {"-qsort", OptSpec::FLAG, &opt_qsort}, 156 {"-binsearch", OptSpec::FLAG, &opt_binsearch}, 157 {"-iter", OptSpec::FLAG, &opt_itertest}, 158 {"-win", OptSpec::FLAG, &opt_win}, 159 {"-unix", OptSpec::FLAG, &opt_unix}, 160 {"-uselen", OptSpec::FLAG, &opt_uselen}, 161 {"-usekeys", OptSpec::FLAG, &opt_usekeys}, 162 {"-strcmp", OptSpec::FLAG, &opt_strcmp}, 163 {"-strcmpCPO", OptSpec::FLAG, &opt_strcmpCPO}, 164 {"-norm", OptSpec::FLAG, &opt_norm}, 165 {"-french", OptSpec::FLAG, &opt_french}, 166 {"-frenchoff", OptSpec::FLAG, &opt_frenchoff}, 167 {"-shifted", OptSpec::FLAG, &opt_shifted}, 168 {"-lower", OptSpec::FLAG, &opt_lower}, 169 {"-upper", OptSpec::FLAG, &opt_upper}, 170 {"-case", OptSpec::FLAG, &opt_case}, 171 {"-level", OptSpec::NUM, &opt_level}, 172 {"-keyhist", OptSpec::FLAG, &opt_keyhist}, 173 {"-keygen", OptSpec::FLAG, &opt_keygen}, 174 {"-loop", OptSpec::NUM, &opt_loopCount}, 175 {"-iloop", OptSpec::NUM, &opt_iLoopCount}, 176 {"-terse", OptSpec::FLAG, &opt_terse}, 177 {"-dump", OptSpec::FLAG, &opt_dump}, 178 {"-help", OptSpec::FLAG, &opt_help}, 179 {"-?", OptSpec::FLAG, &opt_help}, 180 {0, OptSpec::FLAG, 0} 181 }; 182 183 184 //--------------------------------------------------------------------------- 185 // 186 // Global variables pointing to and describing the test file 187 // 188 //--------------------------------------------------------------------------- 189 190 // 191 // struct Line 192 // 193 // Each line from the source file (containing a name, presumably) gets 194 // one of these structs. 195 // 196 struct Line { 197 UChar *name; 198 int len; 199 char *winSortKey; 200 char *icuSortKey; 201 char *unixSortKey; 202 char *unixName; 203 }; 204 205 206 207 Line *gFileLines; // Ptr to array of Line structs, one per line in the file. 208 int gNumFileLines; 209 UCollator *gCol; 210 DWORD gWinLCID; 211 212 Line **gSortedLines; 213 Line **gRandomLines; 214 int gCount; 215 216 217 218 //--------------------------------------------------------------------------- 219 // 220 // ProcessOptions() Function to read the command line options. 221 // 222 //--------------------------------------------------------------------------- 223 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[]) 224 { 225 int i; 226 int argNum; 227 const char *pArgName; 228 OptSpec *pOpt; 229 230 for (argNum=1; argNum<argc; argNum++) { 231 pArgName = argv[argNum]; 232 for (pOpt = opts; pOpt->name != 0; pOpt++) { 233 if (strcmp(pOpt->name, pArgName) == 0) { 234 switch (pOpt->type) { 235 case OptSpec::FLAG: 236 *(UBool *)(pOpt->pVar) = TRUE; 237 break; 238 case OptSpec::STRING: 239 argNum ++; 240 if (argNum >= argc) { 241 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); 242 return FALSE; 243 } 244 *(const char **)(pOpt->pVar) = argv[argNum]; 245 break; 246 case OptSpec::NUM: 247 argNum ++; 248 if (argNum >= argc) { 249 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); 250 return FALSE; 251 } 252 char *endp; 253 i = strtol(argv[argNum], &endp, 0); 254 if (endp == argv[argNum]) { 255 fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name); 256 return FALSE; 257 } 258 *(int *)(pOpt->pVar) = i; 259 } 260 break; 261 } 262 } 263 if (pOpt->name == 0) 264 { 265 fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName); 266 return FALSE; 267 } 268 } 269 return TRUE; 270 } 271 272 //--------------------------------------------------------------------------------------- 273 // 274 // Comparison functions for use by qsort. 275 // 276 // Six flavors, ICU or Windows, SortKey or String Compare, Strings with length 277 // or null terminated. 278 // 279 //--------------------------------------------------------------------------------------- 280 int ICUstrcmpK(const void *a, const void *b) { 281 gCount++; 282 int t = strcmp((*(Line **)a)->icuSortKey, (*(Line **)b)->icuSortKey); 283 return t; 284 } 285 286 287 int ICUstrcmpL(const void *a, const void *b) { 288 gCount++; 289 UCollationResult t; 290 t = ucol_strcoll(gCol, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len); 291 if (t == UCOL_LESS) return -1; 292 if (t == UCOL_GREATER) return +1; 293 return 0; 294 } 295 296 297 int ICUstrcmp(const void *a, const void *b) { 298 gCount++; 299 UCollationResult t; 300 t = ucol_strcoll(gCol, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1); 301 if (t == UCOL_LESS) return -1; 302 if (t == UCOL_GREATER) return +1; 303 return 0; 304 } 305 306 307 int Winstrcmp(const void *a, const void *b) { 308 gCount++; 309 int t; 310 t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1); 311 return t-2; 312 } 313 314 315 int UNIXstrcmp(const void *a, const void *b) { 316 gCount++; 317 int t; 318 t = strcoll((*(Line **)a)->unixName, (*(Line **)b)->unixName); 319 return t; 320 } 321 322 323 int WinstrcmpL(const void *a, const void *b) { 324 gCount++; 325 int t; 326 t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len); 327 return t-2; 328 } 329 330 331 int WinstrcmpK(const void *a, const void *b) { 332 gCount++; 333 int t = strcmp((*(Line **)a)->winSortKey, (*(Line **)b)->winSortKey); 334 return t; 335 } 336 337 338 //--------------------------------------------------------------------------------------- 339 // 340 // Function for sorting the names (lines) into a random order. 341 // Order is based on a hash of the ICU Sort key for the lines 342 // The randomized order is used as input for the sorting timing tests. 343 // 344 //--------------------------------------------------------------------------------------- 345 int ICURandomCmp(const void *a, const void *b) { 346 char *ask = (*(Line **)a)->icuSortKey; 347 char *bsk = (*(Line **)b)->icuSortKey; 348 int aVal = 0; 349 int bVal = 0; 350 int retVal; 351 while (*ask != 0) { 352 aVal += aVal*37 + *ask++; 353 } 354 while (*bsk != 0) { 355 bVal += bVal*37 + *bsk++; 356 } 357 retVal = -1; 358 if (aVal == bVal) { 359 retVal = 0; 360 } 361 else if (aVal > bVal) { 362 retVal = 1; 363 } 364 return retVal; 365 } 366 367 //--------------------------------------------------------------------------------------- 368 // 369 // doKeyGen() Key Generation Timing Test 370 // 371 //--------------------------------------------------------------------------------------- 372 void doKeyGen() 373 { 374 int line; 375 int loops = 0; 376 int iLoop; 377 int t; 378 int len=-1; 379 380 // Adjust loop count to compensate for file size. Should be order n 381 double dLoopCount = double(opt_loopCount) * (1000. / double(gNumFileLines)); 382 int adj_loopCount = int(dLoopCount); 383 if (adj_loopCount < 1) adj_loopCount = 1; 384 385 386 unsigned long startTime = timeGetTime(); 387 388 if (opt_win) { 389 for (loops=0; loops<adj_loopCount; loops++) { 390 for (line=0; line < gNumFileLines; line++) { 391 if (opt_uselen) { 392 len = gFileLines[line].len; 393 } 394 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 395 t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, 396 gFileLines[line].name, len, 397 (unsigned short *)gFileLines[line].winSortKey, 5000); // TODO something with length. 398 } 399 } 400 } 401 } 402 else if (opt_icu) 403 { 404 for (loops=0; loops<adj_loopCount; loops++) { 405 for (line=0; line < gNumFileLines; line++) { 406 if (opt_uselen) { 407 len = gFileLines[line].len; 408 } 409 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 410 t = ucol_getSortKey(gCol, gFileLines[line].name, len, (unsigned char *)gFileLines[line].icuSortKey, 5000); 411 } 412 } 413 } 414 } 415 else if (opt_unix) 416 { 417 for (loops=0; loops<adj_loopCount; loops++) { 418 for (line=0; line < gNumFileLines; line++) { 419 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 420 t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, 5000); 421 } 422 } 423 } 424 } 425 426 unsigned long elapsedTime = timeGetTime() - startTime; 427 int ns = (int)(float(1000000) * (float)elapsedTime / (float)(adj_loopCount*gNumFileLines)); 428 429 if (opt_terse == FALSE) { 430 printf("Sort Key Generation: total # of keys = %d\n", loops*gNumFileLines); 431 printf("Sort Key Generation: time per key = %d ns\n", ns); 432 } 433 else { 434 printf("%d, ", ns); 435 } 436 437 int totalKeyLen = 0; 438 int totalChars = 0; 439 for (line=0; line<gNumFileLines; line++) { 440 totalChars += u_strlen(gFileLines[line].name); 441 if (opt_win) { 442 totalKeyLen += strlen(gFileLines[line].winSortKey); 443 } 444 else if (opt_icu) { 445 totalKeyLen += strlen(gFileLines[line].icuSortKey); 446 } 447 else if (opt_unix) { 448 totalKeyLen += strlen(gFileLines[line].unixSortKey); 449 } 450 451 } 452 if (opt_terse == FALSE) { 453 printf("Key Length / character = %f\n", (float)totalKeyLen / (float)totalChars); 454 } else { 455 printf("%f, ", (float)totalKeyLen / (float)totalChars); 456 } 457 } 458 459 460 461 //--------------------------------------------------------------------------------------- 462 // 463 // doBinarySearch() Binary Search timing test. Each name from the list 464 // is looked up in the full sorted list of names. 465 // 466 //--------------------------------------------------------------------------------------- 467 void doBinarySearch() 468 { 469 470 gCount = 0; 471 int line; 472 int loops = 0; 473 int iLoop = 0; 474 unsigned long elapsedTime = 0; 475 476 // Adjust loop count to compensate for file size. Should be order n (lookups) * log n (compares/lookup) 477 // Accurate timings do not depend on this being perfect. The correction is just to try to 478 // get total running times of about the right order, so the that user doesn't need to 479 // manually adjust the loop count for every different file size. 480 double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines)); 481 if (opt_usekeys) dLoopCount *= 5; 482 int adj_loopCount = int(dLoopCount); 483 if (adj_loopCount < 1) adj_loopCount = 1; 484 485 486 for (;;) { // not really a loop, just allows "break" to work, to simplify 487 // inadvertantly running more than one test through here. 488 if (opt_strcmp || opt_strcmpCPO) 489 { 490 unsigned long startTime = timeGetTime(); 491 typedef int32_t (U_EXPORT2 *PF)(const UChar *, const UChar *); 492 PF pf = u_strcmp; 493 if (opt_strcmpCPO) {pf = u_strcmpCodePointOrder;} 494 //if (opt_strcmp && opt_win) {pf = (PF)wcscmp;} // Damn the difference between int32_t and int 495 // which forces the use of a cast here. 496 497 int r = 0; 498 for (loops=0; loops<adj_loopCount; loops++) { 499 500 for (line=0; line < gNumFileLines; line++) { 501 int hi = gNumFileLines-1; 502 int lo = 0; 503 int guess = -1; 504 for (;;) { 505 int newGuess = (hi + lo) / 2; 506 if (newGuess == guess) 507 break; 508 guess = newGuess; 509 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 510 r = (*pf)((gSortedLines[line])->name, (gSortedLines[guess])->name); 511 } 512 gCount++; 513 if (r== 0) 514 break; 515 if (r < 0) 516 hi = guess; 517 else 518 lo = guess; 519 } 520 } 521 } 522 elapsedTime = timeGetTime() - startTime; 523 break; 524 } 525 526 527 if (opt_icu) 528 { 529 unsigned long startTime = timeGetTime(); 530 UCollationResult r = UCOL_EQUAL; 531 for (loops=0; loops<adj_loopCount; loops++) { 532 533 for (line=0; line < gNumFileLines; line++) { 534 int lineLen = -1; 535 int guessLen = -1; 536 if (opt_uselen) { 537 lineLen = (gSortedLines[line])->len; 538 } 539 int hi = gNumFileLines-1; 540 int lo = 0; 541 int guess = -1; 542 for (;;) { 543 int newGuess = (hi + lo) / 2; 544 if (newGuess == guess) 545 break; 546 guess = newGuess; 547 int ri = 0; 548 if (opt_usekeys) { 549 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 550 ri = strcmp((gSortedLines[line])->icuSortKey, (gSortedLines[guess])->icuSortKey); 551 } 552 gCount++; 553 r=UCOL_GREATER; if(ri<0) {r=UCOL_LESS;} else if (ri==0) {r=UCOL_EQUAL;} 554 } 555 else 556 { 557 if (opt_uselen) { 558 guessLen = (gSortedLines[guess])->len; 559 } 560 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 561 r = ucol_strcoll(gCol, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen); 562 } 563 gCount++; 564 } 565 if (r== UCOL_EQUAL) 566 break; 567 if (r == UCOL_LESS) 568 hi = guess; 569 else 570 lo = guess; 571 } 572 } 573 } 574 elapsedTime = timeGetTime() - startTime; 575 break; 576 } 577 578 if (opt_win) 579 { 580 unsigned long startTime = timeGetTime(); 581 int r = 0; 582 for (loops=0; loops<adj_loopCount; loops++) { 583 584 for (line=0; line < gNumFileLines; line++) { 585 int lineLen = -1; 586 int guessLen = -1; 587 if (opt_uselen) { 588 lineLen = (gSortedLines[line])->len; 589 } 590 int hi = gNumFileLines-1; 591 int lo = 0; 592 int guess = -1; 593 for (;;) { 594 int newGuess = (hi + lo) / 2; 595 if (newGuess == guess) 596 break; 597 guess = newGuess; 598 if (opt_usekeys) { 599 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 600 r = strcmp((gSortedLines[line])->winSortKey, (gSortedLines[guess])->winSortKey); 601 } 602 gCount++; 603 r+=2; 604 } 605 else 606 { 607 if (opt_uselen) { 608 guessLen = (gSortedLines[guess])->len; 609 } 610 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 611 r = CompareStringW(gWinLCID, 0, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen); 612 } 613 if (r == 0) { 614 if (opt_terse == FALSE) { 615 fprintf(stderr, "Error returned from Windows CompareStringW.\n"); 616 } 617 exit(-1); 618 } 619 gCount++; 620 } 621 if (r== 2) // strings == 622 break; 623 if (r == 1) // line < guess 624 hi = guess; 625 else // line > guess 626 lo = guess; 627 } 628 } 629 } 630 elapsedTime = timeGetTime() - startTime; 631 break; 632 } 633 634 if (opt_unix) 635 { 636 unsigned long startTime = timeGetTime(); 637 int r = 0; 638 for (loops=0; loops<adj_loopCount; loops++) { 639 640 for (line=0; line < gNumFileLines; line++) { 641 int hi = gNumFileLines-1; 642 int lo = 0; 643 int guess = -1; 644 for (;;) { 645 int newGuess = (hi + lo) / 2; 646 if (newGuess == guess) 647 break; 648 guess = newGuess; 649 if (opt_usekeys) { 650 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 651 r = strcmp((gSortedLines[line])->unixSortKey, (gSortedLines[guess])->unixSortKey); 652 } 653 gCount++; 654 } 655 else 656 { 657 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 658 r = strcoll((gSortedLines[line])->unixName, (gSortedLines[guess])->unixName); 659 } 660 errno = 0; 661 if (errno != 0) { 662 fprintf(stderr, "Error %d returned from strcoll.\n", errno); 663 exit(-1); 664 } 665 gCount++; 666 } 667 if (r == 0) // strings == 668 break; 669 if (r < 0) // line < guess 670 hi = guess; 671 else // line > guess 672 lo = guess; 673 } 674 } 675 } 676 elapsedTime = timeGetTime() - startTime; 677 break; 678 } 679 break; 680 } 681 682 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 683 if (opt_terse == FALSE) { 684 printf("binary search: total # of string compares = %d\n", gCount); 685 printf("binary search: compares per loop = %d\n", gCount / loops); 686 printf("binary search: time per compare = %d ns\n", ns); 687 } else { 688 printf("%d, ", ns); 689 } 690 691 } 692 693 694 695 696 //--------------------------------------------------------------------------------------- 697 // 698 // doQSort() The quick sort timing test. Uses the C library qsort function. 699 // 700 //--------------------------------------------------------------------------------------- 701 void doQSort() { 702 int i; 703 Line **sortBuf = new Line *[gNumFileLines]; 704 705 // Adjust loop count to compensate for file size. QSort should be n log(n) 706 double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines)); 707 if (opt_usekeys) dLoopCount *= 5; 708 int adj_loopCount = int(dLoopCount); 709 if (adj_loopCount < 1) adj_loopCount = 1; 710 711 712 gCount = 0; 713 unsigned long startTime = timeGetTime(); 714 if (opt_win && opt_usekeys) { 715 for (i=0; i<opt_loopCount; i++) { 716 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 717 qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpK); 718 } 719 } 720 721 else if (opt_win && opt_uselen) { 722 for (i=0; i<adj_loopCount; i++) { 723 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 724 qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpL); 725 } 726 } 727 728 729 else if (opt_win && !opt_uselen) { 730 for (i=0; i<adj_loopCount; i++) { 731 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 732 qsort(sortBuf, gNumFileLines, sizeof(Line *), Winstrcmp); 733 } 734 } 735 736 else if (opt_icu && opt_usekeys) { 737 for (i=0; i<adj_loopCount; i++) { 738 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 739 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpK); 740 } 741 } 742 743 else if (opt_icu && opt_uselen) { 744 for (i=0; i<adj_loopCount; i++) { 745 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 746 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpL); 747 } 748 } 749 750 751 else if (opt_icu && !opt_uselen) { 752 for (i=0; i<adj_loopCount; i++) { 753 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 754 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmp); 755 } 756 } 757 758 else if (opt_unix && !opt_usekeys) { 759 for (i=0; i<adj_loopCount; i++) { 760 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 761 qsort(sortBuf, gNumFileLines, sizeof(Line *), UNIXstrcmp); 762 } 763 } 764 765 unsigned long elapsedTime = timeGetTime() - startTime; 766 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 767 if (opt_terse == FALSE) { 768 printf("qsort: total # of string compares = %d\n", gCount); 769 printf("qsort: time per compare = %d ns\n", ns); 770 } else { 771 printf("%d, ", ns); 772 } 773 } 774 775 776 777 //--------------------------------------------------------------------------------------- 778 // 779 // doKeyHist() Output a table of data for 780 // average sort key size vs. string length. 781 // 782 //--------------------------------------------------------------------------------------- 783 void doKeyHist() { 784 int i; 785 int maxLen = 0; 786 787 // Find the maximum string length 788 for (i=0; i<gNumFileLines; i++) { 789 if (gFileLines[i].len > maxLen) maxLen = gFileLines[i].len; 790 } 791 792 // Allocate arrays to hold the histogram data 793 int *accumulatedLen = new int[maxLen+1]; 794 int *numKeysOfSize = new int[maxLen+1]; 795 for (i=0; i<=maxLen; i++) { 796 accumulatedLen[i] = 0; 797 numKeysOfSize[i] = 0; 798 } 799 800 // Fill the arrays... 801 for (i=0; i<gNumFileLines; i++) { 802 int len = gFileLines[i].len; 803 accumulatedLen[len] += strlen(gFileLines[i].icuSortKey); 804 numKeysOfSize[len] += 1; 805 } 806 807 // And write out averages 808 printf("String Length, Avg Key Length, Avg Key Len per char\n"); 809 for (i=1; i<=maxLen; i++) { 810 if (numKeysOfSize[i] > 0) { 811 printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysOfSize[i], 812 (float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i)); 813 } 814 } 815 delete []accumulatedLen; 816 delete []numKeysOfSize ; 817 } 818 819 //--------------------------------------------------------------------------------------- 820 // 821 // doForwardIterTest(UBool) Forward iteration test 822 // argument null-terminated string used 823 // 824 //--------------------------------------------------------------------------------------- 825 void doForwardIterTest(UBool haslen) { 826 int count = 0; 827 828 UErrorCode error = U_ZERO_ERROR; 829 printf("\n\nPerforming forward iteration performance test with "); 830 831 if (haslen) { 832 printf("non-null terminated data -----------\n"); 833 } 834 else { 835 printf("null terminated data -----------\n"); 836 } 837 printf("performance test on strings from file -----------\n"); 838 839 UChar dummytext[] = {0, 0}; 840 UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error); 841 ucol_setText(iter, dummytext, 1, &error); 842 843 gCount = 0; 844 unsigned long startTime = timeGetTime(); 845 while (count < opt_loopCount) { 846 int linecount = 0; 847 while (linecount < gNumFileLines) { 848 UChar *str = gFileLines[linecount].name; 849 int strlen = haslen?gFileLines[linecount].len:-1; 850 ucol_setText(iter, str, strlen, &error); 851 while (ucol_next(iter, &error) != UCOL_NULLORDER) { 852 gCount++; 853 } 854 855 linecount ++; 856 } 857 count ++; 858 } 859 unsigned long elapsedTime = timeGetTime() - startTime; 860 printf("elapsedTime %ld\n", elapsedTime); 861 862 // empty loop recalculation 863 count = 0; 864 startTime = timeGetTime(); 865 while (count < opt_loopCount) { 866 int linecount = 0; 867 while (linecount < gNumFileLines) { 868 UChar *str = gFileLines[linecount].name; 869 int strlen = haslen?gFileLines[linecount].len:-1; 870 ucol_setText(iter, str, strlen, &error); 871 linecount ++; 872 } 873 count ++; 874 } 875 elapsedTime -= (timeGetTime() - startTime); 876 printf("elapsedTime %ld\n", elapsedTime); 877 878 ucol_closeElements(iter); 879 880 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 881 printf("Total number of strings compared %d in %d loops\n", gNumFileLines, 882 opt_loopCount); 883 printf("Average time per ucol_next() nano seconds %d\n", ns); 884 885 printf("performance test on skipped-5 concatenated strings from file -----------\n"); 886 887 UChar *str; 888 int strlen = 0; 889 // appending all the strings 890 int linecount = 0; 891 while (linecount < gNumFileLines) { 892 strlen += haslen?gFileLines[linecount].len: 893 u_strlen(gFileLines[linecount].name); 894 linecount ++; 895 } 896 str = (UChar *)malloc(sizeof(UChar) * strlen); 897 int strindex = 0; 898 linecount = 0; 899 while (strindex < strlen) { 900 int len = 0; 901 len += haslen?gFileLines[linecount].len: 902 u_strlen(gFileLines[linecount].name); 903 memcpy(str + strindex, gFileLines[linecount].name, 904 sizeof(UChar) * len); 905 strindex += len; 906 linecount ++; 907 } 908 909 printf("Total size of strings %d\n", strlen); 910 911 gCount = 0; 912 count = 0; 913 914 if (!haslen) { 915 strlen = -1; 916 } 917 iter = ucol_openElements(gCol, str, strlen, &error); 918 if (!haslen) { 919 strlen = u_strlen(str); 920 } 921 strlen -= 5; // any left over characters are not iterated, 922 // this is to ensure the backwards and forwards iterators 923 // gets the same position 924 startTime = timeGetTime(); 925 while (count < opt_loopCount) { 926 int count5 = 5; 927 strindex = 0; 928 ucol_setOffset(iter, strindex, &error); 929 while (TRUE) { 930 if (ucol_next(iter, &error) == UCOL_NULLORDER) { 931 break; 932 } 933 gCount++; 934 count5 --; 935 if (count5 == 0) { 936 strindex += 10; 937 if (strindex > strlen) { 938 break; 939 } 940 ucol_setOffset(iter, strindex, &error); 941 count5 = 5; 942 } 943 } 944 count ++; 945 } 946 947 elapsedTime = timeGetTime() - startTime; 948 printf("elapsedTime %ld\n", elapsedTime); 949 950 // empty loop recalculation 951 int tempgCount = 0; 952 count = 0; 953 startTime = timeGetTime(); 954 while (count < opt_loopCount) { 955 int count5 = 5; 956 strindex = 0; 957 ucol_setOffset(iter, strindex, &error); 958 while (TRUE) { 959 tempgCount ++; 960 count5 --; 961 if (count5 == 0) { 962 strindex += 10; 963 if (strindex > strlen) { 964 break; 965 } 966 ucol_setOffset(iter, strindex, &error); 967 count5 = 5; 968 } 969 } 970 count ++; 971 } 972 elapsedTime -= (timeGetTime() - startTime); 973 printf("elapsedTime %ld\n", elapsedTime); 974 975 ucol_closeElements(iter); 976 977 printf("gCount %d\n", gCount); 978 ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 979 printf("Average time per ucol_next() nano seconds %d\n", ns); 980 } 981 982 //--------------------------------------------------------------------------------------- 983 // 984 // doBackwardIterTest(UBool) Backwards iteration test 985 // argument null-terminated string used 986 // 987 //--------------------------------------------------------------------------------------- 988 void doBackwardIterTest(UBool haslen) { 989 int count = 0; 990 UErrorCode error = U_ZERO_ERROR; 991 printf("\n\nPerforming backward iteration performance test with "); 992 993 if (haslen) { 994 printf("non-null terminated data -----------\n"); 995 } 996 else { 997 printf("null terminated data -----------\n"); 998 } 999 1000 printf("performance test on strings from file -----------\n"); 1001 1002 UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error); 1003 UChar dummytext[] = {0, 0}; 1004 ucol_setText(iter, dummytext, 1, &error); 1005 1006 gCount = 0; 1007 unsigned long startTime = timeGetTime(); 1008 while (count < opt_loopCount) { 1009 int linecount = 0; 1010 while (linecount < gNumFileLines) { 1011 UChar *str = gFileLines[linecount].name; 1012 int strlen = haslen?gFileLines[linecount].len:-1; 1013 ucol_setText(iter, str, strlen, &error); 1014 while (ucol_previous(iter, &error) != UCOL_NULLORDER) { 1015 gCount ++; 1016 } 1017 1018 linecount ++; 1019 } 1020 count ++; 1021 } 1022 unsigned long elapsedTime = timeGetTime() - startTime; 1023 1024 printf("elapsedTime %ld\n", elapsedTime); 1025 1026 // empty loop recalculation 1027 count = 0; 1028 startTime = timeGetTime(); 1029 while (count < opt_loopCount) { 1030 int linecount = 0; 1031 while (linecount < gNumFileLines) { 1032 UChar *str = gFileLines[linecount].name; 1033 int strlen = haslen?gFileLines[linecount].len:-1; 1034 ucol_setText(iter, str, strlen, &error); 1035 linecount ++; 1036 } 1037 count ++; 1038 } 1039 elapsedTime -= (timeGetTime() - startTime); 1040 1041 printf("elapsedTime %ld\n", elapsedTime); 1042 ucol_closeElements(iter); 1043 1044 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 1045 printf("Total number of strings compared %d in %d loops\n", gNumFileLines, 1046 opt_loopCount); 1047 printf("Average time per ucol_previous() nano seconds %d\n", ns); 1048 1049 printf("performance test on skipped-5 concatenated strings from file -----------\n"); 1050 1051 UChar *str; 1052 int strlen = 0; 1053 // appending all the strings 1054 int linecount = 0; 1055 while (linecount < gNumFileLines) { 1056 strlen += haslen?gFileLines[linecount].len: 1057 u_strlen(gFileLines[linecount].name); 1058 linecount ++; 1059 } 1060 str = (UChar *)malloc(sizeof(UChar) * strlen); 1061 int strindex = 0; 1062 linecount = 0; 1063 while (strindex < strlen) { 1064 int len = 0; 1065 len += haslen?gFileLines[linecount].len: 1066 u_strlen(gFileLines[linecount].name); 1067 memcpy(str + strindex, gFileLines[linecount].name, 1068 sizeof(UChar) * len); 1069 strindex += len; 1070 linecount ++; 1071 } 1072 1073 printf("Total size of strings %d\n", strlen); 1074 1075 gCount = 0; 1076 count = 0; 1077 1078 if (!haslen) { 1079 strlen = -1; 1080 } 1081 1082 iter = ucol_openElements(gCol, str, strlen, &error); 1083 if (!haslen) { 1084 strlen = u_strlen(str); 1085 } 1086 1087 startTime = timeGetTime(); 1088 while (count < opt_loopCount) { 1089 int count5 = 5; 1090 strindex = 5; 1091 ucol_setOffset(iter, strindex, &error); 1092 while (TRUE) { 1093 if (ucol_previous(iter, &error) == UCOL_NULLORDER) { 1094 break; 1095 } 1096 gCount ++; 1097 count5 --; 1098 if (count5 == 0) { 1099 strindex += 10; 1100 if (strindex > strlen) { 1101 break; 1102 } 1103 ucol_setOffset(iter, strindex, &error); 1104 count5 = 5; 1105 } 1106 } 1107 count ++; 1108 } 1109 1110 elapsedTime = timeGetTime() - startTime; 1111 printf("elapsedTime %ld\n", elapsedTime); 1112 1113 // empty loop recalculation 1114 count = 0; 1115 int tempgCount = 0; 1116 startTime = timeGetTime(); 1117 while (count < opt_loopCount) { 1118 int count5 = 5; 1119 strindex = 5; 1120 ucol_setOffset(iter, strindex, &error); 1121 while (TRUE) { 1122 tempgCount ++; 1123 count5 --; 1124 if (count5 == 0) { 1125 strindex += 10; 1126 if (strindex > strlen) { 1127 break; 1128 } 1129 ucol_setOffset(iter, strindex, &error); 1130 count5 = 5; 1131 } 1132 } 1133 count ++; 1134 } 1135 elapsedTime -= (timeGetTime() - startTime); 1136 printf("elapsedTime %ld\n", elapsedTime); 1137 ucol_closeElements(iter); 1138 1139 printf("gCount %d\n", gCount); 1140 ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 1141 printf("Average time per ucol_previous() nano seconds %d\n", ns); 1142 } 1143 1144 //--------------------------------------------------------------------------------------- 1145 // 1146 // doIterTest() Iteration test 1147 // 1148 //--------------------------------------------------------------------------------------- 1149 void doIterTest() { 1150 doForwardIterTest(opt_uselen); 1151 doBackwardIterTest(opt_uselen); 1152 } 1153 1154 1155 //---------------------------------------------------------------------------------------- 1156 // 1157 // UnixConvert -- Convert the lines of the file to the encoding for UNIX 1158 // Since it appears that Unicode support is going in the general 1159 // direction of the use of UTF-8 locales, that is the approach 1160 // that is used here. 1161 // 1162 //---------------------------------------------------------------------------------------- 1163 void UnixConvert() { 1164 int line; 1165 1166 UConverter *cvrtr; // An ICU code page converter. 1167 UErrorCode status = U_ZERO_ERROR; 1168 1169 1170 cvrtr = ucnv_open("utf-8", &status); // we are just doing UTF-8 locales for now. 1171 if (U_FAILURE(status)) { 1172 fprintf(stderr, "ICU Converter open failed.: %s\n", u_errorName(status)); 1173 exit(-1); 1174 } 1175 1176 for (line=0; line < gNumFileLines; line++) { 1177 int sizeNeeded = ucnv_fromUChars(cvrtr, 1178 0, // ptr to target buffer. 1179 0, // length of target buffer. 1180 gFileLines[line].name, 1181 -1, // source is null terminated 1182 &status); 1183 if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) { 1184 //fprintf(stderr, "Conversion from Unicode, something is wrong.\n"); 1185 //exit(-1); 1186 } 1187 status = U_ZERO_ERROR; 1188 gFileLines[line].unixName = new char[sizeNeeded+1]; 1189 sizeNeeded = ucnv_fromUChars(cvrtr, 1190 gFileLines[line].unixName, // ptr to target buffer. 1191 sizeNeeded+1, // length of target buffer. 1192 gFileLines[line].name, 1193 -1, // source is null terminated 1194 &status); 1195 if (U_FAILURE(status)) { 1196 fprintf(stderr, "ICU Conversion Failed.: %d\n", status); 1197 exit(-1); 1198 } 1199 gFileLines[line].unixName[sizeNeeded] = 0; 1200 }; 1201 ucnv_close(cvrtr); 1202 } 1203 1204 1205 //---------------------------------------------------------------------------------------- 1206 // 1207 // class UCharFile Class to hide all the gorp to read a file in 1208 // and produce a stream of UChars. 1209 // 1210 //---------------------------------------------------------------------------------------- 1211 class UCharFile { 1212 public: 1213 UCharFile(const char *fileName); 1214 ~UCharFile(); 1215 UChar get(); 1216 UBool eof() {return fEof;}; 1217 UBool error() {return fError;}; 1218 1219 private: 1220 UCharFile (const UCharFile & /*other*/) {}; // No copy constructor. 1221 UCharFile & operator = (const UCharFile &/*other*/) {return *this;}; // No assignment op 1222 1223 FILE *fFile; 1224 const char *fName; 1225 UBool fEof; 1226 UBool fError; 1227 UChar fPending2ndSurrogate; 1228 1229 enum {UTF16LE, UTF16BE, UTF8} fEncoding; 1230 }; 1231 1232 UCharFile::UCharFile(const char * fileName) { 1233 fEof = FALSE; 1234 fError = FALSE; 1235 fName = fileName; 1236 fFile = fopen(fName, "rb"); 1237 fPending2ndSurrogate = 0; 1238 if (fFile == NULL) { 1239 fprintf(stderr, "Can not open file \"%s\"\n", opt_fName); 1240 fError = TRUE; 1241 return; 1242 } 1243 // 1244 // Look for the byte order mark at the start of the file. 1245 // 1246 int BOMC1, BOMC2, BOMC3; 1247 BOMC1 = fgetc(fFile); 1248 BOMC2 = fgetc(fFile); 1249 1250 if (BOMC1 == 0xff && BOMC2 == 0xfe) { 1251 fEncoding = UTF16LE; } 1252 else if (BOMC1 == 0xfe && BOMC2 == 0xff) { 1253 fEncoding = UTF16BE; } 1254 else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) { 1255 fEncoding = UTF8; } 1256 else 1257 { 1258 fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and " 1259 "must include a BOM.\n", fileName); 1260 fError = true; 1261 return; 1262 } 1263 } 1264 1265 1266 UCharFile::~UCharFile() { 1267 fclose(fFile); 1268 } 1269 1270 1271 1272 UChar UCharFile::get() { 1273 UChar c; 1274 switch (fEncoding) { 1275 case UTF16LE: 1276 { 1277 int cL, cH; 1278 cL = fgetc(fFile); 1279 cH = fgetc(fFile); 1280 c = cL | (cH << 8); 1281 if (cH == EOF) { 1282 c = 0; 1283 fEof = TRUE; 1284 } 1285 break; 1286 } 1287 case UTF16BE: 1288 { 1289 int cL, cH; 1290 cH = fgetc(fFile); 1291 cL = fgetc(fFile); 1292 c = cL | (cH << 8); 1293 if (cL == EOF) { 1294 c = 0; 1295 fEof = TRUE; 1296 } 1297 break; 1298 } 1299 case UTF8: 1300 { 1301 if (fPending2ndSurrogate != 0) { 1302 c = fPending2ndSurrogate; 1303 fPending2ndSurrogate = 0; 1304 break; 1305 } 1306 1307 int ch = fgetc(fFile); // Note: c and ch are separate cause eof test doesn't work on UChar type. 1308 if (ch == EOF) { 1309 c = 0; 1310 fEof = TRUE; 1311 break; 1312 } 1313 1314 if (ch <= 0x7f) { 1315 // It's ascii. No further utf-8 conversion. 1316 c = ch; 1317 break; 1318 } 1319 1320 // Figure out the lenght of the char and read the rest of the bytes 1321 // into a temp array. 1322 int nBytes; 1323 if (ch >= 0xF0) {nBytes=4;} 1324 else if (ch >= 0xE0) {nBytes=3;} 1325 else if (ch >= 0xC0) {nBytes=2;} 1326 else { 1327 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n"); 1328 fError = TRUE; 1329 return 0; 1330 } 1331 1332 unsigned char bytes[10]; 1333 bytes[0] = (unsigned char)ch; 1334 int i; 1335 for (i=1; i<nBytes; i++) { 1336 bytes[i] = fgetc(fFile); 1337 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) { 1338 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n"); 1339 fError = TRUE; 1340 return 0; 1341 } 1342 } 1343 1344 // Convert the bytes from the temp array to a Unicode char. 1345 i = 0; 1346 uint32_t cp; 1347 U8_NEXT_UNSAFE(bytes, i, cp); 1348 c = (UChar)cp; 1349 1350 if (cp >= 0x10000) { 1351 // The code point needs to be broken up into a utf-16 surrogate pair. 1352 // Process first half this time through the main loop, and 1353 // remember the other half for the next time through. 1354 UChar utf16Buf[3]; 1355 i = 0; 1356 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp); 1357 fPending2ndSurrogate = utf16Buf[1]; 1358 c = utf16Buf[0]; 1359 } 1360 break; 1361 }; 1362 default: 1363 c = 0xFFFD; /* Error, unspecified codepage*/ 1364 fprintf(stderr, "UCharFile: Error: unknown fEncoding\n"); 1365 exit(1); 1366 } 1367 return c; 1368 } 1369 1370 //---------------------------------------------------------------------------------------- 1371 // 1372 // openRulesCollator - Command line specified a rules file. Read it in 1373 // and open a collator with it. 1374 // 1375 //---------------------------------------------------------------------------------------- 1376 UCollator *openRulesCollator() { 1377 UCharFile f(opt_rules); 1378 if (f.error()) { 1379 return 0; 1380 } 1381 1382 int bufLen = 10000; 1383 UChar *buf = (UChar *)malloc(bufLen * sizeof(UChar)); 1384 UChar *tmp; 1385 int i = 0; 1386 1387 for(;;) { 1388 buf[i] = f.get(); 1389 if (f.eof()) { 1390 break; 1391 } 1392 if (f.error()) { 1393 return 0; 1394 } 1395 i++; 1396 if (i >= bufLen) { 1397 tmp = buf; 1398 bufLen += 10000; 1399 buf = (UChar *)realloc(buf, bufLen); 1400 if (buf == NULL) { 1401 free(tmp); 1402 return 0; 1403 } 1404 } 1405 } 1406 buf[i] = 0; 1407 1408 UErrorCode status = U_ZERO_ERROR; 1409 UCollator *coll = ucol_openRules(buf, u_strlen(buf), UCOL_OFF, 1410 UCOL_DEFAULT_STRENGTH, NULL, &status); 1411 if (U_FAILURE(status)) { 1412 fprintf(stderr, "ICU ucol_openRules() open failed.: %d\n", status); 1413 return 0; 1414 } 1415 free(buf); 1416 return coll; 1417 } 1418 1419 1420 1421 1422 1423 //---------------------------------------------------------------------------------------- 1424 // 1425 // Main -- process command line, read in and pre-process the test file, 1426 // call other functions to do the actual tests. 1427 // 1428 //---------------------------------------------------------------------------------------- 1429 int main(int argc, const char** argv) { 1430 if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) { 1431 printf(gUsageString); 1432 exit (1); 1433 } 1434 1435 // Make sure that we've only got one API selected. 1436 if (opt_unix || opt_win) opt_icu = FALSE; 1437 if (opt_unix) opt_win = FALSE; 1438 1439 // 1440 // Set up an ICU collator 1441 // 1442 UErrorCode status = U_ZERO_ERROR; 1443 1444 if (opt_rules != 0) { 1445 gCol = openRulesCollator(); 1446 if (gCol == 0) {return -1;} 1447 } 1448 else { 1449 gCol = ucol_open(opt_locale, &status); 1450 if (U_FAILURE(status)) { 1451 fprintf(stderr, "Collator creation failed.: %d\n", status); 1452 return -1; 1453 } 1454 } 1455 if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) { 1456 fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale); 1457 } 1458 if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) { 1459 fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale); 1460 } 1461 1462 if (opt_norm) { 1463 ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); 1464 } 1465 if (opt_french && opt_frenchoff) { 1466 fprintf(stderr, "collperf: Error, specified both -french and -frenchoff options."); 1467 exit(-1); 1468 } 1469 if (opt_french) { 1470 ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status); 1471 } 1472 if (opt_frenchoff) { 1473 ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status); 1474 } 1475 if (opt_lower) { 1476 ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status); 1477 } 1478 if (opt_upper) { 1479 ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status); 1480 } 1481 if (opt_case) { 1482 ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status); 1483 } 1484 if (opt_shifted) { 1485 ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status); 1486 } 1487 if (opt_level != 0) { 1488 switch (opt_level) { 1489 case 1: 1490 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status); 1491 break; 1492 case 2: 1493 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status); 1494 break; 1495 case 3: 1496 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status); 1497 break; 1498 case 4: 1499 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status); 1500 break; 1501 case 5: 1502 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status); 1503 break; 1504 default: 1505 fprintf(stderr, "-level param must be between 1 and 5\n"); 1506 exit(-1); 1507 } 1508 } 1509 1510 if (U_FAILURE(status)) { 1511 fprintf(stderr, "Collator attribute setting failed.: %d\n", status); 1512 return -1; 1513 } 1514 1515 1516 // 1517 // Set up a Windows LCID 1518 // 1519 if (opt_langid != 0) { 1520 gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT); 1521 } 1522 else { 1523 gWinLCID = uloc_getLCID(opt_locale); 1524 } 1525 1526 1527 // 1528 // Set the UNIX locale 1529 // 1530 if (opt_unix) { 1531 if (setlocale(LC_ALL, opt_locale) == 0) { 1532 fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale); 1533 exit(-1); 1534 } 1535 } 1536 1537 // Read in the input file. 1538 // File assumed to be utf-16. 1539 // Lines go onto heap buffers. Global index array to line starts is created. 1540 // Lines themselves are null terminated. 1541 // 1542 1543 UCharFile f(opt_fName); 1544 if (f.error()) { 1545 exit(-1); 1546 } 1547 1548 const int MAXLINES = 100000; 1549 gFileLines = new Line[MAXLINES]; 1550 UChar buf[1024]; 1551 int column = 0; 1552 1553 // Read the file, split into lines, and save in memory. 1554 // Loop runs once per utf-16 value from the input file, 1555 // (The number of bytes read from file per loop iteration depends on external encoding.) 1556 for (;;) { 1557 1558 UChar c = f.get(); 1559 if (f.error()){ 1560 exit(-1); 1561 } 1562 1563 1564 // We now have a good UTF-16 value in c. 1565 1566 // Watch for CR, LF, EOF; these finish off a line. 1567 if (c == 0xd) { 1568 continue; 1569 } 1570 1571 if (f.eof() || c == 0x0a || c==0x2028) { // Unipad inserts 2028 line separators! 1572 buf[column++] = 0; 1573 if (column > 1) { 1574 gFileLines[gNumFileLines].name = new UChar[column]; 1575 gFileLines[gNumFileLines].len = column-1; 1576 memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UChar)); 1577 gNumFileLines++; 1578 column = 0; 1579 if (gNumFileLines >= MAXLINES) { 1580 fprintf(stderr, "File too big. Max number of lines is %d\n", MAXLINES); 1581 exit(-1); 1582 } 1583 1584 } 1585 if (c == 0xa || c == 0x2028) 1586 continue; 1587 else 1588 break; // EOF 1589 } 1590 buf[column++] = c; 1591 if (column >= 1023) 1592 { 1593 static UBool warnFlag = TRUE; 1594 if (warnFlag) { 1595 fprintf(stderr, "Warning - file line longer than 1023 chars truncated.\n"); 1596 warnFlag = FALSE; 1597 } 1598 column--; 1599 } 1600 } 1601 1602 if (opt_terse == FALSE) { 1603 printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines); 1604 } 1605 1606 1607 // Convert the lines to the UNIX encoding. 1608 if (opt_unix) { 1609 UnixConvert(); 1610 } 1611 1612 // 1613 // Pre-compute ICU sort keys for the lines of the file. 1614 // 1615 int line; 1616 int32_t t; 1617 1618 for (line=0; line<gNumFileLines; line++) { 1619 t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)buf, sizeof(buf)); 1620 gFileLines[line].icuSortKey = new char[t]; 1621 1622 if (t > (int32_t)sizeof(buf)) { 1623 t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)gFileLines[line].icuSortKey , t); 1624 } 1625 else 1626 { 1627 memcpy(gFileLines[line].icuSortKey, buf, t); 1628 } 1629 } 1630 1631 1632 1633 // 1634 // Pre-compute Windows sort keys for the lines of the file. 1635 // 1636 for (line=0; line<gNumFileLines; line++) { 1637 t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf, sizeof(buf)); 1638 gFileLines[line].winSortKey = new char[t]; 1639 if (t > (int32_t)sizeof(buf)) { 1640 t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, (unsigned short *)(gFileLines[line].winSortKey), t); 1641 } 1642 else 1643 { 1644 memcpy(gFileLines[line].winSortKey, buf, t); 1645 } 1646 } 1647 1648 // 1649 // Pre-compute UNIX sort keys for the lines of the file. 1650 // 1651 if (opt_unix) { 1652 for (line=0; line<gNumFileLines; line++) { 1653 t=strxfrm((char *)buf, gFileLines[line].unixName, sizeof(buf)); 1654 gFileLines[line].unixSortKey = new char[t]; 1655 if (t > (int32_t)sizeof(buf)) { 1656 t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, sizeof(buf)); 1657 } 1658 else 1659 { 1660 memcpy(gFileLines[line].unixSortKey, buf, t); 1661 } 1662 } 1663 } 1664 1665 1666 // 1667 // Dump file lines, CEs, Sort Keys if requested. 1668 // 1669 if (opt_dump) { 1670 int i; 1671 for (line=0; line<gNumFileLines; line++) { 1672 for (i=0;;i++) { 1673 UChar c = gFileLines[line].name[i]; 1674 if (c == 0) 1675 break; 1676 if (c < 0x20 || c > 0x7e) { 1677 printf("\\u%.4x", c); 1678 } 1679 else { 1680 printf("%c", c); 1681 } 1682 } 1683 printf("\n"); 1684 1685 printf(" CEs: "); 1686 UCollationElements *CEiter = ucol_openElements(gCol, gFileLines[line].name, -1, &status); 1687 int32_t ce; 1688 i = 0; 1689 for (;;) { 1690 ce = ucol_next(CEiter, &status); 1691 if (ce == UCOL_NULLORDER) { 1692 break; 1693 } 1694 printf(" %.8x", ce); 1695 if (++i > 8) { 1696 printf("\n "); 1697 i = 0; 1698 } 1699 } 1700 printf("\n"); 1701 ucol_closeElements(CEiter); 1702 1703 1704 printf(" ICU Sort Key: "); 1705 for (i=0; ; i++) { 1706 unsigned char c = gFileLines[line].icuSortKey[i]; 1707 printf("%02x ", c); 1708 if (c == 0) { 1709 break; 1710 } 1711 if (i > 0 && i % 20 == 0) { 1712 printf("\n "); 1713 } 1714 } 1715 printf("\n"); 1716 } 1717 } 1718 1719 1720 // 1721 // Pre-sort the lines. 1722 // 1723 int i; 1724 gSortedLines = new Line *[gNumFileLines]; 1725 for (i=0; i<gNumFileLines; i++) { 1726 gSortedLines[i] = &gFileLines[i]; 1727 } 1728 1729 if (opt_win) { 1730 qsort(gSortedLines, gNumFileLines, sizeof(Line *), Winstrcmp); 1731 } 1732 else if (opt_unix) { 1733 qsort(gSortedLines, gNumFileLines, sizeof(Line *), UNIXstrcmp); 1734 } 1735 else /* ICU */ 1736 { 1737 qsort(gSortedLines, gNumFileLines, sizeof(Line *), ICUstrcmp); 1738 } 1739 1740 1741 // 1742 // Make up a randomized order, will be used for sorting tests. 1743 // 1744 gRandomLines = new Line *[gNumFileLines]; 1745 for (i=0; i<gNumFileLines; i++) { 1746 gRandomLines[i] = &gFileLines[i]; 1747 } 1748 qsort(gRandomLines, gNumFileLines, sizeof(Line *), ICURandomCmp); 1749 1750 1751 1752 1753 // 1754 // We've got the file read into memory. Go do something with it. 1755 // 1756 1757 if (opt_qsort) doQSort(); 1758 if (opt_binsearch) doBinarySearch(); 1759 if (opt_keygen) doKeyGen(); 1760 if (opt_keyhist) doKeyHist(); 1761 if (opt_itertest) doIterTest(); 1762 1763 return 0; 1764 1765 } 1766