1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (C) 2001-2009 IBM, Inc. All Rights Reserved. 4 * 5 ********************************************************************/ 6 /******************************************************************************** 7 * 8 * File CALLCOLL.C 9 * 10 * Modification History: 11 * Name Description 12 * Andy Heninger First Version 13 * 14 ********************************************************************************* 15 */ 16 17 // 18 // This program tests string collation and sort key generation performance. 19 // Three APIs can be teste: ICU C , Unix strcoll, strxfrm and Windows LCMapString 20 // A file of names is required as input, one per line. It must be in utf-8 or utf-16 format, 21 // and include a byte order mark. Either LE or BE format is OK. 22 // 23 24 const char gUsageString[] = 25 "usage: collperf options...\n" 26 "-help Display this message.\n" 27 "-file file_name utf-16 format file of names.\n" 28 "-locale name ICU locale to use. Default is en_US\n" 29 "-rules file_name Collation rules file (overrides locale)\n" 30 "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n" 31 " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n" 32 "-win Run test using Windows native services. (ICU is default)\n" 33 "-unix Run test using Unix strxfrm, strcoll services.\n" 34 "-uselen Use API with string lengths. Default is null-terminated strings\n" 35 "-usekeys Run tests using sortkeys rather than strcoll\n" 36 "-strcmp Run tests using u_strcmp rather than strcoll\n" 37 "-strcmpCPO Run tests using u_strcmpCodePointOrder rather than strcoll\n" 38 "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n" 39 "-iloop n Inner Loop Count. Default = 1. Number of calls to function\n" 40 " under test at each call point. For measuring test overhead.\n" 41 "-terse Terse numbers-only output. Intended for use by scripts.\n" 42 "-french French accent ordering\n" 43 "-frenchoff No French accent ordering (for use with French locales.)\n" 44 "-norm Normalizing mode on\n" 45 "-shifted Shifted mode\n" 46 "-lower Lower case first\n" 47 "-upper Upper case first\n" 48 "-case Enable separate case level\n" 49 "-level n Sort level, 1 to 5, for Primary, Secndary, Tertiary, Quaternary, Identical\n" 50 "-keyhist Produce a table sort key size vs. string length\n" 51 "-binsearch Binary Search timing test\n" 52 "-keygen Sort Key Generation timing test\n" 53 "-qsort Quicksort timing test\n" 54 "-iter Iteration Performance Test\n" 55 "-dump Display strings, sort keys and CEs.\n" 56 ; 57 58 59 60 #include <stdio.h> 61 #include <string.h> 62 #include <stdlib.h> 63 #include <math.h> 64 #include <locale.h> 65 #include <errno.h> 66 67 #include <unicode/utypes.h> 68 #include <unicode/ucol.h> 69 #include <unicode/ucoleitr.h> 70 #include <unicode/uloc.h> 71 #include <unicode/ustring.h> 72 #include <unicode/ures.h> 73 #include <unicode/uchar.h> 74 #include <unicode/ucnv.h> 75 #include <unicode/utf8.h> 76 77 #ifdef WIN32 78 #include <windows.h> 79 #else 80 // 81 // Stubs for Windows API functions when building on UNIXes. 82 // 83 typedef int DWORD; 84 inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;} 85 #include <sys/time.h> 86 unsigned long timeGetTime() { 87 struct timeval t; 88 gettimeofday(&t, 0); 89 unsigned long val = t.tv_sec * 1000; // Let it overflow. Who cares. 90 val += t.tv_usec / 1000; 91 return val; 92 } 93 inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;} 94 const int LCMAP_SORTKEY = 0; 95 #define MAKELCID(a,b) 0 96 const int SORT_DEFAULT = 0; 97 #endif 98 99 100 101 // 102 // Command line option variables 103 // These global variables are set according to the options specified 104 // on the command line by the user. 105 char * opt_fName = 0; 106 const char * opt_locale = "en_US"; 107 int opt_langid = 0; // Defaults to value corresponding to opt_locale. 108 char * opt_rules = 0; 109 UBool opt_help = FALSE; 110 int opt_loopCount = 1; 111 int opt_iLoopCount = 1; 112 UBool opt_terse = FALSE; 113 UBool opt_qsort = FALSE; 114 UBool opt_binsearch = FALSE; 115 UBool opt_icu = TRUE; 116 UBool opt_win = FALSE; // Run with Windows native functions. 117 UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions. 118 UBool opt_uselen = FALSE; 119 UBool opt_usekeys = FALSE; 120 UBool opt_strcmp = FALSE; 121 UBool opt_strcmpCPO = FALSE; 122 UBool opt_norm = FALSE; 123 UBool opt_keygen = FALSE; 124 UBool opt_french = FALSE; 125 UBool opt_frenchoff = FALSE; 126 UBool opt_shifted = FALSE; 127 UBool opt_lower = FALSE; 128 UBool opt_upper = FALSE; 129 UBool opt_case = FALSE; 130 int opt_level = 0; 131 UBool opt_keyhist = FALSE; 132 UBool opt_itertest = FALSE; 133 UBool opt_dump = FALSE; 134 135 136 137 // 138 // Definitions for the command line options 139 // 140 struct OptSpec { 141 const char *name; 142 enum {FLAG, NUM, STRING} type; 143 void *pVar; 144 }; 145 146 OptSpec opts[] = { 147 {"-file", OptSpec::STRING, &opt_fName}, 148 {"-locale", OptSpec::STRING, &opt_locale}, 149 {"-langid", OptSpec::NUM, &opt_langid}, 150 {"-rules", OptSpec::STRING, &opt_rules}, 151 {"-qsort", OptSpec::FLAG, &opt_qsort}, 152 {"-binsearch", OptSpec::FLAG, &opt_binsearch}, 153 {"-iter", OptSpec::FLAG, &opt_itertest}, 154 {"-win", OptSpec::FLAG, &opt_win}, 155 {"-unix", OptSpec::FLAG, &opt_unix}, 156 {"-uselen", OptSpec::FLAG, &opt_uselen}, 157 {"-usekeys", OptSpec::FLAG, &opt_usekeys}, 158 {"-strcmp", OptSpec::FLAG, &opt_strcmp}, 159 {"-strcmpCPO", OptSpec::FLAG, &opt_strcmpCPO}, 160 {"-norm", OptSpec::FLAG, &opt_norm}, 161 {"-french", OptSpec::FLAG, &opt_french}, 162 {"-frenchoff", OptSpec::FLAG, &opt_frenchoff}, 163 {"-shifted", OptSpec::FLAG, &opt_shifted}, 164 {"-lower", OptSpec::FLAG, &opt_lower}, 165 {"-upper", OptSpec::FLAG, &opt_upper}, 166 {"-case", OptSpec::FLAG, &opt_case}, 167 {"-level", OptSpec::NUM, &opt_level}, 168 {"-keyhist", OptSpec::FLAG, &opt_keyhist}, 169 {"-keygen", OptSpec::FLAG, &opt_keygen}, 170 {"-loop", OptSpec::NUM, &opt_loopCount}, 171 {"-iloop", OptSpec::NUM, &opt_iLoopCount}, 172 {"-terse", OptSpec::FLAG, &opt_terse}, 173 {"-dump", OptSpec::FLAG, &opt_dump}, 174 {"-help", OptSpec::FLAG, &opt_help}, 175 {"-?", OptSpec::FLAG, &opt_help}, 176 {0, OptSpec::FLAG, 0} 177 }; 178 179 180 //--------------------------------------------------------------------------- 181 // 182 // Global variables pointing to and describing the test file 183 // 184 //--------------------------------------------------------------------------- 185 186 // 187 // struct Line 188 // 189 // Each line from the source file (containing a name, presumably) gets 190 // one of these structs. 191 // 192 struct Line { 193 UChar *name; 194 int len; 195 char *winSortKey; 196 char *icuSortKey; 197 char *unixSortKey; 198 char *unixName; 199 }; 200 201 202 203 Line *gFileLines; // Ptr to array of Line structs, one per line in the file. 204 int gNumFileLines; 205 UCollator *gCol; 206 DWORD gWinLCID; 207 208 Line **gSortedLines; 209 Line **gRandomLines; 210 int gCount; 211 212 213 214 //--------------------------------------------------------------------------- 215 // 216 // ProcessOptions() Function to read the command line options. 217 // 218 //--------------------------------------------------------------------------- 219 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[]) 220 { 221 int i; 222 int argNum; 223 const char *pArgName; 224 OptSpec *pOpt; 225 226 for (argNum=1; argNum<argc; argNum++) { 227 pArgName = argv[argNum]; 228 for (pOpt = opts; pOpt->name != 0; pOpt++) { 229 if (strcmp(pOpt->name, pArgName) == 0) { 230 switch (pOpt->type) { 231 case OptSpec::FLAG: 232 *(UBool *)(pOpt->pVar) = TRUE; 233 break; 234 case OptSpec::STRING: 235 argNum ++; 236 if (argNum >= argc) { 237 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); 238 return FALSE; 239 } 240 *(const char **)(pOpt->pVar) = argv[argNum]; 241 break; 242 case OptSpec::NUM: 243 argNum ++; 244 if (argNum >= argc) { 245 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); 246 return FALSE; 247 } 248 char *endp; 249 i = strtol(argv[argNum], &endp, 0); 250 if (endp == argv[argNum]) { 251 fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name); 252 return FALSE; 253 } 254 *(int *)(pOpt->pVar) = i; 255 } 256 break; 257 } 258 } 259 if (pOpt->name == 0) 260 { 261 fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName); 262 return FALSE; 263 } 264 } 265 return TRUE; 266 } 267 268 //--------------------------------------------------------------------------------------- 269 // 270 // Comparison functions for use by qsort. 271 // 272 // Six flavors, ICU or Windows, SortKey or String Compare, Strings with length 273 // or null terminated. 274 // 275 //--------------------------------------------------------------------------------------- 276 int ICUstrcmpK(const void *a, const void *b) { 277 gCount++; 278 int t = strcmp((*(Line **)a)->icuSortKey, (*(Line **)b)->icuSortKey); 279 return t; 280 } 281 282 283 int ICUstrcmpL(const void *a, const void *b) { 284 gCount++; 285 UCollationResult t; 286 t = ucol_strcoll(gCol, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len); 287 if (t == UCOL_LESS) return -1; 288 if (t == UCOL_GREATER) return +1; 289 return 0; 290 } 291 292 293 int ICUstrcmp(const void *a, const void *b) { 294 gCount++; 295 UCollationResult t; 296 t = ucol_strcoll(gCol, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1); 297 if (t == UCOL_LESS) return -1; 298 if (t == UCOL_GREATER) return +1; 299 return 0; 300 } 301 302 303 int Winstrcmp(const void *a, const void *b) { 304 gCount++; 305 int t; 306 t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1); 307 return t-2; 308 } 309 310 311 int UNIXstrcmp(const void *a, const void *b) { 312 gCount++; 313 int t; 314 t = strcoll((*(Line **)a)->unixName, (*(Line **)b)->unixName); 315 return t; 316 } 317 318 319 int WinstrcmpL(const void *a, const void *b) { 320 gCount++; 321 int t; 322 t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len); 323 return t-2; 324 } 325 326 327 int WinstrcmpK(const void *a, const void *b) { 328 gCount++; 329 int t = strcmp((*(Line **)a)->winSortKey, (*(Line **)b)->winSortKey); 330 return t; 331 } 332 333 334 //--------------------------------------------------------------------------------------- 335 // 336 // Function for sorting the names (lines) into a random order. 337 // Order is based on a hash of the ICU Sort key for the lines 338 // The randomized order is used as input for the sorting timing tests. 339 // 340 //--------------------------------------------------------------------------------------- 341 int ICURandomCmp(const void *a, const void *b) { 342 char *ask = (*(Line **)a)->icuSortKey; 343 char *bsk = (*(Line **)b)->icuSortKey; 344 int aVal = 0; 345 int bVal = 0; 346 int retVal; 347 while (*ask != 0) { 348 aVal += aVal*37 + *ask++; 349 } 350 while (*bsk != 0) { 351 bVal += bVal*37 + *bsk++; 352 } 353 retVal = -1; 354 if (aVal == bVal) { 355 retVal = 0; 356 } 357 else if (aVal > bVal) { 358 retVal = 1; 359 } 360 return retVal; 361 } 362 363 //--------------------------------------------------------------------------------------- 364 // 365 // doKeyGen() Key Generation Timing Test 366 // 367 //--------------------------------------------------------------------------------------- 368 void doKeyGen() 369 { 370 int line; 371 int loops = 0; 372 int iLoop; 373 int t; 374 int len=-1; 375 376 // Adjust loop count to compensate for file size. Should be order n 377 double dLoopCount = double(opt_loopCount) * (1000. / double(gNumFileLines)); 378 int adj_loopCount = int(dLoopCount); 379 if (adj_loopCount < 1) adj_loopCount = 1; 380 381 382 unsigned long startTime = timeGetTime(); 383 384 if (opt_win) { 385 for (loops=0; loops<adj_loopCount; loops++) { 386 for (line=0; line < gNumFileLines; line++) { 387 if (opt_uselen) { 388 len = gFileLines[line].len; 389 } 390 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 391 t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, 392 gFileLines[line].name, len, 393 (unsigned short *)gFileLines[line].winSortKey, 5000); // TODO something with length. 394 } 395 } 396 } 397 } 398 else if (opt_icu) 399 { 400 for (loops=0; loops<adj_loopCount; loops++) { 401 for (line=0; line < gNumFileLines; line++) { 402 if (opt_uselen) { 403 len = gFileLines[line].len; 404 } 405 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 406 t = ucol_getSortKey(gCol, gFileLines[line].name, len, (unsigned char *)gFileLines[line].icuSortKey, 5000); 407 } 408 } 409 } 410 } 411 else if (opt_unix) 412 { 413 for (loops=0; loops<adj_loopCount; loops++) { 414 for (line=0; line < gNumFileLines; line++) { 415 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 416 t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, 5000); 417 } 418 } 419 } 420 } 421 422 unsigned long elapsedTime = timeGetTime() - startTime; 423 int ns = (int)(float(1000000) * (float)elapsedTime / (float)(adj_loopCount*gNumFileLines)); 424 425 if (opt_terse == FALSE) { 426 printf("Sort Key Generation: total # of keys = %d\n", loops*gNumFileLines); 427 printf("Sort Key Generation: time per key = %d ns\n", ns); 428 } 429 else { 430 printf("%d, ", ns); 431 } 432 433 int totalKeyLen = 0; 434 int totalChars = 0; 435 for (line=0; line<gNumFileLines; line++) { 436 totalChars += u_strlen(gFileLines[line].name); 437 if (opt_win) { 438 totalKeyLen += strlen(gFileLines[line].winSortKey); 439 } 440 else if (opt_icu) { 441 totalKeyLen += strlen(gFileLines[line].icuSortKey); 442 } 443 else if (opt_unix) { 444 totalKeyLen += strlen(gFileLines[line].unixSortKey); 445 } 446 447 } 448 if (opt_terse == FALSE) { 449 printf("Key Length / character = %f\n", (float)totalKeyLen / (float)totalChars); 450 } else { 451 printf("%f, ", (float)totalKeyLen / (float)totalChars); 452 } 453 } 454 455 456 457 //--------------------------------------------------------------------------------------- 458 // 459 // doBinarySearch() Binary Search timing test. Each name from the list 460 // is looked up in the full sorted list of names. 461 // 462 //--------------------------------------------------------------------------------------- 463 void doBinarySearch() 464 { 465 466 gCount = 0; 467 int line; 468 int loops = 0; 469 int iLoop = 0; 470 unsigned long elapsedTime = 0; 471 472 // Adjust loop count to compensate for file size. Should be order n (lookups) * log n (compares/lookup) 473 // Accurate timings do not depend on this being perfect. The correction is just to try to 474 // get total running times of about the right order, so the that user doesn't need to 475 // manually adjust the loop count for every different file size. 476 double dLoopCount = double(opt_loopCount) * 3000. / (log10(gNumFileLines) * double(gNumFileLines)); 477 if (opt_usekeys) dLoopCount *= 5; 478 int adj_loopCount = int(dLoopCount); 479 if (adj_loopCount < 1) adj_loopCount = 1; 480 481 482 for (;;) { // not really a loop, just allows "break" to work, to simplify 483 // inadvertantly running more than one test through here. 484 if (opt_strcmp || opt_strcmpCPO) 485 { 486 unsigned long startTime = timeGetTime(); 487 typedef int32_t (U_EXPORT2 *PF)(const UChar *, const UChar *); 488 PF pf = u_strcmp; 489 if (opt_strcmpCPO) {pf = u_strcmpCodePointOrder;} 490 //if (opt_strcmp && opt_win) {pf = (PF)wcscmp;} // Damn the difference between int32_t and int 491 // which forces the use of a cast here. 492 493 int r = 0; 494 for (loops=0; loops<adj_loopCount; loops++) { 495 496 for (line=0; line < gNumFileLines; line++) { 497 int hi = gNumFileLines-1; 498 int lo = 0; 499 int guess = -1; 500 for (;;) { 501 int newGuess = (hi + lo) / 2; 502 if (newGuess == guess) 503 break; 504 guess = newGuess; 505 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 506 r = (*pf)((gSortedLines[line])->name, (gSortedLines[guess])->name); 507 } 508 gCount++; 509 if (r== 0) 510 break; 511 if (r < 0) 512 hi = guess; 513 else 514 lo = guess; 515 } 516 } 517 } 518 elapsedTime = timeGetTime() - startTime; 519 break; 520 } 521 522 523 if (opt_icu) 524 { 525 unsigned long startTime = timeGetTime(); 526 UCollationResult r = UCOL_EQUAL; 527 for (loops=0; loops<adj_loopCount; loops++) { 528 529 for (line=0; line < gNumFileLines; line++) { 530 int lineLen = -1; 531 int guessLen = -1; 532 if (opt_uselen) { 533 lineLen = (gSortedLines[line])->len; 534 } 535 int hi = gNumFileLines-1; 536 int lo = 0; 537 int guess = -1; 538 for (;;) { 539 int newGuess = (hi + lo) / 2; 540 if (newGuess == guess) 541 break; 542 guess = newGuess; 543 int ri = 0; 544 if (opt_usekeys) { 545 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 546 ri = strcmp((gSortedLines[line])->icuSortKey, (gSortedLines[guess])->icuSortKey); 547 } 548 gCount++; 549 r=UCOL_GREATER; if(ri<0) {r=UCOL_LESS;} else if (ri==0) {r=UCOL_EQUAL;} 550 } 551 else 552 { 553 if (opt_uselen) { 554 guessLen = (gSortedLines[guess])->len; 555 } 556 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 557 r = ucol_strcoll(gCol, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen); 558 } 559 gCount++; 560 } 561 if (r== UCOL_EQUAL) 562 break; 563 if (r == UCOL_LESS) 564 hi = guess; 565 else 566 lo = guess; 567 } 568 } 569 } 570 elapsedTime = timeGetTime() - startTime; 571 break; 572 } 573 574 if (opt_win) 575 { 576 unsigned long startTime = timeGetTime(); 577 int r = 0; 578 for (loops=0; loops<adj_loopCount; loops++) { 579 580 for (line=0; line < gNumFileLines; line++) { 581 int lineLen = -1; 582 int guessLen = -1; 583 if (opt_uselen) { 584 lineLen = (gSortedLines[line])->len; 585 } 586 int hi = gNumFileLines-1; 587 int lo = 0; 588 int guess = -1; 589 for (;;) { 590 int newGuess = (hi + lo) / 2; 591 if (newGuess == guess) 592 break; 593 guess = newGuess; 594 if (opt_usekeys) { 595 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 596 r = strcmp((gSortedLines[line])->winSortKey, (gSortedLines[guess])->winSortKey); 597 } 598 gCount++; 599 r+=2; 600 } 601 else 602 { 603 if (opt_uselen) { 604 guessLen = (gSortedLines[guess])->len; 605 } 606 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 607 r = CompareStringW(gWinLCID, 0, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen); 608 } 609 if (r == 0) { 610 if (opt_terse == FALSE) { 611 fprintf(stderr, "Error returned from Windows CompareStringW.\n"); 612 } 613 exit(-1); 614 } 615 gCount++; 616 } 617 if (r== 2) // strings == 618 break; 619 if (r == 1) // line < guess 620 hi = guess; 621 else // line > guess 622 lo = guess; 623 } 624 } 625 } 626 elapsedTime = timeGetTime() - startTime; 627 break; 628 } 629 630 if (opt_unix) 631 { 632 unsigned long startTime = timeGetTime(); 633 int r = 0; 634 for (loops=0; loops<adj_loopCount; loops++) { 635 636 for (line=0; line < gNumFileLines; line++) { 637 int hi = gNumFileLines-1; 638 int lo = 0; 639 int guess = -1; 640 for (;;) { 641 int newGuess = (hi + lo) / 2; 642 if (newGuess == guess) 643 break; 644 guess = newGuess; 645 if (opt_usekeys) { 646 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 647 r = strcmp((gSortedLines[line])->unixSortKey, (gSortedLines[guess])->unixSortKey); 648 } 649 gCount++; 650 } 651 else 652 { 653 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 654 r = strcoll((gSortedLines[line])->unixName, (gSortedLines[guess])->unixName); 655 } 656 errno = 0; 657 if (errno != 0) { 658 fprintf(stderr, "Error %d returned from strcoll.\n", errno); 659 exit(-1); 660 } 661 gCount++; 662 } 663 if (r == 0) // strings == 664 break; 665 if (r < 0) // line < guess 666 hi = guess; 667 else // line > guess 668 lo = guess; 669 } 670 } 671 } 672 elapsedTime = timeGetTime() - startTime; 673 break; 674 } 675 break; 676 } 677 678 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 679 if (opt_terse == FALSE) { 680 printf("binary search: total # of string compares = %d\n", gCount); 681 printf("binary search: compares per loop = %d\n", gCount / loops); 682 printf("binary search: time per compare = %d ns\n", ns); 683 } else { 684 printf("%d, ", ns); 685 } 686 687 } 688 689 690 691 692 //--------------------------------------------------------------------------------------- 693 // 694 // doQSort() The quick sort timing test. Uses the C library qsort function. 695 // 696 //--------------------------------------------------------------------------------------- 697 void doQSort() { 698 int i; 699 Line **sortBuf = new Line *[gNumFileLines]; 700 701 // Adjust loop count to compensate for file size. QSort should be n log(n) 702 double dLoopCount = double(opt_loopCount) * 3000. / (log10(gNumFileLines) * double(gNumFileLines)); 703 if (opt_usekeys) dLoopCount *= 5; 704 int adj_loopCount = int(dLoopCount); 705 if (adj_loopCount < 1) adj_loopCount = 1; 706 707 708 gCount = 0; 709 unsigned long startTime = timeGetTime(); 710 if (opt_win && opt_usekeys) { 711 for (i=0; i<opt_loopCount; i++) { 712 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 713 qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpK); 714 } 715 } 716 717 else if (opt_win && opt_uselen) { 718 for (i=0; i<adj_loopCount; i++) { 719 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 720 qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpL); 721 } 722 } 723 724 725 else if (opt_win && !opt_uselen) { 726 for (i=0; i<adj_loopCount; i++) { 727 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 728 qsort(sortBuf, gNumFileLines, sizeof(Line *), Winstrcmp); 729 } 730 } 731 732 else if (opt_icu && opt_usekeys) { 733 for (i=0; i<adj_loopCount; i++) { 734 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 735 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpK); 736 } 737 } 738 739 else if (opt_icu && opt_uselen) { 740 for (i=0; i<adj_loopCount; i++) { 741 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 742 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpL); 743 } 744 } 745 746 747 else if (opt_icu && !opt_uselen) { 748 for (i=0; i<adj_loopCount; i++) { 749 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 750 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmp); 751 } 752 } 753 754 else if (opt_unix && !opt_usekeys) { 755 for (i=0; i<adj_loopCount; i++) { 756 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 757 qsort(sortBuf, gNumFileLines, sizeof(Line *), UNIXstrcmp); 758 } 759 } 760 761 unsigned long elapsedTime = timeGetTime() - startTime; 762 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 763 if (opt_terse == FALSE) { 764 printf("qsort: total # of string compares = %d\n", gCount); 765 printf("qsort: time per compare = %d ns\n", ns); 766 } else { 767 printf("%d, ", ns); 768 } 769 } 770 771 772 773 //--------------------------------------------------------------------------------------- 774 // 775 // doKeyHist() Output a table of data for 776 // average sort key size vs. string length. 777 // 778 //--------------------------------------------------------------------------------------- 779 void doKeyHist() { 780 int i; 781 int maxLen = 0; 782 783 // Find the maximum string length 784 for (i=0; i<gNumFileLines; i++) { 785 if (gFileLines[i].len > maxLen) maxLen = gFileLines[i].len; 786 } 787 788 // Allocate arrays to hold the histogram data 789 int *accumulatedLen = new int[maxLen+1]; 790 int *numKeysOfSize = new int[maxLen+1]; 791 for (i=0; i<=maxLen; i++) { 792 accumulatedLen[i] = 0; 793 numKeysOfSize[i] = 0; 794 } 795 796 // Fill the arrays... 797 for (i=0; i<gNumFileLines; i++) { 798 int len = gFileLines[i].len; 799 accumulatedLen[len] += strlen(gFileLines[i].icuSortKey); 800 numKeysOfSize[len] += 1; 801 } 802 803 // And write out averages 804 printf("String Length, Avg Key Length, Avg Key Len per char\n"); 805 for (i=1; i<=maxLen; i++) { 806 if (numKeysOfSize[i] > 0) { 807 printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysOfSize[i], 808 (float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i)); 809 } 810 } 811 } 812 813 //--------------------------------------------------------------------------------------- 814 // 815 // doForwardIterTest(UBool) Forward iteration test 816 // argument null-terminated string used 817 // 818 //--------------------------------------------------------------------------------------- 819 void doForwardIterTest(UBool haslen) { 820 int count = 0; 821 822 UErrorCode error = U_ZERO_ERROR; 823 printf("\n\nPerforming forward iteration performance test with "); 824 825 if (haslen) { 826 printf("non-null terminated data -----------\n"); 827 } 828 else { 829 printf("null terminated data -----------\n"); 830 } 831 printf("performance test on strings from file -----------\n"); 832 833 UChar dummytext[] = {0, 0}; 834 UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error); 835 ucol_setText(iter, dummytext, 1, &error); 836 837 gCount = 0; 838 unsigned long startTime = timeGetTime(); 839 while (count < opt_loopCount) { 840 int linecount = 0; 841 while (linecount < gNumFileLines) { 842 UChar *str = gFileLines[linecount].name; 843 int strlen = haslen?gFileLines[linecount].len:-1; 844 ucol_setText(iter, str, strlen, &error); 845 while (ucol_next(iter, &error) != UCOL_NULLORDER) { 846 gCount++; 847 } 848 849 linecount ++; 850 } 851 count ++; 852 } 853 unsigned long elapsedTime = timeGetTime() - startTime; 854 printf("elapsedTime %ld\n", elapsedTime); 855 856 // empty loop recalculation 857 count = 0; 858 startTime = timeGetTime(); 859 while (count < opt_loopCount) { 860 int linecount = 0; 861 while (linecount < gNumFileLines) { 862 UChar *str = gFileLines[linecount].name; 863 int strlen = haslen?gFileLines[linecount].len:-1; 864 ucol_setText(iter, str, strlen, &error); 865 linecount ++; 866 } 867 count ++; 868 } 869 elapsedTime -= (timeGetTime() - startTime); 870 printf("elapsedTime %ld\n", elapsedTime); 871 872 ucol_closeElements(iter); 873 874 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 875 printf("Total number of strings compared %d in %d loops\n", gNumFileLines, 876 opt_loopCount); 877 printf("Average time per ucol_next() nano seconds %d\n", ns); 878 879 printf("performance test on skipped-5 concatenated strings from file -----------\n"); 880 881 UChar *str; 882 int strlen = 0; 883 // appending all the strings 884 int linecount = 0; 885 while (linecount < gNumFileLines) { 886 strlen += haslen?gFileLines[linecount].len: 887 u_strlen(gFileLines[linecount].name); 888 linecount ++; 889 } 890 str = (UChar *)malloc(sizeof(UChar) * strlen); 891 int strindex = 0; 892 linecount = 0; 893 while (strindex < strlen) { 894 int len = 0; 895 len += haslen?gFileLines[linecount].len: 896 u_strlen(gFileLines[linecount].name); 897 memcpy(str + strindex, gFileLines[linecount].name, 898 sizeof(UChar) * len); 899 strindex += len; 900 linecount ++; 901 } 902 903 printf("Total size of strings %d\n", strlen); 904 905 gCount = 0; 906 count = 0; 907 908 if (!haslen) { 909 strlen = -1; 910 } 911 iter = ucol_openElements(gCol, str, strlen, &error); 912 if (!haslen) { 913 strlen = u_strlen(str); 914 } 915 strlen -= 5; // any left over characters are not iterated, 916 // this is to ensure the backwards and forwards iterators 917 // gets the same position 918 startTime = timeGetTime(); 919 while (count < opt_loopCount) { 920 int count5 = 5; 921 strindex = 0; 922 ucol_setOffset(iter, strindex, &error); 923 while (TRUE) { 924 if (ucol_next(iter, &error) == UCOL_NULLORDER) { 925 break; 926 } 927 gCount++; 928 count5 --; 929 if (count5 == 0) { 930 strindex += 10; 931 if (strindex > strlen) { 932 break; 933 } 934 ucol_setOffset(iter, strindex, &error); 935 count5 = 5; 936 } 937 } 938 count ++; 939 } 940 941 elapsedTime = timeGetTime() - startTime; 942 printf("elapsedTime %ld\n", elapsedTime); 943 944 // empty loop recalculation 945 int tempgCount = 0; 946 count = 0; 947 startTime = timeGetTime(); 948 while (count < opt_loopCount) { 949 int count5 = 5; 950 strindex = 0; 951 ucol_setOffset(iter, strindex, &error); 952 while (TRUE) { 953 tempgCount ++; 954 count5 --; 955 if (count5 == 0) { 956 strindex += 10; 957 if (strindex > strlen) { 958 break; 959 } 960 ucol_setOffset(iter, strindex, &error); 961 count5 = 5; 962 } 963 } 964 count ++; 965 } 966 elapsedTime -= (timeGetTime() - startTime); 967 printf("elapsedTime %ld\n", elapsedTime); 968 969 ucol_closeElements(iter); 970 971 printf("gCount %d\n", gCount); 972 ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 973 printf("Average time per ucol_next() nano seconds %d\n", ns); 974 } 975 976 //--------------------------------------------------------------------------------------- 977 // 978 // doBackwardIterTest(UBool) Backwards iteration test 979 // argument null-terminated string used 980 // 981 //--------------------------------------------------------------------------------------- 982 void doBackwardIterTest(UBool haslen) { 983 int count = 0; 984 UErrorCode error = U_ZERO_ERROR; 985 printf("\n\nPerforming backward iteration performance test with "); 986 987 if (haslen) { 988 printf("non-null terminated data -----------\n"); 989 } 990 else { 991 printf("null terminated data -----------\n"); 992 } 993 994 printf("performance test on strings from file -----------\n"); 995 996 UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error); 997 UChar dummytext[] = {0, 0}; 998 ucol_setText(iter, dummytext, 1, &error); 999 1000 gCount = 0; 1001 unsigned long startTime = timeGetTime(); 1002 while (count < opt_loopCount) { 1003 int linecount = 0; 1004 while (linecount < gNumFileLines) { 1005 UChar *str = gFileLines[linecount].name; 1006 int strlen = haslen?gFileLines[linecount].len:-1; 1007 ucol_setText(iter, str, strlen, &error); 1008 while (ucol_previous(iter, &error) != UCOL_NULLORDER) { 1009 gCount ++; 1010 } 1011 1012 linecount ++; 1013 } 1014 count ++; 1015 } 1016 unsigned long elapsedTime = timeGetTime() - startTime; 1017 1018 printf("elapsedTime %ld\n", elapsedTime); 1019 1020 // empty loop recalculation 1021 count = 0; 1022 startTime = timeGetTime(); 1023 while (count < opt_loopCount) { 1024 int linecount = 0; 1025 while (linecount < gNumFileLines) { 1026 UChar *str = gFileLines[linecount].name; 1027 int strlen = haslen?gFileLines[linecount].len:-1; 1028 ucol_setText(iter, str, strlen, &error); 1029 linecount ++; 1030 } 1031 count ++; 1032 } 1033 elapsedTime -= (timeGetTime() - startTime); 1034 1035 printf("elapsedTime %ld\n", elapsedTime); 1036 ucol_closeElements(iter); 1037 1038 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 1039 printf("Total number of strings compared %d in %d loops\n", gNumFileLines, 1040 opt_loopCount); 1041 printf("Average time per ucol_previous() nano seconds %d\n", ns); 1042 1043 printf("performance test on skipped-5 concatenated strings from file -----------\n"); 1044 1045 UChar *str; 1046 int strlen = 0; 1047 // appending all the strings 1048 int linecount = 0; 1049 while (linecount < gNumFileLines) { 1050 strlen += haslen?gFileLines[linecount].len: 1051 u_strlen(gFileLines[linecount].name); 1052 linecount ++; 1053 } 1054 str = (UChar *)malloc(sizeof(UChar) * strlen); 1055 int strindex = 0; 1056 linecount = 0; 1057 while (strindex < strlen) { 1058 int len = 0; 1059 len += haslen?gFileLines[linecount].len: 1060 u_strlen(gFileLines[linecount].name); 1061 memcpy(str + strindex, gFileLines[linecount].name, 1062 sizeof(UChar) * len); 1063 strindex += len; 1064 linecount ++; 1065 } 1066 1067 printf("Total size of strings %d\n", strlen); 1068 1069 gCount = 0; 1070 count = 0; 1071 1072 if (!haslen) { 1073 strlen = -1; 1074 } 1075 1076 iter = ucol_openElements(gCol, str, strlen, &error); 1077 if (!haslen) { 1078 strlen = u_strlen(str); 1079 } 1080 1081 startTime = timeGetTime(); 1082 while (count < opt_loopCount) { 1083 int count5 = 5; 1084 strindex = 5; 1085 ucol_setOffset(iter, strindex, &error); 1086 while (TRUE) { 1087 if (ucol_previous(iter, &error) == UCOL_NULLORDER) { 1088 break; 1089 } 1090 gCount ++; 1091 count5 --; 1092 if (count5 == 0) { 1093 strindex += 10; 1094 if (strindex > strlen) { 1095 break; 1096 } 1097 ucol_setOffset(iter, strindex, &error); 1098 count5 = 5; 1099 } 1100 } 1101 count ++; 1102 } 1103 1104 elapsedTime = timeGetTime() - startTime; 1105 printf("elapsedTime %ld\n", elapsedTime); 1106 1107 // empty loop recalculation 1108 count = 0; 1109 int tempgCount = 0; 1110 startTime = timeGetTime(); 1111 while (count < opt_loopCount) { 1112 int count5 = 5; 1113 strindex = 5; 1114 ucol_setOffset(iter, strindex, &error); 1115 while (TRUE) { 1116 tempgCount ++; 1117 count5 --; 1118 if (count5 == 0) { 1119 strindex += 10; 1120 if (strindex > strlen) { 1121 break; 1122 } 1123 ucol_setOffset(iter, strindex, &error); 1124 count5 = 5; 1125 } 1126 } 1127 count ++; 1128 } 1129 elapsedTime -= (timeGetTime() - startTime); 1130 printf("elapsedTime %ld\n", elapsedTime); 1131 ucol_closeElements(iter); 1132 1133 printf("gCount %d\n", gCount); 1134 ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 1135 printf("Average time per ucol_previous() nano seconds %d\n", ns); 1136 } 1137 1138 //--------------------------------------------------------------------------------------- 1139 // 1140 // doIterTest() Iteration test 1141 // 1142 //--------------------------------------------------------------------------------------- 1143 void doIterTest() { 1144 doForwardIterTest(opt_uselen); 1145 doBackwardIterTest(opt_uselen); 1146 } 1147 1148 1149 //---------------------------------------------------------------------------------------- 1150 // 1151 // UnixConvert -- Convert the lines of the file to the encoding for UNIX 1152 // Since it appears that Unicode support is going in the general 1153 // direction of the use of UTF-8 locales, that is the approach 1154 // that is used here. 1155 // 1156 //---------------------------------------------------------------------------------------- 1157 void UnixConvert() { 1158 int line; 1159 1160 UConverter *cvrtr; // An ICU code page converter. 1161 UErrorCode status = U_ZERO_ERROR; 1162 1163 1164 cvrtr = ucnv_open("utf-8", &status); // we are just doing UTF-8 locales for now. 1165 if (U_FAILURE(status)) { 1166 fprintf(stderr, "ICU Converter open failed.: %s\n", u_errorName(status)); 1167 exit(-1); 1168 } 1169 1170 for (line=0; line < gNumFileLines; line++) { 1171 int sizeNeeded = ucnv_fromUChars(cvrtr, 1172 0, // ptr to target buffer. 1173 0, // length of target buffer. 1174 gFileLines[line].name, 1175 -1, // source is null terminated 1176 &status); 1177 if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) { 1178 //fprintf(stderr, "Conversion from Unicode, something is wrong.\n"); 1179 //exit(-1); 1180 } 1181 status = U_ZERO_ERROR; 1182 gFileLines[line].unixName = new char[sizeNeeded+1]; 1183 sizeNeeded = ucnv_fromUChars(cvrtr, 1184 gFileLines[line].unixName, // ptr to target buffer. 1185 sizeNeeded+1, // length of target buffer. 1186 gFileLines[line].name, 1187 -1, // source is null terminated 1188 &status); 1189 if (U_FAILURE(status)) { 1190 fprintf(stderr, "ICU Conversion Failed.: %d\n", status); 1191 exit(-1); 1192 } 1193 gFileLines[line].unixName[sizeNeeded] = 0; 1194 }; 1195 ucnv_close(cvrtr); 1196 } 1197 1198 1199 //---------------------------------------------------------------------------------------- 1200 // 1201 // class UCharFile Class to hide all the gorp to read a file in 1202 // and produce a stream of UChars. 1203 // 1204 //---------------------------------------------------------------------------------------- 1205 class UCharFile { 1206 public: 1207 UCharFile(const char *fileName); 1208 ~UCharFile(); 1209 UChar get(); 1210 UBool eof() {return fEof;}; 1211 UBool error() {return fError;}; 1212 1213 private: 1214 UCharFile (const UCharFile & /*other*/) {}; // No copy constructor. 1215 UCharFile & operator = (const UCharFile &/*other*/) {return *this;}; // No assignment op 1216 1217 FILE *fFile; 1218 const char *fName; 1219 UBool fEof; 1220 UBool fError; 1221 UChar fPending2ndSurrogate; 1222 1223 enum {UTF16LE, UTF16BE, UTF8} fEncoding; 1224 }; 1225 1226 UCharFile::UCharFile(const char * fileName) { 1227 fEof = FALSE; 1228 fError = FALSE; 1229 fName = fileName; 1230 fFile = fopen(fName, "rb"); 1231 fPending2ndSurrogate = 0; 1232 if (fFile == NULL) { 1233 fprintf(stderr, "Can not open file \"%s\"\n", opt_fName); 1234 fError = TRUE; 1235 return; 1236 } 1237 // 1238 // Look for the byte order mark at the start of the file. 1239 // 1240 int BOMC1, BOMC2, BOMC3; 1241 BOMC1 = fgetc(fFile); 1242 BOMC2 = fgetc(fFile); 1243 1244 if (BOMC1 == 0xff && BOMC2 == 0xfe) { 1245 fEncoding = UTF16LE; } 1246 else if (BOMC1 == 0xfe && BOMC2 == 0xff) { 1247 fEncoding = UTF16BE; } 1248 else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) { 1249 fEncoding = UTF8; } 1250 else 1251 { 1252 fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and " 1253 "must include a BOM.\n", fileName); 1254 fError = true; 1255 return; 1256 } 1257 } 1258 1259 1260 UCharFile::~UCharFile() { 1261 fclose(fFile); 1262 } 1263 1264 1265 1266 UChar UCharFile::get() { 1267 UChar c; 1268 switch (fEncoding) { 1269 case UTF16LE: 1270 { 1271 int cL, cH; 1272 cL = fgetc(fFile); 1273 cH = fgetc(fFile); 1274 c = cL | (cH << 8); 1275 if (cH == EOF) { 1276 c = 0; 1277 fEof = TRUE; 1278 } 1279 break; 1280 } 1281 case UTF16BE: 1282 { 1283 int cL, cH; 1284 cH = fgetc(fFile); 1285 cL = fgetc(fFile); 1286 c = cL | (cH << 8); 1287 if (cL == EOF) { 1288 c = 0; 1289 fEof = TRUE; 1290 } 1291 break; 1292 } 1293 case UTF8: 1294 { 1295 if (fPending2ndSurrogate != 0) { 1296 c = fPending2ndSurrogate; 1297 fPending2ndSurrogate = 0; 1298 break; 1299 } 1300 1301 int ch = fgetc(fFile); // Note: c and ch are separate cause eof test doesn't work on UChar type. 1302 if (ch == EOF) { 1303 c = 0; 1304 fEof = TRUE; 1305 break; 1306 } 1307 1308 if (ch <= 0x7f) { 1309 // It's ascii. No further utf-8 conversion. 1310 c = ch; 1311 break; 1312 } 1313 1314 // Figure out the lenght of the char and read the rest of the bytes 1315 // into a temp array. 1316 int nBytes; 1317 if (ch >= 0xF0) {nBytes=4;} 1318 else if (ch >= 0xE0) {nBytes=3;} 1319 else if (ch >= 0xC0) {nBytes=2;} 1320 else { 1321 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n"); 1322 fError = TRUE; 1323 return 0; 1324 } 1325 1326 unsigned char bytes[10]; 1327 bytes[0] = (unsigned char)ch; 1328 int i; 1329 for (i=1; i<nBytes; i++) { 1330 bytes[i] = fgetc(fFile); 1331 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) { 1332 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n"); 1333 fError = TRUE; 1334 return 0; 1335 } 1336 } 1337 1338 // Convert the bytes from the temp array to a Unicode char. 1339 i = 0; 1340 uint32_t cp; 1341 UTF8_NEXT_CHAR_UNSAFE(bytes, i, cp); 1342 c = (UChar)cp; 1343 1344 if (cp >= 0x10000) { 1345 // The code point needs to be broken up into a utf-16 surrogate pair. 1346 // Process first half this time through the main loop, and 1347 // remember the other half for the next time through. 1348 UChar utf16Buf[3]; 1349 i = 0; 1350 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp); 1351 fPending2ndSurrogate = utf16Buf[1]; 1352 c = utf16Buf[0]; 1353 } 1354 break; 1355 }; 1356 default: 1357 c = 0xFFFD; /* Error, unspecified codepage*/ 1358 fprintf(stderr, "UCharFile: Error: unknown fEncoding\n"); 1359 exit(1); 1360 } 1361 return c; 1362 } 1363 1364 //---------------------------------------------------------------------------------------- 1365 // 1366 // openRulesCollator - Command line specified a rules file. Read it in 1367 // and open a collator with it. 1368 // 1369 //---------------------------------------------------------------------------------------- 1370 UCollator *openRulesCollator() { 1371 UCharFile f(opt_rules); 1372 if (f.error()) { 1373 return 0; 1374 } 1375 1376 int bufLen = 10000; 1377 UChar *buf = (UChar *)malloc(bufLen * sizeof(UChar)); 1378 int i = 0; 1379 1380 for(;;) { 1381 buf[i] = f.get(); 1382 if (f.eof()) { 1383 break; 1384 } 1385 if (f.error()) { 1386 return 0; 1387 } 1388 i++; 1389 if (i >= bufLen) { 1390 bufLen += 10000; 1391 buf = (UChar *)realloc(buf, bufLen); 1392 } 1393 } 1394 buf[i] = 0; 1395 1396 UErrorCode status = U_ZERO_ERROR; 1397 UCollator *coll = ucol_openRules(buf, u_strlen(buf), UCOL_OFF, 1398 UCOL_DEFAULT_STRENGTH, NULL, &status); 1399 if (U_FAILURE(status)) { 1400 fprintf(stderr, "ICU ucol_openRules() open failed.: %d\n", status); 1401 return 0; 1402 } 1403 free(buf); 1404 return coll; 1405 } 1406 1407 1408 1409 1410 1411 //---------------------------------------------------------------------------------------- 1412 // 1413 // Main -- process command line, read in and pre-process the test file, 1414 // call other functions to do the actual tests. 1415 // 1416 //---------------------------------------------------------------------------------------- 1417 int main(int argc, const char** argv) { 1418 if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) { 1419 printf(gUsageString); 1420 exit (1); 1421 } 1422 1423 // Make sure that we've only got one API selected. 1424 if (opt_unix || opt_win) opt_icu = FALSE; 1425 if (opt_unix) opt_win = FALSE; 1426 1427 // 1428 // Set up an ICU collator 1429 // 1430 UErrorCode status = U_ZERO_ERROR; 1431 1432 if (opt_rules != 0) { 1433 gCol = openRulesCollator(); 1434 if (gCol == 0) {return -1;} 1435 } 1436 else { 1437 gCol = ucol_open(opt_locale, &status); 1438 if (U_FAILURE(status)) { 1439 fprintf(stderr, "Collator creation failed.: %d\n", status); 1440 return -1; 1441 } 1442 } 1443 if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) { 1444 fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale); 1445 } 1446 if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) { 1447 fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale); 1448 } 1449 1450 if (opt_norm) { 1451 ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); 1452 } 1453 if (opt_french && opt_frenchoff) { 1454 fprintf(stderr, "collperf: Error, specified both -french and -frenchoff options."); 1455 exit(-1); 1456 } 1457 if (opt_french) { 1458 ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status); 1459 } 1460 if (opt_frenchoff) { 1461 ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status); 1462 } 1463 if (opt_lower) { 1464 ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status); 1465 } 1466 if (opt_upper) { 1467 ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status); 1468 } 1469 if (opt_case) { 1470 ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status); 1471 } 1472 if (opt_shifted) { 1473 ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status); 1474 } 1475 if (opt_level != 0) { 1476 switch (opt_level) { 1477 case 1: 1478 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status); 1479 break; 1480 case 2: 1481 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status); 1482 break; 1483 case 3: 1484 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status); 1485 break; 1486 case 4: 1487 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status); 1488 break; 1489 case 5: 1490 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status); 1491 break; 1492 default: 1493 fprintf(stderr, "-level param must be between 1 and 5\n"); 1494 exit(-1); 1495 } 1496 } 1497 1498 if (U_FAILURE(status)) { 1499 fprintf(stderr, "Collator attribute setting failed.: %d\n", status); 1500 return -1; 1501 } 1502 1503 1504 // 1505 // Set up a Windows LCID 1506 // 1507 if (opt_langid != 0) { 1508 gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT); 1509 } 1510 else { 1511 gWinLCID = uloc_getLCID(opt_locale); 1512 } 1513 1514 1515 // 1516 // Set the UNIX locale 1517 // 1518 if (opt_unix) { 1519 if (setlocale(LC_ALL, opt_locale) == 0) { 1520 fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale); 1521 exit(-1); 1522 } 1523 } 1524 1525 // Read in the input file. 1526 // File assumed to be utf-16. 1527 // Lines go onto heap buffers. Global index array to line starts is created. 1528 // Lines themselves are null terminated. 1529 // 1530 1531 UCharFile f(opt_fName); 1532 if (f.error()) { 1533 exit(-1); 1534 } 1535 1536 const int MAXLINES = 100000; 1537 gFileLines = new Line[MAXLINES]; 1538 UChar buf[1024]; 1539 int column = 0; 1540 1541 // Read the file, split into lines, and save in memory. 1542 // Loop runs once per utf-16 value from the input file, 1543 // (The number of bytes read from file per loop iteration depends on external encoding.) 1544 for (;;) { 1545 1546 UChar c = f.get(); 1547 if (f.error()){ 1548 exit(-1); 1549 } 1550 1551 1552 // We now have a good UTF-16 value in c. 1553 1554 // Watch for CR, LF, EOF; these finish off a line. 1555 if (c == 0xd) { 1556 continue; 1557 } 1558 1559 if (f.eof() || c == 0x0a || c==0x2028) { // Unipad inserts 2028 line separators! 1560 buf[column++] = 0; 1561 if (column > 1) { 1562 gFileLines[gNumFileLines].name = new UChar[column]; 1563 gFileLines[gNumFileLines].len = column-1; 1564 memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UChar)); 1565 gNumFileLines++; 1566 column = 0; 1567 if (gNumFileLines >= MAXLINES) { 1568 fprintf(stderr, "File too big. Max number of lines is %d\n", MAXLINES); 1569 exit(-1); 1570 } 1571 1572 } 1573 if (c == 0xa || c == 0x2028) 1574 continue; 1575 else 1576 break; // EOF 1577 } 1578 buf[column++] = c; 1579 if (column >= 1023) 1580 { 1581 static UBool warnFlag = TRUE; 1582 if (warnFlag) { 1583 fprintf(stderr, "Warning - file line longer than 1023 chars truncated.\n"); 1584 warnFlag = FALSE; 1585 } 1586 column--; 1587 } 1588 } 1589 1590 if (opt_terse == FALSE) { 1591 printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines); 1592 } 1593 1594 1595 // Convert the lines to the UNIX encoding. 1596 if (opt_unix) { 1597 UnixConvert(); 1598 } 1599 1600 // 1601 // Pre-compute ICU sort keys for the lines of the file. 1602 // 1603 int line; 1604 int32_t t; 1605 1606 for (line=0; line<gNumFileLines; line++) { 1607 t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)buf, sizeof(buf)); 1608 gFileLines[line].icuSortKey = new char[t]; 1609 1610 if (t > (int32_t)sizeof(buf)) { 1611 t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)gFileLines[line].icuSortKey , t); 1612 } 1613 else 1614 { 1615 memcpy(gFileLines[line].icuSortKey, buf, t); 1616 } 1617 } 1618 1619 1620 1621 // 1622 // Pre-compute Windows sort keys for the lines of the file. 1623 // 1624 for (line=0; line<gNumFileLines; line++) { 1625 t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf, sizeof(buf)); 1626 gFileLines[line].winSortKey = new char[t]; 1627 if (t > (int32_t)sizeof(buf)) { 1628 t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, (unsigned short *)(gFileLines[line].winSortKey), t); 1629 } 1630 else 1631 { 1632 memcpy(gFileLines[line].winSortKey, buf, t); 1633 } 1634 } 1635 1636 // 1637 // Pre-compute UNIX sort keys for the lines of the file. 1638 // 1639 if (opt_unix) { 1640 for (line=0; line<gNumFileLines; line++) { 1641 t=strxfrm((char *)buf, gFileLines[line].unixName, sizeof(buf)); 1642 gFileLines[line].unixSortKey = new char[t]; 1643 if (t > (int32_t)sizeof(buf)) { 1644 t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, sizeof(buf)); 1645 } 1646 else 1647 { 1648 memcpy(gFileLines[line].unixSortKey, buf, t); 1649 } 1650 } 1651 } 1652 1653 1654 // 1655 // Dump file lines, CEs, Sort Keys if requested. 1656 // 1657 if (opt_dump) { 1658 int i; 1659 for (line=0; line<gNumFileLines; line++) { 1660 for (i=0;;i++) { 1661 UChar c = gFileLines[line].name[i]; 1662 if (c == 0) 1663 break; 1664 if (c < 0x20 || c > 0x7e) { 1665 printf("\\u%.4x", c); 1666 } 1667 else { 1668 printf("%c", c); 1669 } 1670 } 1671 printf("\n"); 1672 1673 printf(" CEs: "); 1674 UCollationElements *CEiter = ucol_openElements(gCol, gFileLines[line].name, -1, &status); 1675 int32_t ce; 1676 i = 0; 1677 for (;;) { 1678 ce = ucol_next(CEiter, &status); 1679 if (ce == UCOL_NULLORDER) { 1680 break; 1681 } 1682 printf(" %.8x", ce); 1683 if (++i > 8) { 1684 printf("\n "); 1685 i = 0; 1686 } 1687 } 1688 printf("\n"); 1689 ucol_closeElements(CEiter); 1690 1691 1692 printf(" ICU Sort Key: "); 1693 for (i=0; ; i++) { 1694 unsigned char c = gFileLines[line].icuSortKey[i]; 1695 printf("%02x ", c); 1696 if (c == 0) { 1697 break; 1698 } 1699 if (i > 0 && i % 20 == 0) { 1700 printf("\n "); 1701 } 1702 } 1703 printf("\n"); 1704 } 1705 } 1706 1707 1708 // 1709 // Pre-sort the lines. 1710 // 1711 int i; 1712 gSortedLines = new Line *[gNumFileLines]; 1713 for (i=0; i<gNumFileLines; i++) { 1714 gSortedLines[i] = &gFileLines[i]; 1715 } 1716 1717 if (opt_win) { 1718 qsort(gSortedLines, gNumFileLines, sizeof(Line *), Winstrcmp); 1719 } 1720 else if (opt_unix) { 1721 qsort(gSortedLines, gNumFileLines, sizeof(Line *), UNIXstrcmp); 1722 } 1723 else /* ICU */ 1724 { 1725 qsort(gSortedLines, gNumFileLines, sizeof(Line *), ICUstrcmp); 1726 } 1727 1728 1729 // 1730 // Make up a randomized order, will be used for sorting tests. 1731 // 1732 gRandomLines = new Line *[gNumFileLines]; 1733 for (i=0; i<gNumFileLines; i++) { 1734 gRandomLines[i] = &gFileLines[i]; 1735 } 1736 qsort(gRandomLines, gNumFileLines, sizeof(Line *), ICURandomCmp); 1737 1738 1739 1740 1741 // 1742 // We've got the file read into memory. Go do something with it. 1743 // 1744 1745 if (opt_qsort) doQSort(); 1746 if (opt_binsearch) doBinarySearch(); 1747 if (opt_keygen) doKeyGen(); 1748 if (opt_keyhist) doKeyHist(); 1749 if (opt_itertest) doIterTest(); 1750 1751 return 0; 1752 1753 } 1754