1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (C) 2001-2010 IBM, Inc. All Rights Reserved. 4 * 5 ********************************************************************/ 6 /******************************************************************************** 7 * 8 * File CALLCOLL.C 9 * 10 * Modification History: 11 * Name Description 12 * Andy Heninger First Version 13 * 14 ********************************************************************************* 15 */ 16 17 // 18 // This program tests string collation and sort key generation performance. 19 // Three APIs can be teste: ICU C , Unix strcoll, strxfrm and Windows LCMapString 20 // A file of names is required as input, one per line. It must be in utf-8 or utf-16 format, 21 // and include a byte order mark. Either LE or BE format is OK. 22 // 23 24 const char gUsageString[] = 25 "usage: collperf options...\n" 26 "-help Display this message.\n" 27 "-file file_name utf-16 format file of names.\n" 28 "-locale name ICU locale to use. Default is en_US\n" 29 "-rules file_name Collation rules file (overrides locale)\n" 30 "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n" 31 " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n" 32 "-win Run test using Windows native services. (ICU is default)\n" 33 "-unix Run test using Unix strxfrm, strcoll services.\n" 34 "-uselen Use API with string lengths. Default is null-terminated strings\n" 35 "-usekeys Run tests using sortkeys rather than strcoll\n" 36 "-strcmp Run tests using u_strcmp rather than strcoll\n" 37 "-strcmpCPO Run tests using u_strcmpCodePointOrder rather than strcoll\n" 38 "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n" 39 "-iloop n Inner Loop Count. Default = 1. Number of calls to function\n" 40 " under test at each call point. For measuring test overhead.\n" 41 "-terse Terse numbers-only output. Intended for use by scripts.\n" 42 "-french French accent ordering\n" 43 "-frenchoff No French accent ordering (for use with French locales.)\n" 44 "-norm Normalizing mode on\n" 45 "-shifted Shifted mode\n" 46 "-lower Lower case first\n" 47 "-upper Upper case first\n" 48 "-case Enable separate case level\n" 49 "-level n Sort level, 1 to 5, for Primary, Secndary, Tertiary, Quaternary, Identical\n" 50 "-keyhist Produce a table sort key size vs. string length\n" 51 "-binsearch Binary Search timing test\n" 52 "-keygen Sort Key Generation timing test\n" 53 "-qsort Quicksort timing test\n" 54 "-iter Iteration Performance Test\n" 55 "-dump Display strings, sort keys and CEs.\n" 56 ; 57 58 59 60 #include <stdio.h> 61 #include <string.h> 62 #include <stdlib.h> 63 #include <math.h> 64 #include <locale.h> 65 #include <errno.h> 66 67 #include <unicode/utypes.h> 68 #include <unicode/ucol.h> 69 #include <unicode/ucoleitr.h> 70 #include <unicode/uloc.h> 71 #include <unicode/ustring.h> 72 #include <unicode/ures.h> 73 #include <unicode/uchar.h> 74 #include <unicode/ucnv.h> 75 #include <unicode/utf8.h> 76 77 #ifdef WIN32 78 #include <windows.h> 79 #else 80 // 81 // Stubs for Windows API functions when building on UNIXes. 82 // 83 typedef int DWORD; 84 inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;} 85 #include <sys/time.h> 86 unsigned long timeGetTime() { 87 struct timeval t; 88 gettimeofday(&t, 0); 89 unsigned long val = t.tv_sec * 1000; // Let it overflow. Who cares. 90 val += t.tv_usec / 1000; 91 return val; 92 } 93 inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;} 94 const int LCMAP_SORTKEY = 0; 95 #define MAKELCID(a,b) 0 96 const int SORT_DEFAULT = 0; 97 #endif 98 99 100 101 // 102 // Command line option variables 103 // These global variables are set according to the options specified 104 // on the command line by the user. 105 char * opt_fName = 0; 106 const char * opt_locale = "en_US"; 107 int opt_langid = 0; // Defaults to value corresponding to opt_locale. 108 char * opt_rules = 0; 109 UBool opt_help = FALSE; 110 int opt_loopCount = 1; 111 int opt_iLoopCount = 1; 112 UBool opt_terse = FALSE; 113 UBool opt_qsort = FALSE; 114 UBool opt_binsearch = FALSE; 115 UBool opt_icu = TRUE; 116 UBool opt_win = FALSE; // Run with Windows native functions. 117 UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions. 118 UBool opt_uselen = FALSE; 119 UBool opt_usekeys = FALSE; 120 UBool opt_strcmp = FALSE; 121 UBool opt_strcmpCPO = FALSE; 122 UBool opt_norm = FALSE; 123 UBool opt_keygen = FALSE; 124 UBool opt_french = FALSE; 125 UBool opt_frenchoff = FALSE; 126 UBool opt_shifted = FALSE; 127 UBool opt_lower = FALSE; 128 UBool opt_upper = FALSE; 129 UBool opt_case = FALSE; 130 int opt_level = 0; 131 UBool opt_keyhist = FALSE; 132 UBool opt_itertest = FALSE; 133 UBool opt_dump = FALSE; 134 135 136 137 // 138 // Definitions for the command line options 139 // 140 struct OptSpec { 141 const char *name; 142 enum {FLAG, NUM, STRING} type; 143 void *pVar; 144 }; 145 146 OptSpec opts[] = { 147 {"-file", OptSpec::STRING, &opt_fName}, 148 {"-locale", OptSpec::STRING, &opt_locale}, 149 {"-langid", OptSpec::NUM, &opt_langid}, 150 {"-rules", OptSpec::STRING, &opt_rules}, 151 {"-qsort", OptSpec::FLAG, &opt_qsort}, 152 {"-binsearch", OptSpec::FLAG, &opt_binsearch}, 153 {"-iter", OptSpec::FLAG, &opt_itertest}, 154 {"-win", OptSpec::FLAG, &opt_win}, 155 {"-unix", OptSpec::FLAG, &opt_unix}, 156 {"-uselen", OptSpec::FLAG, &opt_uselen}, 157 {"-usekeys", OptSpec::FLAG, &opt_usekeys}, 158 {"-strcmp", OptSpec::FLAG, &opt_strcmp}, 159 {"-strcmpCPO", OptSpec::FLAG, &opt_strcmpCPO}, 160 {"-norm", OptSpec::FLAG, &opt_norm}, 161 {"-french", OptSpec::FLAG, &opt_french}, 162 {"-frenchoff", OptSpec::FLAG, &opt_frenchoff}, 163 {"-shifted", OptSpec::FLAG, &opt_shifted}, 164 {"-lower", OptSpec::FLAG, &opt_lower}, 165 {"-upper", OptSpec::FLAG, &opt_upper}, 166 {"-case", OptSpec::FLAG, &opt_case}, 167 {"-level", OptSpec::NUM, &opt_level}, 168 {"-keyhist", OptSpec::FLAG, &opt_keyhist}, 169 {"-keygen", OptSpec::FLAG, &opt_keygen}, 170 {"-loop", OptSpec::NUM, &opt_loopCount}, 171 {"-iloop", OptSpec::NUM, &opt_iLoopCount}, 172 {"-terse", OptSpec::FLAG, &opt_terse}, 173 {"-dump", OptSpec::FLAG, &opt_dump}, 174 {"-help", OptSpec::FLAG, &opt_help}, 175 {"-?", OptSpec::FLAG, &opt_help}, 176 {0, OptSpec::FLAG, 0} 177 }; 178 179 180 //--------------------------------------------------------------------------- 181 // 182 // Global variables pointing to and describing the test file 183 // 184 //--------------------------------------------------------------------------- 185 186 // 187 // struct Line 188 // 189 // Each line from the source file (containing a name, presumably) gets 190 // one of these structs. 191 // 192 struct Line { 193 UChar *name; 194 int len; 195 char *winSortKey; 196 char *icuSortKey; 197 char *unixSortKey; 198 char *unixName; 199 }; 200 201 202 203 Line *gFileLines; // Ptr to array of Line structs, one per line in the file. 204 int gNumFileLines; 205 UCollator *gCol; 206 DWORD gWinLCID; 207 208 Line **gSortedLines; 209 Line **gRandomLines; 210 int gCount; 211 212 213 214 //--------------------------------------------------------------------------- 215 // 216 // ProcessOptions() Function to read the command line options. 217 // 218 //--------------------------------------------------------------------------- 219 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[]) 220 { 221 int i; 222 int argNum; 223 const char *pArgName; 224 OptSpec *pOpt; 225 226 for (argNum=1; argNum<argc; argNum++) { 227 pArgName = argv[argNum]; 228 for (pOpt = opts; pOpt->name != 0; pOpt++) { 229 if (strcmp(pOpt->name, pArgName) == 0) { 230 switch (pOpt->type) { 231 case OptSpec::FLAG: 232 *(UBool *)(pOpt->pVar) = TRUE; 233 break; 234 case OptSpec::STRING: 235 argNum ++; 236 if (argNum >= argc) { 237 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); 238 return FALSE; 239 } 240 *(const char **)(pOpt->pVar) = argv[argNum]; 241 break; 242 case OptSpec::NUM: 243 argNum ++; 244 if (argNum >= argc) { 245 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); 246 return FALSE; 247 } 248 char *endp; 249 i = strtol(argv[argNum], &endp, 0); 250 if (endp == argv[argNum]) { 251 fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name); 252 return FALSE; 253 } 254 *(int *)(pOpt->pVar) = i; 255 } 256 break; 257 } 258 } 259 if (pOpt->name == 0) 260 { 261 fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName); 262 return FALSE; 263 } 264 } 265 return TRUE; 266 } 267 268 //--------------------------------------------------------------------------------------- 269 // 270 // Comparison functions for use by qsort. 271 // 272 // Six flavors, ICU or Windows, SortKey or String Compare, Strings with length 273 // or null terminated. 274 // 275 //--------------------------------------------------------------------------------------- 276 int ICUstrcmpK(const void *a, const void *b) { 277 gCount++; 278 int t = strcmp((*(Line **)a)->icuSortKey, (*(Line **)b)->icuSortKey); 279 return t; 280 } 281 282 283 int ICUstrcmpL(const void *a, const void *b) { 284 gCount++; 285 UCollationResult t; 286 t = ucol_strcoll(gCol, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len); 287 if (t == UCOL_LESS) return -1; 288 if (t == UCOL_GREATER) return +1; 289 return 0; 290 } 291 292 293 int ICUstrcmp(const void *a, const void *b) { 294 gCount++; 295 UCollationResult t; 296 t = ucol_strcoll(gCol, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1); 297 if (t == UCOL_LESS) return -1; 298 if (t == UCOL_GREATER) return +1; 299 return 0; 300 } 301 302 303 int Winstrcmp(const void *a, const void *b) { 304 gCount++; 305 int t; 306 t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1); 307 return t-2; 308 } 309 310 311 int UNIXstrcmp(const void *a, const void *b) { 312 gCount++; 313 int t; 314 t = strcoll((*(Line **)a)->unixName, (*(Line **)b)->unixName); 315 return t; 316 } 317 318 319 int WinstrcmpL(const void *a, const void *b) { 320 gCount++; 321 int t; 322 t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len); 323 return t-2; 324 } 325 326 327 int WinstrcmpK(const void *a, const void *b) { 328 gCount++; 329 int t = strcmp((*(Line **)a)->winSortKey, (*(Line **)b)->winSortKey); 330 return t; 331 } 332 333 334 //--------------------------------------------------------------------------------------- 335 // 336 // Function for sorting the names (lines) into a random order. 337 // Order is based on a hash of the ICU Sort key for the lines 338 // The randomized order is used as input for the sorting timing tests. 339 // 340 //--------------------------------------------------------------------------------------- 341 int ICURandomCmp(const void *a, const void *b) { 342 char *ask = (*(Line **)a)->icuSortKey; 343 char *bsk = (*(Line **)b)->icuSortKey; 344 int aVal = 0; 345 int bVal = 0; 346 int retVal; 347 while (*ask != 0) { 348 aVal += aVal*37 + *ask++; 349 } 350 while (*bsk != 0) { 351 bVal += bVal*37 + *bsk++; 352 } 353 retVal = -1; 354 if (aVal == bVal) { 355 retVal = 0; 356 } 357 else if (aVal > bVal) { 358 retVal = 1; 359 } 360 return retVal; 361 } 362 363 //--------------------------------------------------------------------------------------- 364 // 365 // doKeyGen() Key Generation Timing Test 366 // 367 //--------------------------------------------------------------------------------------- 368 void doKeyGen() 369 { 370 int line; 371 int loops = 0; 372 int iLoop; 373 int t; 374 int len=-1; 375 376 // Adjust loop count to compensate for file size. Should be order n 377 double dLoopCount = double(opt_loopCount) * (1000. / double(gNumFileLines)); 378 int adj_loopCount = int(dLoopCount); 379 if (adj_loopCount < 1) adj_loopCount = 1; 380 381 382 unsigned long startTime = timeGetTime(); 383 384 if (opt_win) { 385 for (loops=0; loops<adj_loopCount; loops++) { 386 for (line=0; line < gNumFileLines; line++) { 387 if (opt_uselen) { 388 len = gFileLines[line].len; 389 } 390 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 391 t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, 392 gFileLines[line].name, len, 393 (unsigned short *)gFileLines[line].winSortKey, 5000); // TODO something with length. 394 } 395 } 396 } 397 } 398 else if (opt_icu) 399 { 400 for (loops=0; loops<adj_loopCount; loops++) { 401 for (line=0; line < gNumFileLines; line++) { 402 if (opt_uselen) { 403 len = gFileLines[line].len; 404 } 405 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 406 t = ucol_getSortKey(gCol, gFileLines[line].name, len, (unsigned char *)gFileLines[line].icuSortKey, 5000); 407 } 408 } 409 } 410 } 411 else if (opt_unix) 412 { 413 for (loops=0; loops<adj_loopCount; loops++) { 414 for (line=0; line < gNumFileLines; line++) { 415 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 416 t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, 5000); 417 } 418 } 419 } 420 } 421 422 unsigned long elapsedTime = timeGetTime() - startTime; 423 int ns = (int)(float(1000000) * (float)elapsedTime / (float)(adj_loopCount*gNumFileLines)); 424 425 if (opt_terse == FALSE) { 426 printf("Sort Key Generation: total # of keys = %d\n", loops*gNumFileLines); 427 printf("Sort Key Generation: time per key = %d ns\n", ns); 428 } 429 else { 430 printf("%d, ", ns); 431 } 432 433 int totalKeyLen = 0; 434 int totalChars = 0; 435 for (line=0; line<gNumFileLines; line++) { 436 totalChars += u_strlen(gFileLines[line].name); 437 if (opt_win) { 438 totalKeyLen += strlen(gFileLines[line].winSortKey); 439 } 440 else if (opt_icu) { 441 totalKeyLen += strlen(gFileLines[line].icuSortKey); 442 } 443 else if (opt_unix) { 444 totalKeyLen += strlen(gFileLines[line].unixSortKey); 445 } 446 447 } 448 if (opt_terse == FALSE) { 449 printf("Key Length / character = %f\n", (float)totalKeyLen / (float)totalChars); 450 } else { 451 printf("%f, ", (float)totalKeyLen / (float)totalChars); 452 } 453 } 454 455 456 457 //--------------------------------------------------------------------------------------- 458 // 459 // doBinarySearch() Binary Search timing test. Each name from the list 460 // is looked up in the full sorted list of names. 461 // 462 //--------------------------------------------------------------------------------------- 463 void doBinarySearch() 464 { 465 466 gCount = 0; 467 int line; 468 int loops = 0; 469 int iLoop = 0; 470 unsigned long elapsedTime = 0; 471 472 // Adjust loop count to compensate for file size. Should be order n (lookups) * log n (compares/lookup) 473 // Accurate timings do not depend on this being perfect. The correction is just to try to 474 // get total running times of about the right order, so the that user doesn't need to 475 // manually adjust the loop count for every different file size. 476 double dLoopCount = double(opt_loopCount) * 3000. / (log10(gNumFileLines) * double(gNumFileLines)); 477 if (opt_usekeys) dLoopCount *= 5; 478 int adj_loopCount = int(dLoopCount); 479 if (adj_loopCount < 1) adj_loopCount = 1; 480 481 482 for (;;) { // not really a loop, just allows "break" to work, to simplify 483 // inadvertantly running more than one test through here. 484 if (opt_strcmp || opt_strcmpCPO) 485 { 486 unsigned long startTime = timeGetTime(); 487 typedef int32_t (U_EXPORT2 *PF)(const UChar *, const UChar *); 488 PF pf = u_strcmp; 489 if (opt_strcmpCPO) {pf = u_strcmpCodePointOrder;} 490 //if (opt_strcmp && opt_win) {pf = (PF)wcscmp;} // Damn the difference between int32_t and int 491 // which forces the use of a cast here. 492 493 int r = 0; 494 for (loops=0; loops<adj_loopCount; loops++) { 495 496 for (line=0; line < gNumFileLines; line++) { 497 int hi = gNumFileLines-1; 498 int lo = 0; 499 int guess = -1; 500 for (;;) { 501 int newGuess = (hi + lo) / 2; 502 if (newGuess == guess) 503 break; 504 guess = newGuess; 505 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 506 r = (*pf)((gSortedLines[line])->name, (gSortedLines[guess])->name); 507 } 508 gCount++; 509 if (r== 0) 510 break; 511 if (r < 0) 512 hi = guess; 513 else 514 lo = guess; 515 } 516 } 517 } 518 elapsedTime = timeGetTime() - startTime; 519 break; 520 } 521 522 523 if (opt_icu) 524 { 525 unsigned long startTime = timeGetTime(); 526 UCollationResult r = UCOL_EQUAL; 527 for (loops=0; loops<adj_loopCount; loops++) { 528 529 for (line=0; line < gNumFileLines; line++) { 530 int lineLen = -1; 531 int guessLen = -1; 532 if (opt_uselen) { 533 lineLen = (gSortedLines[line])->len; 534 } 535 int hi = gNumFileLines-1; 536 int lo = 0; 537 int guess = -1; 538 for (;;) { 539 int newGuess = (hi + lo) / 2; 540 if (newGuess == guess) 541 break; 542 guess = newGuess; 543 int ri = 0; 544 if (opt_usekeys) { 545 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 546 ri = strcmp((gSortedLines[line])->icuSortKey, (gSortedLines[guess])->icuSortKey); 547 } 548 gCount++; 549 r=UCOL_GREATER; if(ri<0) {r=UCOL_LESS;} else if (ri==0) {r=UCOL_EQUAL;} 550 } 551 else 552 { 553 if (opt_uselen) { 554 guessLen = (gSortedLines[guess])->len; 555 } 556 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 557 r = ucol_strcoll(gCol, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen); 558 } 559 gCount++; 560 } 561 if (r== UCOL_EQUAL) 562 break; 563 if (r == UCOL_LESS) 564 hi = guess; 565 else 566 lo = guess; 567 } 568 } 569 } 570 elapsedTime = timeGetTime() - startTime; 571 break; 572 } 573 574 if (opt_win) 575 { 576 unsigned long startTime = timeGetTime(); 577 int r = 0; 578 for (loops=0; loops<adj_loopCount; loops++) { 579 580 for (line=0; line < gNumFileLines; line++) { 581 int lineLen = -1; 582 int guessLen = -1; 583 if (opt_uselen) { 584 lineLen = (gSortedLines[line])->len; 585 } 586 int hi = gNumFileLines-1; 587 int lo = 0; 588 int guess = -1; 589 for (;;) { 590 int newGuess = (hi + lo) / 2; 591 if (newGuess == guess) 592 break; 593 guess = newGuess; 594 if (opt_usekeys) { 595 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 596 r = strcmp((gSortedLines[line])->winSortKey, (gSortedLines[guess])->winSortKey); 597 } 598 gCount++; 599 r+=2; 600 } 601 else 602 { 603 if (opt_uselen) { 604 guessLen = (gSortedLines[guess])->len; 605 } 606 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 607 r = CompareStringW(gWinLCID, 0, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen); 608 } 609 if (r == 0) { 610 if (opt_terse == FALSE) { 611 fprintf(stderr, "Error returned from Windows CompareStringW.\n"); 612 } 613 exit(-1); 614 } 615 gCount++; 616 } 617 if (r== 2) // strings == 618 break; 619 if (r == 1) // line < guess 620 hi = guess; 621 else // line > guess 622 lo = guess; 623 } 624 } 625 } 626 elapsedTime = timeGetTime() - startTime; 627 break; 628 } 629 630 if (opt_unix) 631 { 632 unsigned long startTime = timeGetTime(); 633 int r = 0; 634 for (loops=0; loops<adj_loopCount; loops++) { 635 636 for (line=0; line < gNumFileLines; line++) { 637 int hi = gNumFileLines-1; 638 int lo = 0; 639 int guess = -1; 640 for (;;) { 641 int newGuess = (hi + lo) / 2; 642 if (newGuess == guess) 643 break; 644 guess = newGuess; 645 if (opt_usekeys) { 646 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 647 r = strcmp((gSortedLines[line])->unixSortKey, (gSortedLines[guess])->unixSortKey); 648 } 649 gCount++; 650 } 651 else 652 { 653 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 654 r = strcoll((gSortedLines[line])->unixName, (gSortedLines[guess])->unixName); 655 } 656 errno = 0; 657 if (errno != 0) { 658 fprintf(stderr, "Error %d returned from strcoll.\n", errno); 659 exit(-1); 660 } 661 gCount++; 662 } 663 if (r == 0) // strings == 664 break; 665 if (r < 0) // line < guess 666 hi = guess; 667 else // line > guess 668 lo = guess; 669 } 670 } 671 } 672 elapsedTime = timeGetTime() - startTime; 673 break; 674 } 675 break; 676 } 677 678 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 679 if (opt_terse == FALSE) { 680 printf("binary search: total # of string compares = %d\n", gCount); 681 printf("binary search: compares per loop = %d\n", gCount / loops); 682 printf("binary search: time per compare = %d ns\n", ns); 683 } else { 684 printf("%d, ", ns); 685 } 686 687 } 688 689 690 691 692 //--------------------------------------------------------------------------------------- 693 // 694 // doQSort() The quick sort timing test. Uses the C library qsort function. 695 // 696 //--------------------------------------------------------------------------------------- 697 void doQSort() { 698 int i; 699 Line **sortBuf = new Line *[gNumFileLines]; 700 701 // Adjust loop count to compensate for file size. QSort should be n log(n) 702 double dLoopCount = double(opt_loopCount) * 3000. / (log10(gNumFileLines) * double(gNumFileLines)); 703 if (opt_usekeys) dLoopCount *= 5; 704 int adj_loopCount = int(dLoopCount); 705 if (adj_loopCount < 1) adj_loopCount = 1; 706 707 708 gCount = 0; 709 unsigned long startTime = timeGetTime(); 710 if (opt_win && opt_usekeys) { 711 for (i=0; i<opt_loopCount; i++) { 712 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 713 qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpK); 714 } 715 } 716 717 else if (opt_win && opt_uselen) { 718 for (i=0; i<adj_loopCount; i++) { 719 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 720 qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpL); 721 } 722 } 723 724 725 else if (opt_win && !opt_uselen) { 726 for (i=0; i<adj_loopCount; i++) { 727 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 728 qsort(sortBuf, gNumFileLines, sizeof(Line *), Winstrcmp); 729 } 730 } 731 732 else if (opt_icu && opt_usekeys) { 733 for (i=0; i<adj_loopCount; i++) { 734 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 735 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpK); 736 } 737 } 738 739 else if (opt_icu && opt_uselen) { 740 for (i=0; i<adj_loopCount; i++) { 741 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 742 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpL); 743 } 744 } 745 746 747 else if (opt_icu && !opt_uselen) { 748 for (i=0; i<adj_loopCount; i++) { 749 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 750 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmp); 751 } 752 } 753 754 else if (opt_unix && !opt_usekeys) { 755 for (i=0; i<adj_loopCount; i++) { 756 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 757 qsort(sortBuf, gNumFileLines, sizeof(Line *), UNIXstrcmp); 758 } 759 } 760 761 unsigned long elapsedTime = timeGetTime() - startTime; 762 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 763 if (opt_terse == FALSE) { 764 printf("qsort: total # of string compares = %d\n", gCount); 765 printf("qsort: time per compare = %d ns\n", ns); 766 } else { 767 printf("%d, ", ns); 768 } 769 } 770 771 772 773 //--------------------------------------------------------------------------------------- 774 // 775 // doKeyHist() Output a table of data for 776 // average sort key size vs. string length. 777 // 778 //--------------------------------------------------------------------------------------- 779 void doKeyHist() { 780 int i; 781 int maxLen = 0; 782 783 // Find the maximum string length 784 for (i=0; i<gNumFileLines; i++) { 785 if (gFileLines[i].len > maxLen) maxLen = gFileLines[i].len; 786 } 787 788 // Allocate arrays to hold the histogram data 789 int *accumulatedLen = new int[maxLen+1]; 790 int *numKeysOfSize = new int[maxLen+1]; 791 for (i=0; i<=maxLen; i++) { 792 accumulatedLen[i] = 0; 793 numKeysOfSize[i] = 0; 794 } 795 796 // Fill the arrays... 797 for (i=0; i<gNumFileLines; i++) { 798 int len = gFileLines[i].len; 799 accumulatedLen[len] += strlen(gFileLines[i].icuSortKey); 800 numKeysOfSize[len] += 1; 801 } 802 803 // And write out averages 804 printf("String Length, Avg Key Length, Avg Key Len per char\n"); 805 for (i=1; i<=maxLen; i++) { 806 if (numKeysOfSize[i] > 0) { 807 printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysOfSize[i], 808 (float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i)); 809 } 810 } 811 delete []accumulatedLen; 812 delete []numKeysOfSize ; 813 } 814 815 //--------------------------------------------------------------------------------------- 816 // 817 // doForwardIterTest(UBool) Forward iteration test 818 // argument null-terminated string used 819 // 820 //--------------------------------------------------------------------------------------- 821 void doForwardIterTest(UBool haslen) { 822 int count = 0; 823 824 UErrorCode error = U_ZERO_ERROR; 825 printf("\n\nPerforming forward iteration performance test with "); 826 827 if (haslen) { 828 printf("non-null terminated data -----------\n"); 829 } 830 else { 831 printf("null terminated data -----------\n"); 832 } 833 printf("performance test on strings from file -----------\n"); 834 835 UChar dummytext[] = {0, 0}; 836 UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error); 837 ucol_setText(iter, dummytext, 1, &error); 838 839 gCount = 0; 840 unsigned long startTime = timeGetTime(); 841 while (count < opt_loopCount) { 842 int linecount = 0; 843 while (linecount < gNumFileLines) { 844 UChar *str = gFileLines[linecount].name; 845 int strlen = haslen?gFileLines[linecount].len:-1; 846 ucol_setText(iter, str, strlen, &error); 847 while (ucol_next(iter, &error) != UCOL_NULLORDER) { 848 gCount++; 849 } 850 851 linecount ++; 852 } 853 count ++; 854 } 855 unsigned long elapsedTime = timeGetTime() - startTime; 856 printf("elapsedTime %ld\n", elapsedTime); 857 858 // empty loop recalculation 859 count = 0; 860 startTime = timeGetTime(); 861 while (count < opt_loopCount) { 862 int linecount = 0; 863 while (linecount < gNumFileLines) { 864 UChar *str = gFileLines[linecount].name; 865 int strlen = haslen?gFileLines[linecount].len:-1; 866 ucol_setText(iter, str, strlen, &error); 867 linecount ++; 868 } 869 count ++; 870 } 871 elapsedTime -= (timeGetTime() - startTime); 872 printf("elapsedTime %ld\n", elapsedTime); 873 874 ucol_closeElements(iter); 875 876 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 877 printf("Total number of strings compared %d in %d loops\n", gNumFileLines, 878 opt_loopCount); 879 printf("Average time per ucol_next() nano seconds %d\n", ns); 880 881 printf("performance test on skipped-5 concatenated strings from file -----------\n"); 882 883 UChar *str; 884 int strlen = 0; 885 // appending all the strings 886 int linecount = 0; 887 while (linecount < gNumFileLines) { 888 strlen += haslen?gFileLines[linecount].len: 889 u_strlen(gFileLines[linecount].name); 890 linecount ++; 891 } 892 str = (UChar *)malloc(sizeof(UChar) * strlen); 893 int strindex = 0; 894 linecount = 0; 895 while (strindex < strlen) { 896 int len = 0; 897 len += haslen?gFileLines[linecount].len: 898 u_strlen(gFileLines[linecount].name); 899 memcpy(str + strindex, gFileLines[linecount].name, 900 sizeof(UChar) * len); 901 strindex += len; 902 linecount ++; 903 } 904 905 printf("Total size of strings %d\n", strlen); 906 907 gCount = 0; 908 count = 0; 909 910 if (!haslen) { 911 strlen = -1; 912 } 913 iter = ucol_openElements(gCol, str, strlen, &error); 914 if (!haslen) { 915 strlen = u_strlen(str); 916 } 917 strlen -= 5; // any left over characters are not iterated, 918 // this is to ensure the backwards and forwards iterators 919 // gets the same position 920 startTime = timeGetTime(); 921 while (count < opt_loopCount) { 922 int count5 = 5; 923 strindex = 0; 924 ucol_setOffset(iter, strindex, &error); 925 while (TRUE) { 926 if (ucol_next(iter, &error) == UCOL_NULLORDER) { 927 break; 928 } 929 gCount++; 930 count5 --; 931 if (count5 == 0) { 932 strindex += 10; 933 if (strindex > strlen) { 934 break; 935 } 936 ucol_setOffset(iter, strindex, &error); 937 count5 = 5; 938 } 939 } 940 count ++; 941 } 942 943 elapsedTime = timeGetTime() - startTime; 944 printf("elapsedTime %ld\n", elapsedTime); 945 946 // empty loop recalculation 947 int tempgCount = 0; 948 count = 0; 949 startTime = timeGetTime(); 950 while (count < opt_loopCount) { 951 int count5 = 5; 952 strindex = 0; 953 ucol_setOffset(iter, strindex, &error); 954 while (TRUE) { 955 tempgCount ++; 956 count5 --; 957 if (count5 == 0) { 958 strindex += 10; 959 if (strindex > strlen) { 960 break; 961 } 962 ucol_setOffset(iter, strindex, &error); 963 count5 = 5; 964 } 965 } 966 count ++; 967 } 968 elapsedTime -= (timeGetTime() - startTime); 969 printf("elapsedTime %ld\n", elapsedTime); 970 971 ucol_closeElements(iter); 972 973 printf("gCount %d\n", gCount); 974 ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 975 printf("Average time per ucol_next() nano seconds %d\n", ns); 976 } 977 978 //--------------------------------------------------------------------------------------- 979 // 980 // doBackwardIterTest(UBool) Backwards iteration test 981 // argument null-terminated string used 982 // 983 //--------------------------------------------------------------------------------------- 984 void doBackwardIterTest(UBool haslen) { 985 int count = 0; 986 UErrorCode error = U_ZERO_ERROR; 987 printf("\n\nPerforming backward iteration performance test with "); 988 989 if (haslen) { 990 printf("non-null terminated data -----------\n"); 991 } 992 else { 993 printf("null terminated data -----------\n"); 994 } 995 996 printf("performance test on strings from file -----------\n"); 997 998 UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error); 999 UChar dummytext[] = {0, 0}; 1000 ucol_setText(iter, dummytext, 1, &error); 1001 1002 gCount = 0; 1003 unsigned long startTime = timeGetTime(); 1004 while (count < opt_loopCount) { 1005 int linecount = 0; 1006 while (linecount < gNumFileLines) { 1007 UChar *str = gFileLines[linecount].name; 1008 int strlen = haslen?gFileLines[linecount].len:-1; 1009 ucol_setText(iter, str, strlen, &error); 1010 while (ucol_previous(iter, &error) != UCOL_NULLORDER) { 1011 gCount ++; 1012 } 1013 1014 linecount ++; 1015 } 1016 count ++; 1017 } 1018 unsigned long elapsedTime = timeGetTime() - startTime; 1019 1020 printf("elapsedTime %ld\n", elapsedTime); 1021 1022 // empty loop recalculation 1023 count = 0; 1024 startTime = timeGetTime(); 1025 while (count < opt_loopCount) { 1026 int linecount = 0; 1027 while (linecount < gNumFileLines) { 1028 UChar *str = gFileLines[linecount].name; 1029 int strlen = haslen?gFileLines[linecount].len:-1; 1030 ucol_setText(iter, str, strlen, &error); 1031 linecount ++; 1032 } 1033 count ++; 1034 } 1035 elapsedTime -= (timeGetTime() - startTime); 1036 1037 printf("elapsedTime %ld\n", elapsedTime); 1038 ucol_closeElements(iter); 1039 1040 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 1041 printf("Total number of strings compared %d in %d loops\n", gNumFileLines, 1042 opt_loopCount); 1043 printf("Average time per ucol_previous() nano seconds %d\n", ns); 1044 1045 printf("performance test on skipped-5 concatenated strings from file -----------\n"); 1046 1047 UChar *str; 1048 int strlen = 0; 1049 // appending all the strings 1050 int linecount = 0; 1051 while (linecount < gNumFileLines) { 1052 strlen += haslen?gFileLines[linecount].len: 1053 u_strlen(gFileLines[linecount].name); 1054 linecount ++; 1055 } 1056 str = (UChar *)malloc(sizeof(UChar) * strlen); 1057 int strindex = 0; 1058 linecount = 0; 1059 while (strindex < strlen) { 1060 int len = 0; 1061 len += haslen?gFileLines[linecount].len: 1062 u_strlen(gFileLines[linecount].name); 1063 memcpy(str + strindex, gFileLines[linecount].name, 1064 sizeof(UChar) * len); 1065 strindex += len; 1066 linecount ++; 1067 } 1068 1069 printf("Total size of strings %d\n", strlen); 1070 1071 gCount = 0; 1072 count = 0; 1073 1074 if (!haslen) { 1075 strlen = -1; 1076 } 1077 1078 iter = ucol_openElements(gCol, str, strlen, &error); 1079 if (!haslen) { 1080 strlen = u_strlen(str); 1081 } 1082 1083 startTime = timeGetTime(); 1084 while (count < opt_loopCount) { 1085 int count5 = 5; 1086 strindex = 5; 1087 ucol_setOffset(iter, strindex, &error); 1088 while (TRUE) { 1089 if (ucol_previous(iter, &error) == UCOL_NULLORDER) { 1090 break; 1091 } 1092 gCount ++; 1093 count5 --; 1094 if (count5 == 0) { 1095 strindex += 10; 1096 if (strindex > strlen) { 1097 break; 1098 } 1099 ucol_setOffset(iter, strindex, &error); 1100 count5 = 5; 1101 } 1102 } 1103 count ++; 1104 } 1105 1106 elapsedTime = timeGetTime() - startTime; 1107 printf("elapsedTime %ld\n", elapsedTime); 1108 1109 // empty loop recalculation 1110 count = 0; 1111 int tempgCount = 0; 1112 startTime = timeGetTime(); 1113 while (count < opt_loopCount) { 1114 int count5 = 5; 1115 strindex = 5; 1116 ucol_setOffset(iter, strindex, &error); 1117 while (TRUE) { 1118 tempgCount ++; 1119 count5 --; 1120 if (count5 == 0) { 1121 strindex += 10; 1122 if (strindex > strlen) { 1123 break; 1124 } 1125 ucol_setOffset(iter, strindex, &error); 1126 count5 = 5; 1127 } 1128 } 1129 count ++; 1130 } 1131 elapsedTime -= (timeGetTime() - startTime); 1132 printf("elapsedTime %ld\n", elapsedTime); 1133 ucol_closeElements(iter); 1134 1135 printf("gCount %d\n", gCount); 1136 ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 1137 printf("Average time per ucol_previous() nano seconds %d\n", ns); 1138 } 1139 1140 //--------------------------------------------------------------------------------------- 1141 // 1142 // doIterTest() Iteration test 1143 // 1144 //--------------------------------------------------------------------------------------- 1145 void doIterTest() { 1146 doForwardIterTest(opt_uselen); 1147 doBackwardIterTest(opt_uselen); 1148 } 1149 1150 1151 //---------------------------------------------------------------------------------------- 1152 // 1153 // UnixConvert -- Convert the lines of the file to the encoding for UNIX 1154 // Since it appears that Unicode support is going in the general 1155 // direction of the use of UTF-8 locales, that is the approach 1156 // that is used here. 1157 // 1158 //---------------------------------------------------------------------------------------- 1159 void UnixConvert() { 1160 int line; 1161 1162 UConverter *cvrtr; // An ICU code page converter. 1163 UErrorCode status = U_ZERO_ERROR; 1164 1165 1166 cvrtr = ucnv_open("utf-8", &status); // we are just doing UTF-8 locales for now. 1167 if (U_FAILURE(status)) { 1168 fprintf(stderr, "ICU Converter open failed.: %s\n", u_errorName(status)); 1169 exit(-1); 1170 } 1171 1172 for (line=0; line < gNumFileLines; line++) { 1173 int sizeNeeded = ucnv_fromUChars(cvrtr, 1174 0, // ptr to target buffer. 1175 0, // length of target buffer. 1176 gFileLines[line].name, 1177 -1, // source is null terminated 1178 &status); 1179 if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) { 1180 //fprintf(stderr, "Conversion from Unicode, something is wrong.\n"); 1181 //exit(-1); 1182 } 1183 status = U_ZERO_ERROR; 1184 gFileLines[line].unixName = new char[sizeNeeded+1]; 1185 sizeNeeded = ucnv_fromUChars(cvrtr, 1186 gFileLines[line].unixName, // ptr to target buffer. 1187 sizeNeeded+1, // length of target buffer. 1188 gFileLines[line].name, 1189 -1, // source is null terminated 1190 &status); 1191 if (U_FAILURE(status)) { 1192 fprintf(stderr, "ICU Conversion Failed.: %d\n", status); 1193 exit(-1); 1194 } 1195 gFileLines[line].unixName[sizeNeeded] = 0; 1196 }; 1197 ucnv_close(cvrtr); 1198 } 1199 1200 1201 //---------------------------------------------------------------------------------------- 1202 // 1203 // class UCharFile Class to hide all the gorp to read a file in 1204 // and produce a stream of UChars. 1205 // 1206 //---------------------------------------------------------------------------------------- 1207 class UCharFile { 1208 public: 1209 UCharFile(const char *fileName); 1210 ~UCharFile(); 1211 UChar get(); 1212 UBool eof() {return fEof;}; 1213 UBool error() {return fError;}; 1214 1215 private: 1216 UCharFile (const UCharFile & /*other*/) {}; // No copy constructor. 1217 UCharFile & operator = (const UCharFile &/*other*/) {return *this;}; // No assignment op 1218 1219 FILE *fFile; 1220 const char *fName; 1221 UBool fEof; 1222 UBool fError; 1223 UChar fPending2ndSurrogate; 1224 1225 enum {UTF16LE, UTF16BE, UTF8} fEncoding; 1226 }; 1227 1228 UCharFile::UCharFile(const char * fileName) { 1229 fEof = FALSE; 1230 fError = FALSE; 1231 fName = fileName; 1232 fFile = fopen(fName, "rb"); 1233 fPending2ndSurrogate = 0; 1234 if (fFile == NULL) { 1235 fprintf(stderr, "Can not open file \"%s\"\n", opt_fName); 1236 fError = TRUE; 1237 return; 1238 } 1239 // 1240 // Look for the byte order mark at the start of the file. 1241 // 1242 int BOMC1, BOMC2, BOMC3; 1243 BOMC1 = fgetc(fFile); 1244 BOMC2 = fgetc(fFile); 1245 1246 if (BOMC1 == 0xff && BOMC2 == 0xfe) { 1247 fEncoding = UTF16LE; } 1248 else if (BOMC1 == 0xfe && BOMC2 == 0xff) { 1249 fEncoding = UTF16BE; } 1250 else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) { 1251 fEncoding = UTF8; } 1252 else 1253 { 1254 fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and " 1255 "must include a BOM.\n", fileName); 1256 fError = true; 1257 return; 1258 } 1259 } 1260 1261 1262 UCharFile::~UCharFile() { 1263 fclose(fFile); 1264 } 1265 1266 1267 1268 UChar UCharFile::get() { 1269 UChar c; 1270 switch (fEncoding) { 1271 case UTF16LE: 1272 { 1273 int cL, cH; 1274 cL = fgetc(fFile); 1275 cH = fgetc(fFile); 1276 c = cL | (cH << 8); 1277 if (cH == EOF) { 1278 c = 0; 1279 fEof = TRUE; 1280 } 1281 break; 1282 } 1283 case UTF16BE: 1284 { 1285 int cL, cH; 1286 cH = fgetc(fFile); 1287 cL = fgetc(fFile); 1288 c = cL | (cH << 8); 1289 if (cL == EOF) { 1290 c = 0; 1291 fEof = TRUE; 1292 } 1293 break; 1294 } 1295 case UTF8: 1296 { 1297 if (fPending2ndSurrogate != 0) { 1298 c = fPending2ndSurrogate; 1299 fPending2ndSurrogate = 0; 1300 break; 1301 } 1302 1303 int ch = fgetc(fFile); // Note: c and ch are separate cause eof test doesn't work on UChar type. 1304 if (ch == EOF) { 1305 c = 0; 1306 fEof = TRUE; 1307 break; 1308 } 1309 1310 if (ch <= 0x7f) { 1311 // It's ascii. No further utf-8 conversion. 1312 c = ch; 1313 break; 1314 } 1315 1316 // Figure out the lenght of the char and read the rest of the bytes 1317 // into a temp array. 1318 int nBytes; 1319 if (ch >= 0xF0) {nBytes=4;} 1320 else if (ch >= 0xE0) {nBytes=3;} 1321 else if (ch >= 0xC0) {nBytes=2;} 1322 else { 1323 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n"); 1324 fError = TRUE; 1325 return 0; 1326 } 1327 1328 unsigned char bytes[10]; 1329 bytes[0] = (unsigned char)ch; 1330 int i; 1331 for (i=1; i<nBytes; i++) { 1332 bytes[i] = fgetc(fFile); 1333 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) { 1334 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n"); 1335 fError = TRUE; 1336 return 0; 1337 } 1338 } 1339 1340 // Convert the bytes from the temp array to a Unicode char. 1341 i = 0; 1342 uint32_t cp; 1343 UTF8_NEXT_CHAR_UNSAFE(bytes, i, cp); 1344 c = (UChar)cp; 1345 1346 if (cp >= 0x10000) { 1347 // The code point needs to be broken up into a utf-16 surrogate pair. 1348 // Process first half this time through the main loop, and 1349 // remember the other half for the next time through. 1350 UChar utf16Buf[3]; 1351 i = 0; 1352 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp); 1353 fPending2ndSurrogate = utf16Buf[1]; 1354 c = utf16Buf[0]; 1355 } 1356 break; 1357 }; 1358 default: 1359 c = 0xFFFD; /* Error, unspecified codepage*/ 1360 fprintf(stderr, "UCharFile: Error: unknown fEncoding\n"); 1361 exit(1); 1362 } 1363 return c; 1364 } 1365 1366 //---------------------------------------------------------------------------------------- 1367 // 1368 // openRulesCollator - Command line specified a rules file. Read it in 1369 // and open a collator with it. 1370 // 1371 //---------------------------------------------------------------------------------------- 1372 UCollator *openRulesCollator() { 1373 UCharFile f(opt_rules); 1374 if (f.error()) { 1375 return 0; 1376 } 1377 1378 int bufLen = 10000; 1379 UChar *buf = (UChar *)malloc(bufLen * sizeof(UChar)); 1380 int i = 0; 1381 1382 for(;;) { 1383 buf[i] = f.get(); 1384 if (f.eof()) { 1385 break; 1386 } 1387 if (f.error()) { 1388 return 0; 1389 } 1390 i++; 1391 if (i >= bufLen) { 1392 bufLen += 10000; 1393 buf = (UChar *)realloc(buf, bufLen); 1394 } 1395 } 1396 buf[i] = 0; 1397 1398 UErrorCode status = U_ZERO_ERROR; 1399 UCollator *coll = ucol_openRules(buf, u_strlen(buf), UCOL_OFF, 1400 UCOL_DEFAULT_STRENGTH, NULL, &status); 1401 if (U_FAILURE(status)) { 1402 fprintf(stderr, "ICU ucol_openRules() open failed.: %d\n", status); 1403 return 0; 1404 } 1405 free(buf); 1406 return coll; 1407 } 1408 1409 1410 1411 1412 1413 //---------------------------------------------------------------------------------------- 1414 // 1415 // Main -- process command line, read in and pre-process the test file, 1416 // call other functions to do the actual tests. 1417 // 1418 //---------------------------------------------------------------------------------------- 1419 int main(int argc, const char** argv) { 1420 if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) { 1421 printf(gUsageString); 1422 exit (1); 1423 } 1424 1425 // Make sure that we've only got one API selected. 1426 if (opt_unix || opt_win) opt_icu = FALSE; 1427 if (opt_unix) opt_win = FALSE; 1428 1429 // 1430 // Set up an ICU collator 1431 // 1432 UErrorCode status = U_ZERO_ERROR; 1433 1434 if (opt_rules != 0) { 1435 gCol = openRulesCollator(); 1436 if (gCol == 0) {return -1;} 1437 } 1438 else { 1439 gCol = ucol_open(opt_locale, &status); 1440 if (U_FAILURE(status)) { 1441 fprintf(stderr, "Collator creation failed.: %d\n", status); 1442 return -1; 1443 } 1444 } 1445 if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) { 1446 fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale); 1447 } 1448 if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) { 1449 fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale); 1450 } 1451 1452 if (opt_norm) { 1453 ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); 1454 } 1455 if (opt_french && opt_frenchoff) { 1456 fprintf(stderr, "collperf: Error, specified both -french and -frenchoff options."); 1457 exit(-1); 1458 } 1459 if (opt_french) { 1460 ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status); 1461 } 1462 if (opt_frenchoff) { 1463 ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status); 1464 } 1465 if (opt_lower) { 1466 ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status); 1467 } 1468 if (opt_upper) { 1469 ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status); 1470 } 1471 if (opt_case) { 1472 ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status); 1473 } 1474 if (opt_shifted) { 1475 ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status); 1476 } 1477 if (opt_level != 0) { 1478 switch (opt_level) { 1479 case 1: 1480 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status); 1481 break; 1482 case 2: 1483 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status); 1484 break; 1485 case 3: 1486 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status); 1487 break; 1488 case 4: 1489 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status); 1490 break; 1491 case 5: 1492 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status); 1493 break; 1494 default: 1495 fprintf(stderr, "-level param must be between 1 and 5\n"); 1496 exit(-1); 1497 } 1498 } 1499 1500 if (U_FAILURE(status)) { 1501 fprintf(stderr, "Collator attribute setting failed.: %d\n", status); 1502 return -1; 1503 } 1504 1505 1506 // 1507 // Set up a Windows LCID 1508 // 1509 if (opt_langid != 0) { 1510 gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT); 1511 } 1512 else { 1513 gWinLCID = uloc_getLCID(opt_locale); 1514 } 1515 1516 1517 // 1518 // Set the UNIX locale 1519 // 1520 if (opt_unix) { 1521 if (setlocale(LC_ALL, opt_locale) == 0) { 1522 fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale); 1523 exit(-1); 1524 } 1525 } 1526 1527 // Read in the input file. 1528 // File assumed to be utf-16. 1529 // Lines go onto heap buffers. Global index array to line starts is created. 1530 // Lines themselves are null terminated. 1531 // 1532 1533 UCharFile f(opt_fName); 1534 if (f.error()) { 1535 exit(-1); 1536 } 1537 1538 const int MAXLINES = 100000; 1539 gFileLines = new Line[MAXLINES]; 1540 UChar buf[1024]; 1541 int column = 0; 1542 1543 // Read the file, split into lines, and save in memory. 1544 // Loop runs once per utf-16 value from the input file, 1545 // (The number of bytes read from file per loop iteration depends on external encoding.) 1546 for (;;) { 1547 1548 UChar c = f.get(); 1549 if (f.error()){ 1550 exit(-1); 1551 } 1552 1553 1554 // We now have a good UTF-16 value in c. 1555 1556 // Watch for CR, LF, EOF; these finish off a line. 1557 if (c == 0xd) { 1558 continue; 1559 } 1560 1561 if (f.eof() || c == 0x0a || c==0x2028) { // Unipad inserts 2028 line separators! 1562 buf[column++] = 0; 1563 if (column > 1) { 1564 gFileLines[gNumFileLines].name = new UChar[column]; 1565 gFileLines[gNumFileLines].len = column-1; 1566 memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UChar)); 1567 gNumFileLines++; 1568 column = 0; 1569 if (gNumFileLines >= MAXLINES) { 1570 fprintf(stderr, "File too big. Max number of lines is %d\n", MAXLINES); 1571 exit(-1); 1572 } 1573 1574 } 1575 if (c == 0xa || c == 0x2028) 1576 continue; 1577 else 1578 break; // EOF 1579 } 1580 buf[column++] = c; 1581 if (column >= 1023) 1582 { 1583 static UBool warnFlag = TRUE; 1584 if (warnFlag) { 1585 fprintf(stderr, "Warning - file line longer than 1023 chars truncated.\n"); 1586 warnFlag = FALSE; 1587 } 1588 column--; 1589 } 1590 } 1591 1592 if (opt_terse == FALSE) { 1593 printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines); 1594 } 1595 1596 1597 // Convert the lines to the UNIX encoding. 1598 if (opt_unix) { 1599 UnixConvert(); 1600 } 1601 1602 // 1603 // Pre-compute ICU sort keys for the lines of the file. 1604 // 1605 int line; 1606 int32_t t; 1607 1608 for (line=0; line<gNumFileLines; line++) { 1609 t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)buf, sizeof(buf)); 1610 gFileLines[line].icuSortKey = new char[t]; 1611 1612 if (t > (int32_t)sizeof(buf)) { 1613 t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)gFileLines[line].icuSortKey , t); 1614 } 1615 else 1616 { 1617 memcpy(gFileLines[line].icuSortKey, buf, t); 1618 } 1619 } 1620 1621 1622 1623 // 1624 // Pre-compute Windows sort keys for the lines of the file. 1625 // 1626 for (line=0; line<gNumFileLines; line++) { 1627 t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf, sizeof(buf)); 1628 gFileLines[line].winSortKey = new char[t]; 1629 if (t > (int32_t)sizeof(buf)) { 1630 t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, (unsigned short *)(gFileLines[line].winSortKey), t); 1631 } 1632 else 1633 { 1634 memcpy(gFileLines[line].winSortKey, buf, t); 1635 } 1636 } 1637 1638 // 1639 // Pre-compute UNIX sort keys for the lines of the file. 1640 // 1641 if (opt_unix) { 1642 for (line=0; line<gNumFileLines; line++) { 1643 t=strxfrm((char *)buf, gFileLines[line].unixName, sizeof(buf)); 1644 gFileLines[line].unixSortKey = new char[t]; 1645 if (t > (int32_t)sizeof(buf)) { 1646 t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, sizeof(buf)); 1647 } 1648 else 1649 { 1650 memcpy(gFileLines[line].unixSortKey, buf, t); 1651 } 1652 } 1653 } 1654 1655 1656 // 1657 // Dump file lines, CEs, Sort Keys if requested. 1658 // 1659 if (opt_dump) { 1660 int i; 1661 for (line=0; line<gNumFileLines; line++) { 1662 for (i=0;;i++) { 1663 UChar c = gFileLines[line].name[i]; 1664 if (c == 0) 1665 break; 1666 if (c < 0x20 || c > 0x7e) { 1667 printf("\\u%.4x", c); 1668 } 1669 else { 1670 printf("%c", c); 1671 } 1672 } 1673 printf("\n"); 1674 1675 printf(" CEs: "); 1676 UCollationElements *CEiter = ucol_openElements(gCol, gFileLines[line].name, -1, &status); 1677 int32_t ce; 1678 i = 0; 1679 for (;;) { 1680 ce = ucol_next(CEiter, &status); 1681 if (ce == UCOL_NULLORDER) { 1682 break; 1683 } 1684 printf(" %.8x", ce); 1685 if (++i > 8) { 1686 printf("\n "); 1687 i = 0; 1688 } 1689 } 1690 printf("\n"); 1691 ucol_closeElements(CEiter); 1692 1693 1694 printf(" ICU Sort Key: "); 1695 for (i=0; ; i++) { 1696 unsigned char c = gFileLines[line].icuSortKey[i]; 1697 printf("%02x ", c); 1698 if (c == 0) { 1699 break; 1700 } 1701 if (i > 0 && i % 20 == 0) { 1702 printf("\n "); 1703 } 1704 } 1705 printf("\n"); 1706 } 1707 } 1708 1709 1710 // 1711 // Pre-sort the lines. 1712 // 1713 int i; 1714 gSortedLines = new Line *[gNumFileLines]; 1715 for (i=0; i<gNumFileLines; i++) { 1716 gSortedLines[i] = &gFileLines[i]; 1717 } 1718 1719 if (opt_win) { 1720 qsort(gSortedLines, gNumFileLines, sizeof(Line *), Winstrcmp); 1721 } 1722 else if (opt_unix) { 1723 qsort(gSortedLines, gNumFileLines, sizeof(Line *), UNIXstrcmp); 1724 } 1725 else /* ICU */ 1726 { 1727 qsort(gSortedLines, gNumFileLines, sizeof(Line *), ICUstrcmp); 1728 } 1729 1730 1731 // 1732 // Make up a randomized order, will be used for sorting tests. 1733 // 1734 gRandomLines = new Line *[gNumFileLines]; 1735 for (i=0; i<gNumFileLines; i++) { 1736 gRandomLines[i] = &gFileLines[i]; 1737 } 1738 qsort(gRandomLines, gNumFileLines, sizeof(Line *), ICURandomCmp); 1739 1740 1741 1742 1743 // 1744 // We've got the file read into memory. Go do something with it. 1745 // 1746 1747 if (opt_qsort) doQSort(); 1748 if (opt_binsearch) doBinarySearch(); 1749 if (opt_keygen) doKeyGen(); 1750 if (opt_keyhist) doKeyHist(); 1751 if (opt_itertest) doIterTest(); 1752 1753 return 0; 1754 1755 } 1756