Home | History | Annotate | Download | only in collationperf
      1 /***********************************************************************
      2  *  2016 and later: Unicode, Inc. and others.
      3  * License & terms of use: http://www.unicode.org/copyright.html#License
      4  ***********************************************************************
      5  ***********************************************************************
      6  * COPYRIGHT:
      7  * Copyright (C) 2001-2012 IBM, Inc.   All Rights Reserved.
      8  *
      9  ***********************************************************************/
     10 /********************************************************************************
     11 *
     12 * File CALLCOLL.C
     13 *
     14 * Modification History:
     15 *        Name                     Description
     16 *     Andy Heninger             First Version
     17 *
     18 *********************************************************************************
     19 */
     20 
     21 //
     22 //  This program tests string collation and sort key generation performance.
//      Three APIs can be tested: ICU C, Unix strcoll/strxfrm, and Windows LCMapString.
     24 //      A file of names is required as input, one per line.  It must be in utf-8 or utf-16 format,
     25 //      and include a byte order mark.  Either LE or BE format is OK.
     26 //
     27 
     28 const char gUsageString[] =
     29  "usage:  collperf options...\n"
     30     "-help                      Display this message.\n"
     31     "-file file_name            utf-16 format file of names.\n"
     32     "-locale name               ICU locale to use.  Default is en_US\n"
     33     "-rules file_name           Collation rules file (overrides locale)\n"
     34     "-langid 0x1234             Windows Language ID number.  Default to value for -locale option\n"
     35     "                              see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
     36     "-win                       Run test using Windows native services.  (ICU is default)\n"
     37     "-unix                      Run test using Unix strxfrm, strcoll services.\n"
     38     "-uselen                    Use API with string lengths.  Default is null-terminated strings\n"
     39     "-usekeys                   Run tests using sortkeys rather than strcoll\n"
     40     "-strcmp                    Run tests using u_strcmp rather than strcoll\n"
     41     "-strcmpCPO                 Run tests using u_strcmpCodePointOrder rather than strcoll\n"
     42     "-loop nnnn                 Loopcount for test.  Adjust for reasonable total running time.\n"
     43     "-iloop n                   Inner Loop Count.  Default = 1.  Number of calls to function\n"
     44     "                               under test at each call point.  For measuring test overhead.\n"
     45     "-terse                     Terse numbers-only output.  Intended for use by scripts.\n"
     46     "-french                    French accent ordering\n"
     47     "-frenchoff                 No French accent ordering (for use with French locales.)\n"
     48     "-norm                      Normalizing mode on\n"
     49     "-shifted                   Shifted mode\n"
     50     "-lower                     Lower case first\n"
     51     "-upper                     Upper case first\n"
     52     "-case                      Enable separate case level\n"
     53     "-level n                   Sort level, 1 to 5, for Primary, Secndary, Tertiary, Quaternary, Identical\n"
     54     "-keyhist                   Produce a table sort key size vs. string length\n"
     55     "-binsearch                 Binary Search timing test\n"
     56     "-keygen                    Sort Key Generation timing test\n"
     57     "-qsort                     Quicksort timing test\n"
     58     "-iter                      Iteration Performance Test\n"
     59     "-dump                      Display strings, sort keys and CEs.\n"
     60     ;
     61 
     62 
     63 
     64 #include <stdio.h>
     65 #include <string.h>
     66 #include <stdlib.h>
     67 #include <math.h>
     68 #include <locale.h>
     69 #include <errno.h>
     70 
     71 #include <unicode/utypes.h>
     72 #include <unicode/ucol.h>
     73 #include <unicode/ucoleitr.h>
     74 #include <unicode/uloc.h>
     75 #include <unicode/ustring.h>
     76 #include <unicode/ures.h>
     77 #include <unicode/uchar.h>
     78 #include <unicode/ucnv.h>
     79 #include <unicode/utf8.h>
     80 
     81 #ifdef WIN32
     82 #include <windows.h>
     83 #else
     84 //
     85 //  Stubs for Windows API functions when building on UNIXes.
     86 //
// Stand-in for the Windows DWORD type so that Windows-typed globals and
// calls in this file still compile on non-Windows builds.
typedef int DWORD;
// Do-nothing replacement for the Win32 CompareStringW(); ignores all of its
// arguments and always returns 0 (doBinarySearch treats 0 as an error).
inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
     89 #include <sys/time.h>
     90 unsigned long timeGetTime() {
     91     struct timeval t;
     92     gettimeofday(&t, 0);
     93     unsigned long val = t.tv_sec * 1000;  // Let it overflow.  Who cares.
     94     val += t.tv_usec / 1000;
     95     return val;
     96 }
// Do-nothing replacement for the Win32 LCMapStringW(); ignores all of its
// arguments, writes nothing to the output buffer and returns 0.
inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
// Placeholder values for the Windows constants and macro referenced by the
// Windows code paths; they only need to exist so the file compiles.
const int LCMAP_SORTKEY = 0;
#define MAKELCID(a,b) 0
const int SORT_DEFAULT = 0;
    101 #endif
    102 
    103 
    104 
    105 //
    106 //  Command line option variables
    107 //     These global variables are set according to the options specified
    108 //     on the command line by the user.
    109 char * opt_fName      = 0;
    110 const char * opt_locale     = "en_US";
    111 int    opt_langid     = 0;         // Defaults to value corresponding to opt_locale.
    112 char * opt_rules      = 0;
    113 UBool  opt_help       = FALSE;
    114 int    opt_loopCount  = 1;
    115 int    opt_iLoopCount = 1;
    116 UBool  opt_terse      = FALSE;
    117 UBool  opt_qsort      = FALSE;
    118 UBool  opt_binsearch  = FALSE;
    119 UBool  opt_icu        = TRUE;
    120 UBool  opt_win        = FALSE;      // Run with Windows native functions.
    121 UBool  opt_unix       = FALSE;      // Run with UNIX strcoll, strxfrm functions.
    122 UBool  opt_uselen     = FALSE;
    123 UBool  opt_usekeys    = FALSE;
    124 UBool  opt_strcmp     = FALSE;
    125 UBool  opt_strcmpCPO  = FALSE;
    126 UBool  opt_norm       = FALSE;
    127 UBool  opt_keygen     = FALSE;
    128 UBool  opt_french     = FALSE;
    129 UBool  opt_frenchoff  = FALSE;
    130 UBool  opt_shifted    = FALSE;
    131 UBool  opt_lower      = FALSE;
    132 UBool  opt_upper      = FALSE;
    133 UBool  opt_case       = FALSE;
    134 int    opt_level      = 0;
    135 UBool  opt_keyhist    = FALSE;
    136 UBool  opt_itertest   = FALSE;
    137 UBool  opt_dump       = FALSE;
    138 
    139 
    140 
    141 //
    142 //   Definitions for the command line options
    143 //
// One row of the command line option table.
struct OptSpec {
    const char *name;                   // Option text as typed, including the leading '-'.
    enum {FLAG, NUM, STRING} type;      // FLAG: no value, sets a UBool to TRUE.
                                        // NUM: next argument is parsed as an integer into an int.
                                        // STRING: next argument is stored as a char pointer.
    void *pVar;                         // Address of the variable to set; its real type depends on 'type'.
};
    149 
    150 OptSpec opts[] = {
    151     {"-file",        OptSpec::STRING, &opt_fName},
    152     {"-locale",      OptSpec::STRING, &opt_locale},
    153     {"-langid",      OptSpec::NUM,    &opt_langid},
    154     {"-rules",       OptSpec::STRING, &opt_rules},
    155     {"-qsort",       OptSpec::FLAG,   &opt_qsort},
    156     {"-binsearch",   OptSpec::FLAG,   &opt_binsearch},
    157     {"-iter",        OptSpec::FLAG,   &opt_itertest},
    158     {"-win",         OptSpec::FLAG,   &opt_win},
    159     {"-unix",        OptSpec::FLAG,   &opt_unix},
    160     {"-uselen",      OptSpec::FLAG,   &opt_uselen},
    161     {"-usekeys",     OptSpec::FLAG,   &opt_usekeys},
    162     {"-strcmp",      OptSpec::FLAG,   &opt_strcmp},
    163     {"-strcmpCPO",   OptSpec::FLAG,   &opt_strcmpCPO},
    164     {"-norm",        OptSpec::FLAG,   &opt_norm},
    165     {"-french",      OptSpec::FLAG,   &opt_french},
    166     {"-frenchoff",   OptSpec::FLAG,   &opt_frenchoff},
    167     {"-shifted",     OptSpec::FLAG,   &opt_shifted},
    168     {"-lower",       OptSpec::FLAG,   &opt_lower},
    169     {"-upper",       OptSpec::FLAG,   &opt_upper},
    170     {"-case",        OptSpec::FLAG,   &opt_case},
    171     {"-level",       OptSpec::NUM,    &opt_level},
    172     {"-keyhist",     OptSpec::FLAG,   &opt_keyhist},
    173     {"-keygen",      OptSpec::FLAG,   &opt_keygen},
    174     {"-loop",        OptSpec::NUM,    &opt_loopCount},
    175     {"-iloop",       OptSpec::NUM,    &opt_iLoopCount},
    176     {"-terse",       OptSpec::FLAG,   &opt_terse},
    177     {"-dump",        OptSpec::FLAG,   &opt_dump},
    178     {"-help",        OptSpec::FLAG,   &opt_help},
    179     {"-?",           OptSpec::FLAG,   &opt_help},
    180     {0, OptSpec::FLAG, 0}
    181 };
    182 
    183 
    184 //---------------------------------------------------------------------------
    185 //
    186 //  Global variables pointing to and describing the test file
    187 //
    188 //---------------------------------------------------------------------------
    189 
    190 //
    191 //   struct Line
    192 //
    193 //      Each line from the source file (containing a name, presumably) gets
    194 //      one of these structs.
    195 //
struct  Line {
    UChar     *name;          // The line's text, as passed to ucol_strcoll / u_strcmp / CompareStringW.
    int        len;           // Length handed to the length-taking APIs when -uselen is set.
    char      *winSortKey;    // Buffer that receives the LCMapStringW sort key; compared with strcmp.
    char      *icuSortKey;    // Buffer that receives the ucol_getSortKey sort key; compared with strcmp.
    char      *unixSortKey;   // Buffer that receives the strxfrm output; compared with strcmp.
    char      *unixName;      // char-string form of the name, as passed to strcoll / strxfrm.
};
    204 
    205 
    206 
    207 Line          *gFileLines;           // Ptr to array of Line structs, one per line in the file.
    208 int            gNumFileLines;
    209 UCollator     *gCol;
    210 DWORD          gWinLCID;
    211 
    212 Line          **gSortedLines;
    213 Line          **gRandomLines;
    214 int            gCount;
    215 
    216 
    217 
    218 //---------------------------------------------------------------------------
    219 //
    220 //  ProcessOptions()    Function to read the command line options.
    221 //
    222 //---------------------------------------------------------------------------
    223 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
    224 {
    225     int         i;
    226     int         argNum;
    227     const char  *pArgName;
    228     OptSpec    *pOpt;
    229 
    230     for (argNum=1; argNum<argc; argNum++) {
    231         pArgName = argv[argNum];
    232         for (pOpt = opts;  pOpt->name != 0; pOpt++) {
    233             if (strcmp(pOpt->name, pArgName) == 0) {
    234                 switch (pOpt->type) {
    235                 case OptSpec::FLAG:
    236                     *(UBool *)(pOpt->pVar) = TRUE;
    237                     break;
    238                 case OptSpec::STRING:
    239                     argNum ++;
    240                     if (argNum >= argc) {
    241                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
    242                         return FALSE;
    243                     }
    244                     *(const char **)(pOpt->pVar)  = argv[argNum];
    245                     break;
    246                 case OptSpec::NUM:
    247                     argNum ++;
    248                     if (argNum >= argc) {
    249                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
    250                         return FALSE;
    251                     }
    252                     char *endp;
    253                     i = strtol(argv[argNum], &endp, 0);
    254                     if (endp == argv[argNum]) {
    255                         fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
    256                         return FALSE;
    257                     }
    258                     *(int *)(pOpt->pVar) = i;
    259                 }
    260                 break;
    261             }
    262         }
    263         if (pOpt->name == 0)
    264         {
    265             fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
    266             return FALSE;
    267         }
    268     }
    269 return TRUE;
    270 }
    271 
    272 //---------------------------------------------------------------------------------------
    273 //
    274 //   Comparison functions for use by qsort.
    275 //
//       Seven flavors: ICU or Windows, each by SortKey, by String Compare with
//           lengths, or by String Compare null terminated; plus Unix strcoll.
    278 //
    279 //---------------------------------------------------------------------------------------
    280 int ICUstrcmpK(const void *a, const void *b) {
    281     gCount++;
    282     int t = strcmp((*(Line **)a)->icuSortKey, (*(Line **)b)->icuSortKey);
    283     return t;
    284 }
    285 
    286 
    287 int ICUstrcmpL(const void *a, const void *b) {
    288     gCount++;
    289     UCollationResult t;
    290     t = ucol_strcoll(gCol, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
    291     if (t == UCOL_LESS) return -1;
    292     if (t == UCOL_GREATER) return +1;
    293     return 0;
    294 }
    295 
    296 
    297 int ICUstrcmp(const void *a, const void *b) {
    298     gCount++;
    299     UCollationResult t;
    300     t = ucol_strcoll(gCol, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
    301     if (t == UCOL_LESS) return -1;
    302     if (t == UCOL_GREATER) return +1;
    303     return 0;
    304 }
    305 
    306 
    307 int Winstrcmp(const void *a, const void *b) {
    308     gCount++;
    309     int t;
    310     t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
    311     return t-2;
    312 }
    313 
    314 
    315 int UNIXstrcmp(const void *a, const void *b) {
    316     gCount++;
    317     int t;
    318     t = strcoll((*(Line **)a)->unixName, (*(Line **)b)->unixName);
    319     return t;
    320 }
    321 
    322 
    323 int WinstrcmpL(const void *a, const void *b) {
    324     gCount++;
    325     int t;
    326     t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
    327     return t-2;
    328 }
    329 
    330 
    331 int WinstrcmpK(const void *a, const void *b) {
    332     gCount++;
    333     int t = strcmp((*(Line **)a)->winSortKey, (*(Line **)b)->winSortKey);
    334     return t;
    335 }
    336 
    337 
    338 //---------------------------------------------------------------------------------------
    339 //
    340 //   Function for sorting the names (lines) into a random order.
    341 //      Order is based on a hash of the  ICU Sort key for the lines
    342 //      The randomized order is used as input for the sorting timing tests.
    343 //
    344 //---------------------------------------------------------------------------------------
    345 int ICURandomCmp(const void *a, const void *b) {
    346     char  *ask = (*(Line **)a)->icuSortKey;
    347     char  *bsk = (*(Line **)b)->icuSortKey;
    348     int   aVal = 0;
    349     int   bVal = 0;
    350     int   retVal;
    351     while (*ask != 0) {
    352         aVal += aVal*37 + *ask++;
    353     }
    354     while (*bsk != 0) {
    355         bVal += bVal*37 + *bsk++;
    356     }
    357     retVal = -1;
    358     if (aVal == bVal) {
    359         retVal = 0;
    360     }
    361     else if (aVal > bVal) {
    362         retVal = 1;
    363     }
    364     return retVal;
    365 }
    366 
    367 //---------------------------------------------------------------------------------------
    368 //
    369 //   doKeyGen()     Key Generation Timing Test
    370 //
    371 //---------------------------------------------------------------------------------------
    372 void doKeyGen()
    373 {
    374     int  line;
    375     int  loops = 0;
    376     int  iLoop;
    377     int  t;
    378     int  len=-1;
    379 
    380     // Adjust loop count to compensate for file size.   Should be order n
    381     double dLoopCount = double(opt_loopCount) * (1000. /  double(gNumFileLines));
    382     int adj_loopCount = int(dLoopCount);
    383     if (adj_loopCount < 1) adj_loopCount = 1;
    384 
    385 
    386     unsigned long startTime = timeGetTime();
    387 
    388     if (opt_win) {
    389         for (loops=0; loops<adj_loopCount; loops++) {
    390             for (line=0; line < gNumFileLines; line++) {
    391                 if (opt_uselen) {
    392                     len = gFileLines[line].len;
    393                 }
    394                 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
    395                     t=LCMapStringW(gWinLCID, LCMAP_SORTKEY,
    396                         gFileLines[line].name, len,
    397                         (unsigned short *)gFileLines[line].winSortKey, 5000);    // TODO  something with length.
    398                 }
    399             }
    400         }
    401     }
    402     else if (opt_icu)
    403     {
    404         for (loops=0; loops<adj_loopCount; loops++) {
    405             for (line=0; line < gNumFileLines; line++) {
    406                 if (opt_uselen) {
    407                     len = gFileLines[line].len;
    408                 }
    409                 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
    410                     t = ucol_getSortKey(gCol, gFileLines[line].name, len, (unsigned char *)gFileLines[line].icuSortKey, 5000);
    411                 }
    412             }
    413         }
    414     }
    415     else if (opt_unix)
    416     {
    417         for (loops=0; loops<adj_loopCount; loops++) {
    418             for (line=0; line < gNumFileLines; line++) {
    419                 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
    420                 t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, 5000);
    421                 }
    422             }
    423         }
    424     }
    425 
    426     unsigned long elapsedTime = timeGetTime() - startTime;
    427     int ns = (int)(float(1000000) * (float)elapsedTime / (float)(adj_loopCount*gNumFileLines));
    428 
    429     if (opt_terse == FALSE) {
    430         printf("Sort Key Generation:  total # of keys = %d\n", loops*gNumFileLines);
    431         printf("Sort Key Generation:  time per key = %d ns\n", ns);
    432     }
    433     else {
    434         printf("%d,  ", ns);
    435     }
    436 
    437     int   totalKeyLen = 0;
    438     int   totalChars  = 0;
    439     for (line=0; line<gNumFileLines; line++) {
    440         totalChars += u_strlen(gFileLines[line].name);
    441         if (opt_win) {
    442             totalKeyLen += strlen(gFileLines[line].winSortKey);
    443         }
    444         else if (opt_icu) {
    445             totalKeyLen += strlen(gFileLines[line].icuSortKey);
    446         }
    447         else if (opt_unix) {
    448             totalKeyLen += strlen(gFileLines[line].unixSortKey);
    449         }
    450 
    451     }
    452     if (opt_terse == FALSE) {
    453         printf("Key Length / character = %f\n", (float)totalKeyLen / (float)totalChars);
    454     } else {
    455         printf("%f, ", (float)totalKeyLen / (float)totalChars);
    456     }
    457 }
    458 
    459 
    460 
    461 //---------------------------------------------------------------------------------------
    462 //
    463 //    doBinarySearch()    Binary Search timing test.  Each name from the list
    464 //                        is looked up in the full sorted list of names.
    465 //
    466 //---------------------------------------------------------------------------------------
    467 void doBinarySearch()
    468 {
    469 
    470     gCount = 0;
    471     int  line;
    472     int  loops = 0;
    473     int  iLoop = 0;
    474     unsigned long elapsedTime = 0;
    475 
    476     // Adjust loop count to compensate for file size.   Should be order n (lookups) * log n  (compares/lookup)
    477     // Accurate timings do not depend on this being perfect.  The correction is just to try to
    478     //   get total running times of about the right order, so the that user doesn't need to
    479     //   manually adjust the loop count for every different file size.
    480     double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines));
    481     if (opt_usekeys) dLoopCount *= 5;
    482     int adj_loopCount = int(dLoopCount);
    483     if (adj_loopCount < 1) adj_loopCount = 1;
    484 
    485 
    486     for (;;) {  // not really a loop, just allows "break" to work, to simplify
    487                 //   inadvertantly running more than one test through here.
    488         if (opt_strcmp || opt_strcmpCPO)
    489         {
    490             unsigned long startTime = timeGetTime();
    491             typedef int32_t (U_EXPORT2 *PF)(const UChar *, const UChar *);
    492             PF pf = u_strcmp;
    493             if (opt_strcmpCPO) {pf = u_strcmpCodePointOrder;}
    494             //if (opt_strcmp && opt_win) {pf = (PF)wcscmp;}   // Damn the difference between int32_t and int
    495                                                             //   which forces the use of a cast here.
    496 
    497             int r = 0;
    498             for (loops=0; loops<adj_loopCount; loops++) {
    499 
    500                 for (line=0; line < gNumFileLines; line++) {
    501                     int hi      = gNumFileLines-1;
    502                     int lo      = 0;
    503                     int  guess = -1;
    504                     for (;;) {
    505                         int newGuess = (hi + lo) / 2;
    506                         if (newGuess == guess)
    507                             break;
    508                         guess = newGuess;
    509                         for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
    510                             r = (*pf)((gSortedLines[line])->name, (gSortedLines[guess])->name);
    511                         }
    512                         gCount++;
    513                         if (r== 0)
    514                             break;
    515                         if (r < 0)
    516                             hi = guess;
    517                         else
    518                             lo   = guess;
    519                     }
    520                 }
    521             }
    522             elapsedTime = timeGetTime() - startTime;
    523             break;
    524         }
    525 
    526 
    527         if (opt_icu)
    528         {
    529             unsigned long startTime = timeGetTime();
    530             UCollationResult  r = UCOL_EQUAL;
    531             for (loops=0; loops<adj_loopCount; loops++) {
    532 
    533                 for (line=0; line < gNumFileLines; line++) {
    534                     int lineLen  = -1;
    535                     int guessLen = -1;
    536                     if (opt_uselen) {
    537                         lineLen = (gSortedLines[line])->len;
    538                     }
    539                     int hi      = gNumFileLines-1;
    540                     int lo      = 0;
    541                     int  guess = -1;
    542                     for (;;) {
    543                         int newGuess = (hi + lo) / 2;
    544                         if (newGuess == guess)
    545                             break;
    546                         guess = newGuess;
    547                         int ri = 0;
    548                         if (opt_usekeys) {
    549                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
    550                                 ri = strcmp((gSortedLines[line])->icuSortKey, (gSortedLines[guess])->icuSortKey);
    551                             }
    552                             gCount++;
    553                             r=UCOL_GREATER; if(ri<0) {r=UCOL_LESS;} else if (ri==0) {r=UCOL_EQUAL;}
    554                         }
    555                         else
    556                         {
    557                             if (opt_uselen) {
    558                                 guessLen = (gSortedLines[guess])->len;
    559                             }
    560                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
    561                                 r = ucol_strcoll(gCol, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
    562                             }
    563                             gCount++;
    564                         }
    565                         if (r== UCOL_EQUAL)
    566                             break;
    567                         if (r == UCOL_LESS)
    568                             hi = guess;
    569                         else
    570                             lo   = guess;
    571                     }
    572                 }
    573             }
    574             elapsedTime = timeGetTime() - startTime;
    575             break;
    576         }
    577 
    578         if (opt_win)
    579         {
    580             unsigned long startTime = timeGetTime();
    581             int r = 0;
    582             for (loops=0; loops<adj_loopCount; loops++) {
    583 
    584                 for (line=0; line < gNumFileLines; line++) {
    585                     int lineLen  = -1;
    586                     int guessLen = -1;
    587                     if (opt_uselen) {
    588                         lineLen = (gSortedLines[line])->len;
    589                     }
    590                     int hi   = gNumFileLines-1;
    591                     int lo   = 0;
    592                     int  guess = -1;
    593                     for (;;) {
    594                         int newGuess = (hi + lo) / 2;
    595                         if (newGuess == guess)
    596                             break;
    597                         guess = newGuess;
    598                         if (opt_usekeys) {
    599                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
    600                                 r = strcmp((gSortedLines[line])->winSortKey, (gSortedLines[guess])->winSortKey);
    601                             }
    602                             gCount++;
    603                             r+=2;
    604                         }
    605                         else
    606                         {
    607                             if (opt_uselen) {
    608                                 guessLen = (gSortedLines[guess])->len;
    609                             }
    610                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
    611                                 r = CompareStringW(gWinLCID, 0, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
    612                             }
    613                             if (r == 0) {
    614                                 if (opt_terse == FALSE) {
    615                                     fprintf(stderr, "Error returned from Windows CompareStringW.\n");
    616                                 }
    617                                 exit(-1);
    618                             }
    619                             gCount++;
    620                         }
    621                         if (r== 2)   //  strings ==
    622                             break;
    623                         if (r == 1)  //  line < guess
    624                             hi = guess;
    625                         else         //  line > guess
    626                             lo   = guess;
    627                     }
    628                 }
    629             }
    630             elapsedTime = timeGetTime() - startTime;
    631             break;
    632         }
    633 
    634         if (opt_unix)
    635         {
    636             unsigned long startTime = timeGetTime();
    637             int r = 0;
    638             for (loops=0; loops<adj_loopCount; loops++) {
    639 
    640                 for (line=0; line < gNumFileLines; line++) {
    641                     int hi   = gNumFileLines-1;
    642                     int lo   = 0;
    643                     int  guess = -1;
    644                     for (;;) {
    645                         int newGuess = (hi + lo) / 2;
    646                         if (newGuess == guess)
    647                             break;
    648                         guess = newGuess;
    649                         if (opt_usekeys) {
    650                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
    651                                  r = strcmp((gSortedLines[line])->unixSortKey, (gSortedLines[guess])->unixSortKey);
    652                             }
    653                             gCount++;
    654                         }
    655                         else
    656                         {
    657                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
    658                                 r = strcoll((gSortedLines[line])->unixName, (gSortedLines[guess])->unixName);
    659                             }
    660                             errno = 0;
    661                             if (errno != 0) {
    662                                 fprintf(stderr, "Error %d returned from strcoll.\n", errno);
    663                                 exit(-1);
    664                             }
    665                             gCount++;
    666                         }
    667                         if (r == 0)   //  strings ==
    668                             break;
    669                         if (r < 0)  //  line < guess
    670                             hi = guess;
    671                         else         //  line > guess
    672                             lo   = guess;
    673                     }
    674                 }
    675             }
    676             elapsedTime = timeGetTime() - startTime;
    677             break;
    678         }
    679         break;
    680     }
    681 
    682     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
    683     if (opt_terse == FALSE) {
    684         printf("binary search:  total # of string compares = %d\n", gCount);
    685         printf("binary search:  compares per loop = %d\n", gCount / loops);
    686         printf("binary search:  time per compare = %d ns\n", ns);
    687     } else {
    688         printf("%d, ", ns);
    689     }
    690 
    691 }
    692 
    693 
    694 
    695 
    696 //---------------------------------------------------------------------------------------
    697 //
    698 //   doQSort()    The quick sort timing test.  Uses the C library qsort function.
    699 //
    700 //---------------------------------------------------------------------------------------
    701 void doQSort() {
    702     int i;
    703     Line **sortBuf = new Line *[gNumFileLines];
    704 
    705     // Adjust loop count to compensate for file size.   QSort should be n log(n)
    706     double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines));
    707     if (opt_usekeys) dLoopCount *= 5;
    708     int adj_loopCount = int(dLoopCount);
    709     if (adj_loopCount < 1) adj_loopCount = 1;
    710 
    711 
    712     gCount = 0;
    713     unsigned long startTime = timeGetTime();
    714     if (opt_win && opt_usekeys) {
    715         for (i=0; i<opt_loopCount; i++) {
    716             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
    717             qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpK);
    718         }
    719     }
    720 
    721     else if (opt_win && opt_uselen) {
    722         for (i=0; i<adj_loopCount; i++) {
    723             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
    724             qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpL);
    725         }
    726     }
    727 
    728 
    729     else if (opt_win && !opt_uselen) {
    730         for (i=0; i<adj_loopCount; i++) {
    731             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
    732             qsort(sortBuf, gNumFileLines, sizeof(Line *), Winstrcmp);
    733         }
    734     }
    735 
    736     else if (opt_icu && opt_usekeys) {
    737         for (i=0; i<adj_loopCount; i++) {
    738             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
    739             qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpK);
    740         }
    741     }
    742 
    743     else if (opt_icu && opt_uselen) {
    744         for (i=0; i<adj_loopCount; i++) {
    745             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
    746             qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpL);
    747         }
    748     }
    749 
    750 
    751     else if (opt_icu && !opt_uselen) {
    752         for (i=0; i<adj_loopCount; i++) {
    753             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
    754             qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmp);
    755         }
    756     }
    757 
    758     else if (opt_unix && !opt_usekeys) {
    759         for (i=0; i<adj_loopCount; i++) {
    760             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
    761             qsort(sortBuf, gNumFileLines, sizeof(Line *), UNIXstrcmp);
    762         }
    763     }
    764 
    765     unsigned long elapsedTime = timeGetTime() - startTime;
    766     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
    767     if (opt_terse == FALSE) {
    768         printf("qsort:  total # of string compares = %d\n", gCount);
    769         printf("qsort:  time per compare = %d ns\n", ns);
    770     } else {
    771         printf("%d, ", ns);
    772     }
    773 }
    774 
    775 
    776 
    777 //---------------------------------------------------------------------------------------
    778 //
    779 //    doKeyHist()       Output a table of data for
    780 //                        average sort key size vs. string length.
    781 //
    782 //---------------------------------------------------------------------------------------
    783 void doKeyHist() {
    784     int     i;
    785     int     maxLen = 0;
    786 
    787     // Find the maximum string length
    788     for (i=0; i<gNumFileLines; i++) {
    789         if (gFileLines[i].len > maxLen) maxLen = gFileLines[i].len;
    790     }
    791 
    792     // Allocate arrays to hold the histogram data
    793     int *accumulatedLen  = new int[maxLen+1];
    794     int *numKeysOfSize   = new int[maxLen+1];
    795     for (i=0; i<=maxLen; i++) {
    796         accumulatedLen[i] = 0;
    797         numKeysOfSize[i] = 0;
    798     }
    799 
    800     // Fill the arrays...
    801     for (i=0; i<gNumFileLines; i++) {
    802         int len = gFileLines[i].len;
    803         accumulatedLen[len] += strlen(gFileLines[i].icuSortKey);
    804         numKeysOfSize[len] += 1;
    805     }
    806 
    807     // And write out averages
    808     printf("String Length,  Avg Key Length,  Avg Key Len per char\n");
    809     for (i=1; i<=maxLen; i++) {
    810         if (numKeysOfSize[i] > 0) {
    811             printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysOfSize[i],
    812                 (float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i));
    813         }
    814     }
    815     delete []accumulatedLen;
    816     delete []numKeysOfSize ;
    817 }
    818 
    819 //---------------------------------------------------------------------------------------
    820 //
//    doForwardIterTest(UBool)       Forward iteration test.
//                                   Argument: TRUE to pass explicit string lengths
//                                   to the iterator, FALSE to pass -1 (null-terminated
//                                   strings).
    823 //
    824 //---------------------------------------------------------------------------------------
    825 void doForwardIterTest(UBool haslen) {
    826     int count = 0;
    827 
    828     UErrorCode error = U_ZERO_ERROR;
    829     printf("\n\nPerforming forward iteration performance test with ");
    830 
    831     if (haslen) {
    832         printf("non-null terminated data -----------\n");
    833     }
    834     else {
    835         printf("null terminated data -----------\n");
    836     }
    837     printf("performance test on strings from file -----------\n");
    838 
    839     UChar dummytext[] = {0, 0};
    840     UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
    841     ucol_setText(iter, dummytext, 1, &error);
    842 
    843     gCount = 0;
    844     unsigned long startTime = timeGetTime();
    845     while (count < opt_loopCount) {
    846         int linecount = 0;
    847         while (linecount < gNumFileLines) {
    848             UChar *str = gFileLines[linecount].name;
    849             int strlen = haslen?gFileLines[linecount].len:-1;
    850             ucol_setText(iter, str, strlen, &error);
    851             while (ucol_next(iter, &error) != UCOL_NULLORDER) {
    852                 gCount++;
    853             }
    854 
    855             linecount ++;
    856         }
    857         count ++;
    858     }
    859     unsigned long elapsedTime = timeGetTime() - startTime;
    860     printf("elapsedTime %ld\n", elapsedTime);
    861 
    862     // empty loop recalculation
    863     count = 0;
    864     startTime = timeGetTime();
    865     while (count < opt_loopCount) {
    866         int linecount = 0;
    867         while (linecount < gNumFileLines) {
    868             UChar *str = gFileLines[linecount].name;
    869             int strlen = haslen?gFileLines[linecount].len:-1;
    870             ucol_setText(iter, str, strlen, &error);
    871             linecount ++;
    872         }
    873         count ++;
    874     }
    875     elapsedTime -= (timeGetTime() - startTime);
    876     printf("elapsedTime %ld\n", elapsedTime);
    877 
    878     ucol_closeElements(iter);
    879 
    880     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
    881     printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
    882                                                                 opt_loopCount);
    883     printf("Average time per ucol_next() nano seconds %d\n", ns);
    884 
    885     printf("performance test on skipped-5 concatenated strings from file -----------\n");
    886 
    887     UChar *str;
    888     int    strlen = 0;
    889     // appending all the strings
    890     int linecount = 0;
    891     while (linecount < gNumFileLines) {
    892         strlen += haslen?gFileLines[linecount].len:
    893                                       u_strlen(gFileLines[linecount].name);
    894         linecount ++;
    895     }
    896     str = (UChar *)malloc(sizeof(UChar) * strlen);
    897     int strindex = 0;
    898     linecount = 0;
    899     while (strindex < strlen) {
    900         int len = 0;
    901         len += haslen?gFileLines[linecount].len:
    902                                       u_strlen(gFileLines[linecount].name);
    903         memcpy(str + strindex, gFileLines[linecount].name,
    904                sizeof(UChar) * len);
    905         strindex += len;
    906         linecount ++;
    907     }
    908 
    909     printf("Total size of strings %d\n", strlen);
    910 
    911     gCount = 0;
    912     count  = 0;
    913 
    914     if (!haslen) {
    915         strlen = -1;
    916     }
    917     iter = ucol_openElements(gCol, str, strlen, &error);
    918     if (!haslen) {
    919         strlen = u_strlen(str);
    920     }
    921     strlen -= 5; // any left over characters are not iterated,
    922                  // this is to ensure the backwards and forwards iterators
    923                  // gets the same position
    924     startTime = timeGetTime();
    925     while (count < opt_loopCount) {
    926         int count5 = 5;
    927         strindex = 0;
    928         ucol_setOffset(iter, strindex, &error);
    929         while (TRUE) {
    930             if (ucol_next(iter, &error) == UCOL_NULLORDER) {
    931                 break;
    932             }
    933             gCount++;
    934             count5 --;
    935             if (count5 == 0) {
    936                 strindex += 10;
    937                 if (strindex > strlen) {
    938                     break;
    939                 }
    940                 ucol_setOffset(iter, strindex, &error);
    941                 count5 = 5;
    942             }
    943         }
    944         count ++;
    945     }
    946 
    947     elapsedTime = timeGetTime() - startTime;
    948     printf("elapsedTime %ld\n", elapsedTime);
    949 
    950     // empty loop recalculation
    951     int tempgCount = 0;
    952     count = 0;
    953     startTime = timeGetTime();
    954     while (count < opt_loopCount) {
    955         int count5 = 5;
    956         strindex = 0;
    957         ucol_setOffset(iter, strindex, &error);
    958         while (TRUE) {
    959             tempgCount ++;
    960             count5 --;
    961             if (count5 == 0) {
    962                 strindex += 10;
    963                 if (strindex > strlen) {
    964                     break;
    965                 }
    966                 ucol_setOffset(iter, strindex, &error);
    967                 count5 = 5;
    968             }
    969         }
    970         count ++;
    971     }
    972     elapsedTime -= (timeGetTime() - startTime);
    973     printf("elapsedTime %ld\n", elapsedTime);
    974 
    975     ucol_closeElements(iter);
    976 
    977     printf("gCount %d\n", gCount);
    978     ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
    979     printf("Average time per ucol_next() nano seconds %d\n", ns);
    980 }
    981 
    982 //---------------------------------------------------------------------------------------
    983 //
//    doBackwardIterTest(UBool)      Backwards iteration test.
//                                   Argument: TRUE to pass explicit string lengths
//                                   to the iterator, FALSE to pass -1 (null-terminated
//                                   strings).
    986 //
    987 //---------------------------------------------------------------------------------------
    988 void doBackwardIterTest(UBool haslen) {
    989     int count = 0;
    990     UErrorCode error = U_ZERO_ERROR;
    991     printf("\n\nPerforming backward iteration performance test with ");
    992 
    993     if (haslen) {
    994         printf("non-null terminated data -----------\n");
    995     }
    996     else {
    997         printf("null terminated data -----------\n");
    998     }
    999 
   1000     printf("performance test on strings from file -----------\n");
   1001 
   1002     UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
   1003     UChar dummytext[] = {0, 0};
   1004     ucol_setText(iter, dummytext, 1, &error);
   1005 
   1006     gCount = 0;
   1007     unsigned long startTime = timeGetTime();
   1008     while (count < opt_loopCount) {
   1009         int linecount = 0;
   1010         while (linecount < gNumFileLines) {
   1011             UChar *str = gFileLines[linecount].name;
   1012             int strlen = haslen?gFileLines[linecount].len:-1;
   1013             ucol_setText(iter, str, strlen, &error);
   1014             while (ucol_previous(iter, &error) != UCOL_NULLORDER) {
   1015                 gCount ++;
   1016             }
   1017 
   1018             linecount ++;
   1019         }
   1020         count ++;
   1021     }
   1022     unsigned long elapsedTime = timeGetTime() - startTime;
   1023 
   1024     printf("elapsedTime %ld\n", elapsedTime);
   1025 
   1026     // empty loop recalculation
   1027     count = 0;
   1028     startTime = timeGetTime();
   1029     while (count < opt_loopCount) {
   1030         int linecount = 0;
   1031         while (linecount < gNumFileLines) {
   1032             UChar *str = gFileLines[linecount].name;
   1033             int strlen = haslen?gFileLines[linecount].len:-1;
   1034             ucol_setText(iter, str, strlen, &error);
   1035             linecount ++;
   1036         }
   1037         count ++;
   1038     }
   1039     elapsedTime -= (timeGetTime() - startTime);
   1040 
   1041     printf("elapsedTime %ld\n", elapsedTime);
   1042     ucol_closeElements(iter);
   1043 
   1044     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
   1045     printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
   1046                                                                 opt_loopCount);
   1047     printf("Average time per ucol_previous() nano seconds %d\n", ns);
   1048 
   1049     printf("performance test on skipped-5 concatenated strings from file -----------\n");
   1050 
   1051     UChar *str;
   1052     int    strlen = 0;
   1053     // appending all the strings
   1054     int linecount = 0;
   1055     while (linecount < gNumFileLines) {
   1056         strlen += haslen?gFileLines[linecount].len:
   1057                                       u_strlen(gFileLines[linecount].name);
   1058         linecount ++;
   1059     }
   1060     str = (UChar *)malloc(sizeof(UChar) * strlen);
   1061     int strindex = 0;
   1062     linecount = 0;
   1063     while (strindex < strlen) {
   1064         int len = 0;
   1065         len += haslen?gFileLines[linecount].len:
   1066                                       u_strlen(gFileLines[linecount].name);
   1067         memcpy(str + strindex, gFileLines[linecount].name,
   1068                sizeof(UChar) * len);
   1069         strindex += len;
   1070         linecount ++;
   1071     }
   1072 
   1073     printf("Total size of strings %d\n", strlen);
   1074 
   1075     gCount = 0;
   1076     count  = 0;
   1077 
   1078     if (!haslen) {
   1079         strlen = -1;
   1080     }
   1081 
   1082     iter = ucol_openElements(gCol, str, strlen, &error);
   1083     if (!haslen) {
   1084         strlen = u_strlen(str);
   1085     }
   1086 
   1087     startTime = timeGetTime();
   1088     while (count < opt_loopCount) {
   1089         int count5 = 5;
   1090         strindex = 5;
   1091         ucol_setOffset(iter, strindex, &error);
   1092         while (TRUE) {
   1093             if (ucol_previous(iter, &error) == UCOL_NULLORDER) {
   1094                 break;
   1095             }
   1096              gCount ++;
   1097              count5 --;
   1098              if (count5 == 0) {
   1099                  strindex += 10;
   1100                  if (strindex > strlen) {
   1101                     break;
   1102                  }
   1103                  ucol_setOffset(iter, strindex, &error);
   1104                  count5 = 5;
   1105              }
   1106         }
   1107         count ++;
   1108     }
   1109 
   1110     elapsedTime = timeGetTime() - startTime;
   1111     printf("elapsedTime %ld\n", elapsedTime);
   1112 
   1113     // empty loop recalculation
   1114     count = 0;
   1115     int tempgCount = 0;
   1116     startTime = timeGetTime();
   1117     while (count < opt_loopCount) {
   1118         int count5 = 5;
   1119         strindex = 5;
   1120         ucol_setOffset(iter, strindex, &error);
   1121         while (TRUE) {
   1122              tempgCount ++;
   1123              count5 --;
   1124              if (count5 == 0) {
   1125                  strindex += 10;
   1126                  if (strindex > strlen) {
   1127                     break;
   1128                  }
   1129                  ucol_setOffset(iter, strindex, &error);
   1130                  count5 = 5;
   1131              }
   1132         }
   1133         count ++;
   1134     }
   1135     elapsedTime -= (timeGetTime() - startTime);
   1136     printf("elapsedTime %ld\n", elapsedTime);
   1137     ucol_closeElements(iter);
   1138 
   1139     printf("gCount %d\n", gCount);
   1140     ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
   1141     printf("Average time per ucol_previous() nano seconds %d\n", ns);
   1142 }
   1143 
   1144 //---------------------------------------------------------------------------------------
   1145 //
   1146 //    doIterTest()       Iteration test
   1147 //
   1148 //---------------------------------------------------------------------------------------
   1149 void doIterTest() {
   1150     doForwardIterTest(opt_uselen);
   1151     doBackwardIterTest(opt_uselen);
   1152 }
   1153 
   1154 
   1155 //----------------------------------------------------------------------------------------
   1156 //
   1157 //   UnixConvert   -- Convert the lines of the file to the encoding for UNIX
   1158 //                    Since it appears that Unicode support is going in the general
   1159 //                    direction of the use of UTF-8 locales, that is the approach
   1160 //                    that is used here.
   1161 //
   1162 //----------------------------------------------------------------------------------------
   1163 void  UnixConvert() {
   1164     int    line;
   1165 
   1166     UConverter   *cvrtr;    // An ICU code page converter.
   1167     UErrorCode    status = U_ZERO_ERROR;
   1168 
   1169 
   1170     cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
   1171     if (U_FAILURE(status)) {
   1172         fprintf(stderr, "ICU Converter open failed.: %s\n", u_errorName(status));
   1173         exit(-1);
   1174     }
   1175 
   1176     for (line=0; line < gNumFileLines; line++) {
   1177         int sizeNeeded = ucnv_fromUChars(cvrtr,
   1178                                          0,            // ptr to target buffer.
   1179                                          0,            // length of target buffer.
   1180                                          gFileLines[line].name,
   1181                                          -1,           //  source is null terminated
   1182                                          &status);
   1183         if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
   1184             //fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
   1185             //exit(-1);
   1186         }
   1187         status = U_ZERO_ERROR;
   1188         gFileLines[line].unixName = new char[sizeNeeded+1];
   1189         sizeNeeded = ucnv_fromUChars(cvrtr,
   1190                                          gFileLines[line].unixName, // ptr to target buffer.
   1191                                          sizeNeeded+1, // length of target buffer.
   1192                                          gFileLines[line].name,
   1193                                          -1,           //  source is null terminated
   1194                                          &status);
   1195         if (U_FAILURE(status)) {
   1196             fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
   1197             exit(-1);
   1198         }
   1199         gFileLines[line].unixName[sizeNeeded] = 0;
   1200     };
   1201     ucnv_close(cvrtr);
   1202 }
   1203 
   1204 
   1205 //----------------------------------------------------------------------------------------
   1206 //
   1207 //  class UCharFile   Class to hide all the gorp to read a file in
   1208 //                    and produce a stream of UChars.
   1209 //
   1210 //----------------------------------------------------------------------------------------
   1211 class UCharFile {
   1212 public:
   1213     UCharFile(const char *fileName);
   1214     ~UCharFile();
   1215     UChar   get();
   1216     UBool   eof() {return fEof;};
   1217     UBool   error() {return fError;};
   1218 
   1219 private:
   1220     UCharFile (const UCharFile & /*other*/) {};                         // No copy constructor.
   1221     UCharFile & operator = (const UCharFile &/*other*/) {return *this;};   // No assignment op
   1222 
   1223     FILE         *fFile;
   1224     const char   *fName;
   1225     UBool        fEof;
   1226     UBool        fError;
   1227     UChar        fPending2ndSurrogate;
   1228 
   1229     enum {UTF16LE, UTF16BE, UTF8} fEncoding;
   1230 };
   1231 
   1232 UCharFile::UCharFile(const char * fileName) {
   1233     fEof                 = FALSE;
   1234     fError               = FALSE;
   1235     fName                = fileName;
   1236     fFile                = fopen(fName, "rb");
   1237     fPending2ndSurrogate = 0;
   1238     if (fFile == NULL) {
   1239         fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
   1240         fError = TRUE;
   1241         return;
   1242     }
   1243     //
   1244     //  Look for the byte order mark at the start of the file.
   1245     //
   1246     int BOMC1, BOMC2, BOMC3;
   1247     BOMC1 = fgetc(fFile);
   1248     BOMC2 = fgetc(fFile);
   1249 
   1250     if (BOMC1 == 0xff && BOMC2 == 0xfe) {
   1251         fEncoding = UTF16LE; }
   1252     else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
   1253         fEncoding = UTF16BE; }
   1254     else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
   1255         fEncoding = UTF8; }
   1256     else
   1257     {
   1258         fprintf(stderr, "collperf:  file \"%s\" encoding must be UTF-8 or UTF-16, and "
   1259             "must include a BOM.\n", fileName);
   1260         fError = true;
   1261         return;
   1262     }
   1263 }
   1264 
   1265 
   1266 UCharFile::~UCharFile() {
   1267     fclose(fFile);
   1268 }
   1269 
   1270 
   1271 
   1272 UChar UCharFile::get() {
   1273     UChar   c;
   1274     switch (fEncoding) {
   1275     case UTF16LE:
   1276         {
   1277             int  cL, cH;
   1278             cL = fgetc(fFile);
   1279             cH = fgetc(fFile);
   1280             c  = cL  | (cH << 8);
   1281             if (cH == EOF) {
   1282                 c   = 0;
   1283                 fEof = TRUE;
   1284             }
   1285             break;
   1286         }
   1287     case UTF16BE:
   1288         {
   1289             int  cL, cH;
   1290             cH = fgetc(fFile);
   1291             cL = fgetc(fFile);
   1292             c  = cL  | (cH << 8);
   1293             if (cL == EOF) {
   1294                 c   = 0;
   1295                 fEof = TRUE;
   1296             }
   1297             break;
   1298         }
   1299     case UTF8:
   1300         {
   1301             if (fPending2ndSurrogate != 0) {
   1302                 c = fPending2ndSurrogate;
   1303                 fPending2ndSurrogate = 0;
   1304                 break;
   1305             }
   1306 
   1307             int ch = fgetc(fFile);   // Note:  c and ch are separate cause eof test doesn't work on UChar type.
   1308             if (ch == EOF) {
   1309                 c = 0;
   1310                 fEof = TRUE;
   1311                 break;
   1312             }
   1313 
   1314             if (ch <= 0x7f) {
   1315                 // It's ascii.  No further utf-8 conversion.
   1316                 c = ch;
   1317                 break;
   1318             }
   1319 
   1320             // Figure out the lenght of the char and read the rest of the bytes
   1321             //   into a temp array.
   1322             int nBytes;
   1323             if (ch >= 0xF0) {nBytes=4;}
   1324             else if (ch >= 0xE0) {nBytes=3;}
   1325             else if (ch >= 0xC0) {nBytes=2;}
   1326             else {
   1327                 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
   1328                 fError = TRUE;
   1329                 return 0;
   1330             }
   1331 
   1332             unsigned char  bytes[10];
   1333             bytes[0] = (unsigned char)ch;
   1334             int i;
   1335             for (i=1; i<nBytes; i++) {
   1336                 bytes[i] = fgetc(fFile);
   1337                 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
   1338                     fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
   1339                     fError = TRUE;
   1340                     return 0;
   1341                 }
   1342             }
   1343 
   1344             // Convert the bytes from the temp array to a Unicode char.
   1345             i = 0;
   1346             uint32_t  cp;
   1347             U8_NEXT_UNSAFE(bytes, i, cp);
   1348             c = (UChar)cp;
   1349 
   1350             if (cp >= 0x10000) {
   1351                 // The code point needs to be broken up into a utf-16 surrogate pair.
   1352                 //  Process first half this time through the main loop, and
   1353                 //   remember the other half for the next time through.
   1354                 UChar utf16Buf[3];
   1355                 i = 0;
   1356                 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
   1357                 fPending2ndSurrogate = utf16Buf[1];
   1358                 c = utf16Buf[0];
   1359             }
   1360             break;
   1361         };
   1362     default:
   1363         c = 0xFFFD; /* Error, unspecified codepage*/
   1364         fprintf(stderr, "UCharFile: Error: unknown fEncoding\n");
   1365         exit(1);
   1366     }
   1367     return c;
   1368 }
   1369 
   1370 //----------------------------------------------------------------------------------------
   1371 //
   1372 //   openRulesCollator  - Command line specified a rules file.  Read it in
   1373 //                        and open a collator with it.
   1374 //
   1375 //----------------------------------------------------------------------------------------
   1376 UCollator *openRulesCollator() {
   1377     UCharFile f(opt_rules);
   1378     if (f.error()) {
   1379         return 0;
   1380     }
   1381 
   1382     int  bufLen = 10000;
   1383     UChar *buf = (UChar *)malloc(bufLen * sizeof(UChar));
   1384     UChar *tmp;
   1385     int i = 0;
   1386 
   1387     for(;;) {
   1388         buf[i] = f.get();
   1389         if (f.eof()) {
   1390             break;
   1391         }
   1392         if (f.error()) {
   1393             return 0;
   1394         }
   1395         i++;
   1396         if (i >= bufLen) {
   1397             tmp = buf;
   1398             bufLen += 10000;
   1399             buf = (UChar *)realloc(buf, bufLen);
   1400             if (buf == NULL) {
   1401                 free(tmp);
   1402                 return 0;
   1403             }
   1404         }
   1405     }
   1406     buf[i] = 0;
   1407 
   1408     UErrorCode    status = U_ZERO_ERROR;
   1409     UCollator *coll = ucol_openRules(buf, u_strlen(buf), UCOL_OFF,
   1410                                          UCOL_DEFAULT_STRENGTH, NULL, &status);
   1411     if (U_FAILURE(status)) {
   1412         fprintf(stderr, "ICU ucol_openRules() open failed.: %d\n", status);
   1413         return 0;
   1414     }
   1415     free(buf);
   1416     return coll;
   1417 }
   1418 
   1419 
   1420 
   1421 
   1422 
   1423 //----------------------------------------------------------------------------------------
   1424 //
   1425 //    Main   --  process command line, read in and pre-process the test file,
   1426 //                 call other functions to do the actual tests.
   1427 //
   1428 //----------------------------------------------------------------------------------------
   1429 int main(int argc, const char** argv) {
   1430     if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
   1431         printf(gUsageString);
   1432         exit (1);
   1433     }
   1434 
   1435     // Make sure that we've only got one API selected.
   1436     if (opt_unix || opt_win) opt_icu = FALSE;
   1437     if (opt_unix) opt_win = FALSE;
   1438 
   1439     //
   1440     //  Set up an ICU collator
   1441     //
   1442     UErrorCode          status = U_ZERO_ERROR;
   1443 
   1444     if (opt_rules != 0) {
   1445         gCol = openRulesCollator();
   1446         if (gCol == 0) {return -1;}
   1447     }
   1448     else {
   1449         gCol = ucol_open(opt_locale, &status);
   1450         if (U_FAILURE(status)) {
   1451             fprintf(stderr, "Collator creation failed.: %d\n", status);
   1452             return -1;
   1453         }
   1454     }
   1455     if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
   1456         fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
   1457     }
   1458     if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
   1459         fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
   1460     }
   1461 
   1462     if (opt_norm) {
   1463         ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
   1464     }
   1465     if (opt_french && opt_frenchoff) {
   1466         fprintf(stderr, "collperf:  Error, specified both -french and -frenchoff options.");
   1467         exit(-1);
   1468     }
   1469     if (opt_french) {
   1470         ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status);
   1471     }
   1472     if (opt_frenchoff) {
   1473         ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status);
   1474     }
   1475     if (opt_lower) {
   1476         ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status);
   1477     }
   1478     if (opt_upper) {
   1479         ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status);
   1480     }
   1481     if (opt_case) {
   1482         ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status);
   1483     }
   1484     if (opt_shifted) {
   1485         ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status);
   1486     }
   1487     if (opt_level != 0) {
   1488         switch (opt_level) {
   1489         case 1:
   1490             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status);
   1491             break;
   1492         case 2:
   1493             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status);
   1494             break;
   1495         case 3:
   1496             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status);
   1497             break;
   1498         case 4:
   1499             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status);
   1500             break;
   1501         case 5:
   1502             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status);
   1503             break;
   1504         default:
   1505             fprintf(stderr, "-level param must be between 1 and 5\n");
   1506             exit(-1);
   1507         }
   1508     }
   1509 
   1510     if (U_FAILURE(status)) {
   1511         fprintf(stderr, "Collator attribute setting failed.: %d\n", status);
   1512         return -1;
   1513     }
   1514 
   1515 
   1516     //
   1517     //  Set up a Windows LCID
   1518     //
   1519     if (opt_langid != 0) {
   1520         gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
   1521     }
   1522     else {
   1523         gWinLCID = uloc_getLCID(opt_locale);
   1524     }
   1525 
   1526 
   1527     //
   1528     //  Set the UNIX locale
   1529     //
   1530     if (opt_unix) {
   1531         if (setlocale(LC_ALL, opt_locale) == 0) {
   1532             fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
   1533             exit(-1);
   1534         }
   1535     }
   1536 
   1537     // Read in  the input file.
   1538     //   File assumed to be utf-16.
   1539     //   Lines go onto heap buffers.  Global index array to line starts is created.
   1540     //   Lines themselves are null terminated.
   1541     //
   1542 
   1543     UCharFile f(opt_fName);
   1544     if (f.error()) {
   1545         exit(-1);
   1546     }
   1547 
   1548     const int MAXLINES = 100000;
   1549     gFileLines = new Line[MAXLINES];
   1550     UChar buf[1024];
   1551     int   column = 0;
   1552 
   1553     //  Read the file, split into lines, and save in memory.
   1554     //  Loop runs once per utf-16 value from the input file,
   1555     //    (The number of bytes read from file per loop iteration depends on external encoding.)
   1556     for (;;) {
   1557 
   1558         UChar c = f.get();
   1559         if (f.error()){
   1560             exit(-1);
   1561         }
   1562 
   1563 
   1564         // We now have a good UTF-16 value in c.
   1565 
   1566         // Watch for CR, LF, EOF; these finish off a line.
   1567         if (c == 0xd) {
   1568             continue;
   1569         }
   1570 
   1571         if (f.eof() || c == 0x0a || c==0x2028) {  // Unipad inserts 2028 line separators!
   1572             buf[column++] = 0;
   1573             if (column > 1) {
   1574                 gFileLines[gNumFileLines].name  = new UChar[column];
   1575                 gFileLines[gNumFileLines].len   = column-1;
   1576                 memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UChar));
   1577                 gNumFileLines++;
   1578                 column = 0;
   1579                 if (gNumFileLines >= MAXLINES) {
   1580                     fprintf(stderr, "File too big.  Max number of lines is %d\n", MAXLINES);
   1581                     exit(-1);
   1582                 }
   1583 
   1584             }
   1585             if (c == 0xa || c == 0x2028)
   1586                 continue;
   1587             else
   1588                 break;  // EOF
   1589         }
   1590         buf[column++] = c;
   1591         if (column >= 1023)
   1592         {
   1593             static UBool warnFlag = TRUE;
   1594             if (warnFlag) {
   1595                 fprintf(stderr, "Warning - file line longer than 1023 chars truncated.\n");
   1596                 warnFlag = FALSE;
   1597             }
   1598             column--;
   1599         }
   1600     }
   1601 
   1602     if (opt_terse == FALSE) {
   1603         printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines);
   1604     }
   1605 
   1606 
   1607     // Convert the lines to the UNIX encoding.
   1608     if (opt_unix) {
   1609         UnixConvert();
   1610     }
   1611 
   1612     //
   1613     //  Pre-compute ICU sort keys for the lines of the file.
   1614     //
   1615     int line;
   1616     int32_t t;
   1617 
   1618     for (line=0; line<gNumFileLines; line++) {
   1619          t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)buf, sizeof(buf));
   1620          gFileLines[line].icuSortKey  = new char[t];
   1621 
   1622          if (t > (int32_t)sizeof(buf)) {
   1623              t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)gFileLines[line].icuSortKey , t);
   1624          }
   1625          else
   1626          {
   1627              memcpy(gFileLines[line].icuSortKey, buf, t);
   1628          }
   1629     }
   1630 
   1631 
   1632 
   1633     //
   1634     //  Pre-compute Windows sort keys for the lines of the file.
   1635     //
   1636     for (line=0; line<gNumFileLines; line++) {
   1637          t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf, sizeof(buf));
   1638          gFileLines[line].winSortKey  = new char[t];
   1639          if (t > (int32_t)sizeof(buf)) {
   1640              t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, (unsigned short *)(gFileLines[line].winSortKey), t);
   1641          }
   1642          else
   1643          {
   1644              memcpy(gFileLines[line].winSortKey, buf, t);
   1645          }
   1646     }
   1647 
   1648     //
   1649     //  Pre-compute UNIX sort keys for the lines of the file.
   1650     //
   1651     if (opt_unix) {
   1652         for (line=0; line<gNumFileLines; line++) {
   1653             t=strxfrm((char *)buf,  gFileLines[line].unixName,  sizeof(buf));
   1654             gFileLines[line].unixSortKey  = new char[t];
   1655             if (t > (int32_t)sizeof(buf)) {
   1656                 t = strxfrm(gFileLines[line].unixSortKey,  gFileLines[line].unixName,  sizeof(buf));
   1657             }
   1658             else
   1659             {
   1660                 memcpy(gFileLines[line].unixSortKey, buf, t);
   1661             }
   1662         }
   1663     }
   1664 
   1665 
   1666     //
   1667     //  Dump file lines, CEs, Sort Keys if requested.
   1668     //
   1669     if (opt_dump) {
   1670         int  i;
   1671         for (line=0; line<gNumFileLines; line++) {
   1672             for (i=0;;i++) {
   1673                 UChar  c = gFileLines[line].name[i];
   1674                 if (c == 0)
   1675                     break;
   1676                 if (c < 0x20 || c > 0x7e) {
   1677                     printf("\\u%.4x", c);
   1678                 }
   1679                 else {
   1680                     printf("%c", c);
   1681                 }
   1682             }
   1683             printf("\n");
   1684 
   1685             printf("   CEs: ");
   1686             UCollationElements *CEiter = ucol_openElements(gCol, gFileLines[line].name, -1, &status);
   1687             int32_t ce;
   1688             i = 0;
   1689             for (;;) {
   1690                 ce = ucol_next(CEiter, &status);
   1691                 if (ce == UCOL_NULLORDER) {
   1692                     break;
   1693                 }
   1694                 printf(" %.8x", ce);
   1695                 if (++i > 8) {
   1696                     printf("\n        ");
   1697                     i = 0;
   1698                 }
   1699             }
   1700             printf("\n");
   1701             ucol_closeElements(CEiter);
   1702 
   1703 
   1704             printf("   ICU Sort Key: ");
   1705             for (i=0; ; i++) {
   1706                 unsigned char c = gFileLines[line].icuSortKey[i];
   1707                 printf("%02x ", c);
   1708                 if (c == 0) {
   1709                     break;
   1710                 }
   1711                 if (i > 0 && i % 20 == 0) {
   1712                     printf("\n                 ");
   1713                 }
   1714            }
   1715             printf("\n");
   1716         }
   1717     }
   1718 
   1719 
   1720     //
   1721     //  Pre-sort the lines.
   1722     //
   1723     int i;
   1724     gSortedLines = new Line *[gNumFileLines];
   1725     for (i=0; i<gNumFileLines; i++) {
   1726         gSortedLines[i] = &gFileLines[i];
   1727     }
   1728 
   1729     if (opt_win) {
   1730         qsort(gSortedLines, gNumFileLines, sizeof(Line *), Winstrcmp);
   1731     }
   1732     else if (opt_unix) {
   1733         qsort(gSortedLines, gNumFileLines, sizeof(Line *), UNIXstrcmp);
   1734     }
   1735     else   /* ICU */
   1736     {
   1737         qsort(gSortedLines, gNumFileLines, sizeof(Line *), ICUstrcmp);
   1738     }
   1739 
   1740 
   1741     //
   1742     //  Make up a randomized order, will be used for sorting tests.
   1743     //
   1744     gRandomLines = new Line *[gNumFileLines];
   1745     for (i=0; i<gNumFileLines; i++) {
   1746         gRandomLines[i] = &gFileLines[i];
   1747     }
   1748     qsort(gRandomLines, gNumFileLines, sizeof(Line *), ICURandomCmp);
   1749 
   1750 
   1751 
   1752 
   1753     //
   1754     //  We've got the file read into memory.  Go do something with it.
   1755     //
   1756 
   1757     if (opt_qsort)     doQSort();
   1758     if (opt_binsearch) doBinarySearch();
   1759     if (opt_keygen)    doKeyGen();
   1760     if (opt_keyhist)   doKeyHist();
   1761     if (opt_itertest)  doIterTest();
   1762 
   1763     return 0;
   1764 
   1765 }
   1766