Home | History | Annotate | Download | only in collationperf
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (C) 2001-2010 IBM, Inc.   All Rights Reserved.
      4  *
      5  ********************************************************************/
      6 /********************************************************************************
      7 *
      8 * File CALLCOLL.C
      9 *
     10 * Modification History:
     11 *        Name                     Description
     12 *     Andy Heninger             First Version
     13 *
     14 *********************************************************************************
     15 */
     16 
     17 //
     18 //  This program tests string collation and sort key generation performance.
//      Three APIs can be tested: ICU C, Unix strcoll and strxfrm, and Windows LCMapString.
     20 //      A file of names is required as input, one per line.  It must be in utf-8 or utf-16 format,
     21 //      and include a byte order mark.  Either LE or BE format is OK.
     22 //
     23 
     24 const char gUsageString[] =
     25  "usage:  collperf options...\n"
     26     "-help                      Display this message.\n"
     27     "-file file_name            utf-16 format file of names.\n"
     28     "-locale name               ICU locale to use.  Default is en_US\n"
     29     "-rules file_name           Collation rules file (overrides locale)\n"
     30     "-langid 0x1234             Windows Language ID number.  Default to value for -locale option\n"
     31     "                              see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
     32     "-win                       Run test using Windows native services.  (ICU is default)\n"
     33     "-unix                      Run test using Unix strxfrm, strcoll services.\n"
     34     "-uselen                    Use API with string lengths.  Default is null-terminated strings\n"
     35     "-usekeys                   Run tests using sortkeys rather than strcoll\n"
     36     "-strcmp                    Run tests using u_strcmp rather than strcoll\n"
     37     "-strcmpCPO                 Run tests using u_strcmpCodePointOrder rather than strcoll\n"
     38     "-loop nnnn                 Loopcount for test.  Adjust for reasonable total running time.\n"
     39     "-iloop n                   Inner Loop Count.  Default = 1.  Number of calls to function\n"
     40     "                               under test at each call point.  For measuring test overhead.\n"
     41     "-terse                     Terse numbers-only output.  Intended for use by scripts.\n"
     42     "-french                    French accent ordering\n"
     43     "-frenchoff                 No French accent ordering (for use with French locales.)\n"
     44     "-norm                      Normalizing mode on\n"
     45     "-shifted                   Shifted mode\n"
     46     "-lower                     Lower case first\n"
     47     "-upper                     Upper case first\n"
     48     "-case                      Enable separate case level\n"
     49     "-level n                   Sort level, 1 to 5, for Primary, Secndary, Tertiary, Quaternary, Identical\n"
     50     "-keyhist                   Produce a table sort key size vs. string length\n"
     51     "-binsearch                 Binary Search timing test\n"
     52     "-keygen                    Sort Key Generation timing test\n"
     53     "-qsort                     Quicksort timing test\n"
     54     "-iter                      Iteration Performance Test\n"
     55     "-dump                      Display strings, sort keys and CEs.\n"
     56     ;
     57 
     58 
     59 
     60 #include <stdio.h>
     61 #include <string.h>
     62 #include <stdlib.h>
     63 #include <math.h>
     64 #include <locale.h>
     65 #include <errno.h>
     66 
     67 #include <unicode/utypes.h>
     68 #include <unicode/ucol.h>
     69 #include <unicode/ucoleitr.h>
     70 #include <unicode/uloc.h>
     71 #include <unicode/ustring.h>
     72 #include <unicode/ures.h>
     73 #include <unicode/uchar.h>
     74 #include <unicode/ucnv.h>
     75 #include <unicode/utf8.h>
     76 
     77 #ifdef WIN32
     78 #include <windows.h>
     79 #else
     80 //
     81 //  Stubs for Windows API functions when building on UNIXes.
     82 //
     83 typedef int DWORD;
     84 inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
     85 #include <sys/time.h>
     86 unsigned long timeGetTime() {
     87     struct timeval t;
     88     gettimeofday(&t, 0);
     89     unsigned long val = t.tv_sec * 1000;  // Let it overflow.  Who cares.
     90     val += t.tv_usec / 1000;
     91     return val;
     92 }
     93 inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
     94 const int LCMAP_SORTKEY = 0;
     95 #define MAKELCID(a,b) 0
     96 const int SORT_DEFAULT = 0;
     97 #endif
     98 
     99 
    100 
    101 //
    102 //  Command line option variables
    103 //     These global variables are set according to the options specified
    104 //     on the command line by the user.
    105 char * opt_fName      = 0;
    106 const char * opt_locale     = "en_US";
    107 int    opt_langid     = 0;         // Defaults to value corresponding to opt_locale.
    108 char * opt_rules      = 0;
    109 UBool  opt_help       = FALSE;
    110 int    opt_loopCount  = 1;
    111 int    opt_iLoopCount = 1;
    112 UBool  opt_terse      = FALSE;
    113 UBool  opt_qsort      = FALSE;
    114 UBool  opt_binsearch  = FALSE;
    115 UBool  opt_icu        = TRUE;
    116 UBool  opt_win        = FALSE;      // Run with Windows native functions.
    117 UBool  opt_unix       = FALSE;      // Run with UNIX strcoll, strxfrm functions.
    118 UBool  opt_uselen     = FALSE;
    119 UBool  opt_usekeys    = FALSE;
    120 UBool  opt_strcmp     = FALSE;
    121 UBool  opt_strcmpCPO  = FALSE;
    122 UBool  opt_norm       = FALSE;
    123 UBool  opt_keygen     = FALSE;
    124 UBool  opt_french     = FALSE;
    125 UBool  opt_frenchoff  = FALSE;
    126 UBool  opt_shifted    = FALSE;
    127 UBool  opt_lower      = FALSE;
    128 UBool  opt_upper      = FALSE;
    129 UBool  opt_case       = FALSE;
    130 int    opt_level      = 0;
    131 UBool  opt_keyhist    = FALSE;
    132 UBool  opt_itertest   = FALSE;
    133 UBool  opt_dump       = FALSE;
    134 
    135 
    136 
    137 //
    138 //   Definitions for the command line options
    139 //
    140 struct OptSpec {
    141     const char *name;
    142     enum {FLAG, NUM, STRING} type;
    143     void *pVar;
    144 };
    145 
    146 OptSpec opts[] = {
    147     {"-file",        OptSpec::STRING, &opt_fName},
    148     {"-locale",      OptSpec::STRING, &opt_locale},
    149     {"-langid",      OptSpec::NUM,    &opt_langid},
    150     {"-rules",       OptSpec::STRING, &opt_rules},
    151     {"-qsort",       OptSpec::FLAG,   &opt_qsort},
    152     {"-binsearch",   OptSpec::FLAG,   &opt_binsearch},
    153     {"-iter",        OptSpec::FLAG,   &opt_itertest},
    154     {"-win",         OptSpec::FLAG,   &opt_win},
    155     {"-unix",        OptSpec::FLAG,   &opt_unix},
    156     {"-uselen",      OptSpec::FLAG,   &opt_uselen},
    157     {"-usekeys",     OptSpec::FLAG,   &opt_usekeys},
    158     {"-strcmp",      OptSpec::FLAG,   &opt_strcmp},
    159     {"-strcmpCPO",   OptSpec::FLAG,   &opt_strcmpCPO},
    160     {"-norm",        OptSpec::FLAG,   &opt_norm},
    161     {"-french",      OptSpec::FLAG,   &opt_french},
    162     {"-frenchoff",   OptSpec::FLAG,   &opt_frenchoff},
    163     {"-shifted",     OptSpec::FLAG,   &opt_shifted},
    164     {"-lower",       OptSpec::FLAG,   &opt_lower},
    165     {"-upper",       OptSpec::FLAG,   &opt_upper},
    166     {"-case",        OptSpec::FLAG,   &opt_case},
    167     {"-level",       OptSpec::NUM,    &opt_level},
    168     {"-keyhist",     OptSpec::FLAG,   &opt_keyhist},
    169     {"-keygen",      OptSpec::FLAG,   &opt_keygen},
    170     {"-loop",        OptSpec::NUM,    &opt_loopCount},
    171     {"-iloop",       OptSpec::NUM,    &opt_iLoopCount},
    172     {"-terse",       OptSpec::FLAG,   &opt_terse},
    173     {"-dump",        OptSpec::FLAG,   &opt_dump},
    174     {"-help",        OptSpec::FLAG,   &opt_help},
    175     {"-?",           OptSpec::FLAG,   &opt_help},
    176     {0, OptSpec::FLAG, 0}
    177 };
    178 
    179 
    180 //---------------------------------------------------------------------------
    181 //
    182 //  Global variables pointing to and describing the test file
    183 //
    184 //---------------------------------------------------------------------------
    185 
    186 //
    187 //   struct Line
    188 //
    189 //      Each line from the source file (containing a name, presumably) gets
    190 //      one of these structs.
    191 //
    192 struct  Line {
    193     UChar     *name;
    194     int        len;
    195     char      *winSortKey;
    196     char      *icuSortKey;
    197     char      *unixSortKey;
    198     char      *unixName;
    199 };
    200 
    201 
    202 
    203 Line          *gFileLines;           // Ptr to array of Line structs, one per line in the file.
    204 int            gNumFileLines;
    205 UCollator     *gCol;
    206 DWORD          gWinLCID;
    207 
    208 Line          **gSortedLines;
    209 Line          **gRandomLines;
    210 int            gCount;
    211 
    212 
    213 
    214 //---------------------------------------------------------------------------
    215 //
    216 //  ProcessOptions()    Function to read the command line options.
    217 //
    218 //---------------------------------------------------------------------------
    219 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
    220 {
    221     int         i;
    222     int         argNum;
    223     const char  *pArgName;
    224     OptSpec    *pOpt;
    225 
    226     for (argNum=1; argNum<argc; argNum++) {
    227         pArgName = argv[argNum];
    228         for (pOpt = opts;  pOpt->name != 0; pOpt++) {
    229             if (strcmp(pOpt->name, pArgName) == 0) {
    230                 switch (pOpt->type) {
    231                 case OptSpec::FLAG:
    232                     *(UBool *)(pOpt->pVar) = TRUE;
    233                     break;
    234                 case OptSpec::STRING:
    235                     argNum ++;
    236                     if (argNum >= argc) {
    237                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
    238                         return FALSE;
    239                     }
    240                     *(const char **)(pOpt->pVar)  = argv[argNum];
    241                     break;
    242                 case OptSpec::NUM:
    243                     argNum ++;
    244                     if (argNum >= argc) {
    245                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
    246                         return FALSE;
    247                     }
    248                     char *endp;
    249                     i = strtol(argv[argNum], &endp, 0);
    250                     if (endp == argv[argNum]) {
    251                         fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
    252                         return FALSE;
    253                     }
    254                     *(int *)(pOpt->pVar) = i;
    255                 }
    256                 break;
    257             }
    258         }
    259         if (pOpt->name == 0)
    260         {
    261             fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
    262             return FALSE;
    263         }
    264     }
    265 return TRUE;
    266 }
    267 
    268 //---------------------------------------------------------------------------------------
    269 //
    270 //   Comparison functions for use by qsort.
    271 //
//       Seven flavors: ICU or Windows, each comparing sort keys, strings with length,
//           or null-terminated strings; plus Unix strcoll on null-terminated strings.
    274 //
    275 //---------------------------------------------------------------------------------------
    276 int ICUstrcmpK(const void *a, const void *b) {
    277     gCount++;
    278     int t = strcmp((*(Line **)a)->icuSortKey, (*(Line **)b)->icuSortKey);
    279     return t;
    280 }
    281 
    282 
    283 int ICUstrcmpL(const void *a, const void *b) {
    284     gCount++;
    285     UCollationResult t;
    286     t = ucol_strcoll(gCol, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
    287     if (t == UCOL_LESS) return -1;
    288     if (t == UCOL_GREATER) return +1;
    289     return 0;
    290 }
    291 
    292 
    293 int ICUstrcmp(const void *a, const void *b) {
    294     gCount++;
    295     UCollationResult t;
    296     t = ucol_strcoll(gCol, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
    297     if (t == UCOL_LESS) return -1;
    298     if (t == UCOL_GREATER) return +1;
    299     return 0;
    300 }
    301 
    302 
    303 int Winstrcmp(const void *a, const void *b) {
    304     gCount++;
    305     int t;
    306     t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
    307     return t-2;
    308 }
    309 
    310 
    311 int UNIXstrcmp(const void *a, const void *b) {
    312     gCount++;
    313     int t;
    314     t = strcoll((*(Line **)a)->unixName, (*(Line **)b)->unixName);
    315     return t;
    316 }
    317 
    318 
    319 int WinstrcmpL(const void *a, const void *b) {
    320     gCount++;
    321     int t;
    322     t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
    323     return t-2;
    324 }
    325 
    326 
    327 int WinstrcmpK(const void *a, const void *b) {
    328     gCount++;
    329     int t = strcmp((*(Line **)a)->winSortKey, (*(Line **)b)->winSortKey);
    330     return t;
    331 }
    332 
    333 
    334 //---------------------------------------------------------------------------------------
    335 //
    336 //   Function for sorting the names (lines) into a random order.
    337 //      Order is based on a hash of the  ICU Sort key for the lines
    338 //      The randomized order is used as input for the sorting timing tests.
    339 //
    340 //---------------------------------------------------------------------------------------
    341 int ICURandomCmp(const void *a, const void *b) {
    342     char  *ask = (*(Line **)a)->icuSortKey;
    343     char  *bsk = (*(Line **)b)->icuSortKey;
    344     int   aVal = 0;
    345     int   bVal = 0;
    346     int   retVal;
    347     while (*ask != 0) {
    348         aVal += aVal*37 + *ask++;
    349     }
    350     while (*bsk != 0) {
    351         bVal += bVal*37 + *bsk++;
    352     }
    353     retVal = -1;
    354     if (aVal == bVal) {
    355         retVal = 0;
    356     }
    357     else if (aVal > bVal) {
    358         retVal = 1;
    359     }
    360     return retVal;
    361 }
    362 
    363 //---------------------------------------------------------------------------------------
    364 //
    365 //   doKeyGen()     Key Generation Timing Test
    366 //
    367 //---------------------------------------------------------------------------------------
    368 void doKeyGen()
    369 {
    370     int  line;
    371     int  loops = 0;
    372     int  iLoop;
    373     int  t;
    374     int  len=-1;
    375 
    376     // Adjust loop count to compensate for file size.   Should be order n
    377     double dLoopCount = double(opt_loopCount) * (1000. /  double(gNumFileLines));
    378     int adj_loopCount = int(dLoopCount);
    379     if (adj_loopCount < 1) adj_loopCount = 1;
    380 
    381 
    382     unsigned long startTime = timeGetTime();
    383 
    384     if (opt_win) {
    385         for (loops=0; loops<adj_loopCount; loops++) {
    386             for (line=0; line < gNumFileLines; line++) {
    387                 if (opt_uselen) {
    388                     len = gFileLines[line].len;
    389                 }
    390                 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
    391                     t=LCMapStringW(gWinLCID, LCMAP_SORTKEY,
    392                         gFileLines[line].name, len,
    393                         (unsigned short *)gFileLines[line].winSortKey, 5000);    // TODO  something with length.
    394                 }
    395             }
    396         }
    397     }
    398     else if (opt_icu)
    399     {
    400         for (loops=0; loops<adj_loopCount; loops++) {
    401             for (line=0; line < gNumFileLines; line++) {
    402                 if (opt_uselen) {
    403                     len = gFileLines[line].len;
    404                 }
    405                 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
    406                     t = ucol_getSortKey(gCol, gFileLines[line].name, len, (unsigned char *)gFileLines[line].icuSortKey, 5000);
    407                 }
    408             }
    409         }
    410     }
    411     else if (opt_unix)
    412     {
    413         for (loops=0; loops<adj_loopCount; loops++) {
    414             for (line=0; line < gNumFileLines; line++) {
    415                 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
    416                 t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, 5000);
    417                 }
    418             }
    419         }
    420     }
    421 
    422     unsigned long elapsedTime = timeGetTime() - startTime;
    423     int ns = (int)(float(1000000) * (float)elapsedTime / (float)(adj_loopCount*gNumFileLines));
    424 
    425     if (opt_terse == FALSE) {
    426         printf("Sort Key Generation:  total # of keys = %d\n", loops*gNumFileLines);
    427         printf("Sort Key Generation:  time per key = %d ns\n", ns);
    428     }
    429     else {
    430         printf("%d,  ", ns);
    431     }
    432 
    433     int   totalKeyLen = 0;
    434     int   totalChars  = 0;
    435     for (line=0; line<gNumFileLines; line++) {
    436         totalChars += u_strlen(gFileLines[line].name);
    437         if (opt_win) {
    438             totalKeyLen += strlen(gFileLines[line].winSortKey);
    439         }
    440         else if (opt_icu) {
    441             totalKeyLen += strlen(gFileLines[line].icuSortKey);
    442         }
    443         else if (opt_unix) {
    444             totalKeyLen += strlen(gFileLines[line].unixSortKey);
    445         }
    446 
    447     }
    448     if (opt_terse == FALSE) {
    449         printf("Key Length / character = %f\n", (float)totalKeyLen / (float)totalChars);
    450     } else {
    451         printf("%f, ", (float)totalKeyLen / (float)totalChars);
    452     }
    453 }
    454 
    455 
    456 
    457 //---------------------------------------------------------------------------------------
    458 //
    459 //    doBinarySearch()    Binary Search timing test.  Each name from the list
    460 //                        is looked up in the full sorted list of names.
    461 //
    462 //---------------------------------------------------------------------------------------
    463 void doBinarySearch()
    464 {
    465 
    466     gCount = 0;
    467     int  line;
    468     int  loops = 0;
    469     int  iLoop = 0;
    470     unsigned long elapsedTime = 0;
    471 
    472     // Adjust loop count to compensate for file size.   Should be order n (lookups) * log n  (compares/lookup)
    473     // Accurate timings do not depend on this being perfect.  The correction is just to try to
    474     //   get total running times of about the right order, so the that user doesn't need to
    475     //   manually adjust the loop count for every different file size.
    476     double dLoopCount = double(opt_loopCount) * 3000. / (log10(gNumFileLines) * double(gNumFileLines));
    477     if (opt_usekeys) dLoopCount *= 5;
    478     int adj_loopCount = int(dLoopCount);
    479     if (adj_loopCount < 1) adj_loopCount = 1;
    480 
    481 
    482     for (;;) {  // not really a loop, just allows "break" to work, to simplify
    483                 //   inadvertantly running more than one test through here.
    484         if (opt_strcmp || opt_strcmpCPO)
    485         {
    486             unsigned long startTime = timeGetTime();
    487             typedef int32_t (U_EXPORT2 *PF)(const UChar *, const UChar *);
    488             PF pf = u_strcmp;
    489             if (opt_strcmpCPO) {pf = u_strcmpCodePointOrder;}
    490             //if (opt_strcmp && opt_win) {pf = (PF)wcscmp;}   // Damn the difference between int32_t and int
    491                                                             //   which forces the use of a cast here.
    492 
    493             int r = 0;
    494             for (loops=0; loops<adj_loopCount; loops++) {
    495 
    496                 for (line=0; line < gNumFileLines; line++) {
    497                     int hi      = gNumFileLines-1;
    498                     int lo      = 0;
    499                     int  guess = -1;
    500                     for (;;) {
    501                         int newGuess = (hi + lo) / 2;
    502                         if (newGuess == guess)
    503                             break;
    504                         guess = newGuess;
    505                         for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
    506                             r = (*pf)((gSortedLines[line])->name, (gSortedLines[guess])->name);
    507                         }
    508                         gCount++;
    509                         if (r== 0)
    510                             break;
    511                         if (r < 0)
    512                             hi = guess;
    513                         else
    514                             lo   = guess;
    515                     }
    516                 }
    517             }
    518             elapsedTime = timeGetTime() - startTime;
    519             break;
    520         }
    521 
    522 
    523         if (opt_icu)
    524         {
    525             unsigned long startTime = timeGetTime();
    526             UCollationResult  r = UCOL_EQUAL;
    527             for (loops=0; loops<adj_loopCount; loops++) {
    528 
    529                 for (line=0; line < gNumFileLines; line++) {
    530                     int lineLen  = -1;
    531                     int guessLen = -1;
    532                     if (opt_uselen) {
    533                         lineLen = (gSortedLines[line])->len;
    534                     }
    535                     int hi      = gNumFileLines-1;
    536                     int lo      = 0;
    537                     int  guess = -1;
    538                     for (;;) {
    539                         int newGuess = (hi + lo) / 2;
    540                         if (newGuess == guess)
    541                             break;
    542                         guess = newGuess;
    543                         int ri = 0;
    544                         if (opt_usekeys) {
    545                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
    546                                 ri = strcmp((gSortedLines[line])->icuSortKey, (gSortedLines[guess])->icuSortKey);
    547                             }
    548                             gCount++;
    549                             r=UCOL_GREATER; if(ri<0) {r=UCOL_LESS;} else if (ri==0) {r=UCOL_EQUAL;}
    550                         }
    551                         else
    552                         {
    553                             if (opt_uselen) {
    554                                 guessLen = (gSortedLines[guess])->len;
    555                             }
    556                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
    557                                 r = ucol_strcoll(gCol, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
    558                             }
    559                             gCount++;
    560                         }
    561                         if (r== UCOL_EQUAL)
    562                             break;
    563                         if (r == UCOL_LESS)
    564                             hi = guess;
    565                         else
    566                             lo   = guess;
    567                     }
    568                 }
    569             }
    570             elapsedTime = timeGetTime() - startTime;
    571             break;
    572         }
    573 
    574         if (opt_win)
    575         {
    576             unsigned long startTime = timeGetTime();
    577             int r = 0;
    578             for (loops=0; loops<adj_loopCount; loops++) {
    579 
    580                 for (line=0; line < gNumFileLines; line++) {
    581                     int lineLen  = -1;
    582                     int guessLen = -1;
    583                     if (opt_uselen) {
    584                         lineLen = (gSortedLines[line])->len;
    585                     }
    586                     int hi   = gNumFileLines-1;
    587                     int lo   = 0;
    588                     int  guess = -1;
    589                     for (;;) {
    590                         int newGuess = (hi + lo) / 2;
    591                         if (newGuess == guess)
    592                             break;
    593                         guess = newGuess;
    594                         if (opt_usekeys) {
    595                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
    596                                 r = strcmp((gSortedLines[line])->winSortKey, (gSortedLines[guess])->winSortKey);
    597                             }
    598                             gCount++;
    599                             r+=2;
    600                         }
    601                         else
    602                         {
    603                             if (opt_uselen) {
    604                                 guessLen = (gSortedLines[guess])->len;
    605                             }
    606                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
    607                                 r = CompareStringW(gWinLCID, 0, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
    608                             }
    609                             if (r == 0) {
    610                                 if (opt_terse == FALSE) {
    611                                     fprintf(stderr, "Error returned from Windows CompareStringW.\n");
    612                                 }
    613                                 exit(-1);
    614                             }
    615                             gCount++;
    616                         }
    617                         if (r== 2)   //  strings ==
    618                             break;
    619                         if (r == 1)  //  line < guess
    620                             hi = guess;
    621                         else         //  line > guess
    622                             lo   = guess;
    623                     }
    624                 }
    625             }
    626             elapsedTime = timeGetTime() - startTime;
    627             break;
    628         }
    629 
    630         if (opt_unix)
    631         {
    632             unsigned long startTime = timeGetTime();
    633             int r = 0;
    634             for (loops=0; loops<adj_loopCount; loops++) {
    635 
    636                 for (line=0; line < gNumFileLines; line++) {
    637                     int hi   = gNumFileLines-1;
    638                     int lo   = 0;
    639                     int  guess = -1;
    640                     for (;;) {
    641                         int newGuess = (hi + lo) / 2;
    642                         if (newGuess == guess)
    643                             break;
    644                         guess = newGuess;
    645                         if (opt_usekeys) {
    646                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
    647                                  r = strcmp((gSortedLines[line])->unixSortKey, (gSortedLines[guess])->unixSortKey);
    648                             }
    649                             gCount++;
    650                         }
    651                         else
    652                         {
    653                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
    654                                 r = strcoll((gSortedLines[line])->unixName, (gSortedLines[guess])->unixName);
    655                             }
    656                             errno = 0;
    657                             if (errno != 0) {
    658                                 fprintf(stderr, "Error %d returned from strcoll.\n", errno);
    659                                 exit(-1);
    660                             }
    661                             gCount++;
    662                         }
    663                         if (r == 0)   //  strings ==
    664                             break;
    665                         if (r < 0)  //  line < guess
    666                             hi = guess;
    667                         else         //  line > guess
    668                             lo   = guess;
    669                     }
    670                 }
    671             }
    672             elapsedTime = timeGetTime() - startTime;
    673             break;
    674         }
    675         break;
    676     }
    677 
    678     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
    679     if (opt_terse == FALSE) {
    680         printf("binary search:  total # of string compares = %d\n", gCount);
    681         printf("binary search:  compares per loop = %d\n", gCount / loops);
    682         printf("binary search:  time per compare = %d ns\n", ns);
    683     } else {
    684         printf("%d, ", ns);
    685     }
    686 
    687 }
    688 
    689 
    690 
    691 
    692 //---------------------------------------------------------------------------------------
    693 //
    694 //   doQSort()    The quick sort timing test.  Uses the C library qsort function.
    695 //
    696 //---------------------------------------------------------------------------------------
    697 void doQSort() {
    698     int i;
    699     Line **sortBuf = new Line *[gNumFileLines];
    700 
    701     // Adjust loop count to compensate for file size.   QSort should be n log(n)
    702     double dLoopCount = double(opt_loopCount) * 3000. / (log10(gNumFileLines) * double(gNumFileLines));
    703     if (opt_usekeys) dLoopCount *= 5;
    704     int adj_loopCount = int(dLoopCount);
    705     if (adj_loopCount < 1) adj_loopCount = 1;
    706 
    707 
    708     gCount = 0;
    709     unsigned long startTime = timeGetTime();
    710     if (opt_win && opt_usekeys) {
    711         for (i=0; i<opt_loopCount; i++) {
    712             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
    713             qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpK);
    714         }
    715     }
    716 
    717     else if (opt_win && opt_uselen) {
    718         for (i=0; i<adj_loopCount; i++) {
    719             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
    720             qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpL);
    721         }
    722     }
    723 
    724 
    725     else if (opt_win && !opt_uselen) {
    726         for (i=0; i<adj_loopCount; i++) {
    727             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
    728             qsort(sortBuf, gNumFileLines, sizeof(Line *), Winstrcmp);
    729         }
    730     }
    731 
    732     else if (opt_icu && opt_usekeys) {
    733         for (i=0; i<adj_loopCount; i++) {
    734             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
    735             qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpK);
    736         }
    737     }
    738 
    739     else if (opt_icu && opt_uselen) {
    740         for (i=0; i<adj_loopCount; i++) {
    741             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
    742             qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpL);
    743         }
    744     }
    745 
    746 
    747     else if (opt_icu && !opt_uselen) {
    748         for (i=0; i<adj_loopCount; i++) {
    749             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
    750             qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmp);
    751         }
    752     }
    753 
    754     else if (opt_unix && !opt_usekeys) {
    755         for (i=0; i<adj_loopCount; i++) {
    756             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
    757             qsort(sortBuf, gNumFileLines, sizeof(Line *), UNIXstrcmp);
    758         }
    759     }
    760 
    761     unsigned long elapsedTime = timeGetTime() - startTime;
    762     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
    763     if (opt_terse == FALSE) {
    764         printf("qsort:  total # of string compares = %d\n", gCount);
    765         printf("qsort:  time per compare = %d ns\n", ns);
    766     } else {
    767         printf("%d, ", ns);
    768     }
    769 }
    770 
    771 
    772 
    773 //---------------------------------------------------------------------------------------
    774 //
    775 //    doKeyHist()       Output a table of data for
    776 //                        average sort key size vs. string length.
    777 //
    778 //---------------------------------------------------------------------------------------
    779 void doKeyHist() {
    780     int     i;
    781     int     maxLen = 0;
    782 
    783     // Find the maximum string length
    784     for (i=0; i<gNumFileLines; i++) {
    785         if (gFileLines[i].len > maxLen) maxLen = gFileLines[i].len;
    786     }
    787 
    788     // Allocate arrays to hold the histogram data
    789     int *accumulatedLen  = new int[maxLen+1];
    790     int *numKeysOfSize   = new int[maxLen+1];
    791     for (i=0; i<=maxLen; i++) {
    792         accumulatedLen[i] = 0;
    793         numKeysOfSize[i] = 0;
    794     }
    795 
    796     // Fill the arrays...
    797     for (i=0; i<gNumFileLines; i++) {
    798         int len = gFileLines[i].len;
    799         accumulatedLen[len] += strlen(gFileLines[i].icuSortKey);
    800         numKeysOfSize[len] += 1;
    801     }
    802 
    803     // And write out averages
    804     printf("String Length,  Avg Key Length,  Avg Key Len per char\n");
    805     for (i=1; i<=maxLen; i++) {
    806         if (numKeysOfSize[i] > 0) {
    807             printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysOfSize[i],
    808                 (float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i));
    809         }
    810     }
    811     delete []accumulatedLen;
    812     delete []numKeysOfSize ;
    813 }
    814 
    815 //---------------------------------------------------------------------------------------
    816 //
    817 //    doForwardIterTest(UBool)       Forward iteration test
    818 //                                   argument haslen: TRUE passes explicit string lengths,
    818 //                                   FALSE relies on null-terminated strings
    819 //
    820 //---------------------------------------------------------------------------------------
    821 void doForwardIterTest(UBool haslen) {
    822     int count = 0;
    823 
    824     UErrorCode error = U_ZERO_ERROR;
    825     printf("\n\nPerforming forward iteration performance test with ");
    826 
    827     if (haslen) {
    828         printf("non-null terminated data -----------\n");
    829     }
    830     else {
    831         printf("null terminated data -----------\n");
    832     }
    833     printf("performance test on strings from file -----------\n");
    834 
    835     UChar dummytext[] = {0, 0};
    836     UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
    837     ucol_setText(iter, dummytext, 1, &error);
    838 
    839     gCount = 0;
    840     unsigned long startTime = timeGetTime();
    841     while (count < opt_loopCount) {
    842         int linecount = 0;
    843         while (linecount < gNumFileLines) {
    844             UChar *str = gFileLines[linecount].name;
    845             int strlen = haslen?gFileLines[linecount].len:-1;
    846             ucol_setText(iter, str, strlen, &error);
    847             while (ucol_next(iter, &error) != UCOL_NULLORDER) {
    848                 gCount++;
    849             }
    850 
    851             linecount ++;
    852         }
    853         count ++;
    854     }
    855     unsigned long elapsedTime = timeGetTime() - startTime;
    856     printf("elapsedTime %ld\n", elapsedTime);
    857 
    858     // empty loop recalculation
    859     count = 0;
    860     startTime = timeGetTime();
    861     while (count < opt_loopCount) {
    862         int linecount = 0;
    863         while (linecount < gNumFileLines) {
    864             UChar *str = gFileLines[linecount].name;
    865             int strlen = haslen?gFileLines[linecount].len:-1;
    866             ucol_setText(iter, str, strlen, &error);
    867             linecount ++;
    868         }
    869         count ++;
    870     }
    871     elapsedTime -= (timeGetTime() - startTime);
    872     printf("elapsedTime %ld\n", elapsedTime);
    873 
    874     ucol_closeElements(iter);
    875 
    876     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
    877     printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
    878                                                                 opt_loopCount);
    879     printf("Average time per ucol_next() nano seconds %d\n", ns);
    880 
    881     printf("performance test on skipped-5 concatenated strings from file -----------\n");
    882 
    883     UChar *str;
    884     int    strlen = 0;
    885     // appending all the strings
    886     int linecount = 0;
    887     while (linecount < gNumFileLines) {
    888         strlen += haslen?gFileLines[linecount].len:
    889                                       u_strlen(gFileLines[linecount].name);
    890         linecount ++;
    891     }
    892     str = (UChar *)malloc(sizeof(UChar) * strlen);
    893     int strindex = 0;
    894     linecount = 0;
    895     while (strindex < strlen) {
    896         int len = 0;
    897         len += haslen?gFileLines[linecount].len:
    898                                       u_strlen(gFileLines[linecount].name);
    899         memcpy(str + strindex, gFileLines[linecount].name,
    900                sizeof(UChar) * len);
    901         strindex += len;
    902         linecount ++;
    903     }
    904 
    905     printf("Total size of strings %d\n", strlen);
    906 
    907     gCount = 0;
    908     count  = 0;
    909 
    910     if (!haslen) {
    911         strlen = -1;
    912     }
    913     iter = ucol_openElements(gCol, str, strlen, &error);
    914     if (!haslen) {
    915         strlen = u_strlen(str);
    916     }
    917     strlen -= 5; // any left over characters are not iterated,
    918                  // this is to ensure the backwards and forwards iterators
    919                  // gets the same position
    920     startTime = timeGetTime();
    921     while (count < opt_loopCount) {
    922         int count5 = 5;
    923         strindex = 0;
    924         ucol_setOffset(iter, strindex, &error);
    925         while (TRUE) {
    926             if (ucol_next(iter, &error) == UCOL_NULLORDER) {
    927                 break;
    928             }
    929             gCount++;
    930             count5 --;
    931             if (count5 == 0) {
    932                 strindex += 10;
    933                 if (strindex > strlen) {
    934                     break;
    935                 }
    936                 ucol_setOffset(iter, strindex, &error);
    937                 count5 = 5;
    938             }
    939         }
    940         count ++;
    941     }
    942 
    943     elapsedTime = timeGetTime() - startTime;
    944     printf("elapsedTime %ld\n", elapsedTime);
    945 
    946     // empty loop recalculation
    947     int tempgCount = 0;
    948     count = 0;
    949     startTime = timeGetTime();
    950     while (count < opt_loopCount) {
    951         int count5 = 5;
    952         strindex = 0;
    953         ucol_setOffset(iter, strindex, &error);
    954         while (TRUE) {
    955             tempgCount ++;
    956             count5 --;
    957             if (count5 == 0) {
    958                 strindex += 10;
    959                 if (strindex > strlen) {
    960                     break;
    961                 }
    962                 ucol_setOffset(iter, strindex, &error);
    963                 count5 = 5;
    964             }
    965         }
    966         count ++;
    967     }
    968     elapsedTime -= (timeGetTime() - startTime);
    969     printf("elapsedTime %ld\n", elapsedTime);
    970 
    971     ucol_closeElements(iter);
    972 
    973     printf("gCount %d\n", gCount);
    974     ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
    975     printf("Average time per ucol_next() nano seconds %d\n", ns);
    976 }
    977 
    978 //---------------------------------------------------------------------------------------
    979 //
    980 //    doBackwardIterTest(UBool)      Backwards iteration test
    981 //                                   argument haslen: TRUE passes explicit string lengths,
    981 //                                   FALSE relies on null-terminated strings
    982 //
    983 //---------------------------------------------------------------------------------------
    984 void doBackwardIterTest(UBool haslen) {
    985     int count = 0;
    986     UErrorCode error = U_ZERO_ERROR;
    987     printf("\n\nPerforming backward iteration performance test with ");
    988 
    989     if (haslen) {
    990         printf("non-null terminated data -----------\n");
    991     }
    992     else {
    993         printf("null terminated data -----------\n");
    994     }
    995 
    996     printf("performance test on strings from file -----------\n");
    997 
    998     UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
    999     UChar dummytext[] = {0, 0};
   1000     ucol_setText(iter, dummytext, 1, &error);
   1001 
   1002     gCount = 0;
   1003     unsigned long startTime = timeGetTime();
   1004     while (count < opt_loopCount) {
   1005         int linecount = 0;
   1006         while (linecount < gNumFileLines) {
   1007             UChar *str = gFileLines[linecount].name;
   1008             int strlen = haslen?gFileLines[linecount].len:-1;
   1009             ucol_setText(iter, str, strlen, &error);
   1010             while (ucol_previous(iter, &error) != UCOL_NULLORDER) {
   1011                 gCount ++;
   1012             }
   1013 
   1014             linecount ++;
   1015         }
   1016         count ++;
   1017     }
   1018     unsigned long elapsedTime = timeGetTime() - startTime;
   1019 
   1020     printf("elapsedTime %ld\n", elapsedTime);
   1021 
   1022     // empty loop recalculation
   1023     count = 0;
   1024     startTime = timeGetTime();
   1025     while (count < opt_loopCount) {
   1026         int linecount = 0;
   1027         while (linecount < gNumFileLines) {
   1028             UChar *str = gFileLines[linecount].name;
   1029             int strlen = haslen?gFileLines[linecount].len:-1;
   1030             ucol_setText(iter, str, strlen, &error);
   1031             linecount ++;
   1032         }
   1033         count ++;
   1034     }
   1035     elapsedTime -= (timeGetTime() - startTime);
   1036 
   1037     printf("elapsedTime %ld\n", elapsedTime);
   1038     ucol_closeElements(iter);
   1039 
   1040     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
   1041     printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
   1042                                                                 opt_loopCount);
   1043     printf("Average time per ucol_previous() nano seconds %d\n", ns);
   1044 
   1045     printf("performance test on skipped-5 concatenated strings from file -----------\n");
   1046 
   1047     UChar *str;
   1048     int    strlen = 0;
   1049     // appending all the strings
   1050     int linecount = 0;
   1051     while (linecount < gNumFileLines) {
   1052         strlen += haslen?gFileLines[linecount].len:
   1053                                       u_strlen(gFileLines[linecount].name);
   1054         linecount ++;
   1055     }
   1056     str = (UChar *)malloc(sizeof(UChar) * strlen);
   1057     int strindex = 0;
   1058     linecount = 0;
   1059     while (strindex < strlen) {
   1060         int len = 0;
   1061         len += haslen?gFileLines[linecount].len:
   1062                                       u_strlen(gFileLines[linecount].name);
   1063         memcpy(str + strindex, gFileLines[linecount].name,
   1064                sizeof(UChar) * len);
   1065         strindex += len;
   1066         linecount ++;
   1067     }
   1068 
   1069     printf("Total size of strings %d\n", strlen);
   1070 
   1071     gCount = 0;
   1072     count  = 0;
   1073 
   1074     if (!haslen) {
   1075         strlen = -1;
   1076     }
   1077 
   1078     iter = ucol_openElements(gCol, str, strlen, &error);
   1079     if (!haslen) {
   1080         strlen = u_strlen(str);
   1081     }
   1082 
   1083     startTime = timeGetTime();
   1084     while (count < opt_loopCount) {
   1085         int count5 = 5;
   1086         strindex = 5;
   1087         ucol_setOffset(iter, strindex, &error);
   1088         while (TRUE) {
   1089             if (ucol_previous(iter, &error) == UCOL_NULLORDER) {
   1090                 break;
   1091             }
   1092              gCount ++;
   1093              count5 --;
   1094              if (count5 == 0) {
   1095                  strindex += 10;
   1096                  if (strindex > strlen) {
   1097                     break;
   1098                  }
   1099                  ucol_setOffset(iter, strindex, &error);
   1100                  count5 = 5;
   1101              }
   1102         }
   1103         count ++;
   1104     }
   1105 
   1106     elapsedTime = timeGetTime() - startTime;
   1107     printf("elapsedTime %ld\n", elapsedTime);
   1108 
   1109     // empty loop recalculation
   1110     count = 0;
   1111     int tempgCount = 0;
   1112     startTime = timeGetTime();
   1113     while (count < opt_loopCount) {
   1114         int count5 = 5;
   1115         strindex = 5;
   1116         ucol_setOffset(iter, strindex, &error);
   1117         while (TRUE) {
   1118              tempgCount ++;
   1119              count5 --;
   1120              if (count5 == 0) {
   1121                  strindex += 10;
   1122                  if (strindex > strlen) {
   1123                     break;
   1124                  }
   1125                  ucol_setOffset(iter, strindex, &error);
   1126                  count5 = 5;
   1127              }
   1128         }
   1129         count ++;
   1130     }
   1131     elapsedTime -= (timeGetTime() - startTime);
   1132     printf("elapsedTime %ld\n", elapsedTime);
   1133     ucol_closeElements(iter);
   1134 
   1135     printf("gCount %d\n", gCount);
   1136     ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
   1137     printf("Average time per ucol_previous() nano seconds %d\n", ns);
   1138 }
   1139 
   1140 //---------------------------------------------------------------------------------------
   1141 //
   1142 //    doIterTest()       Iteration test
   1143 //
   1144 //---------------------------------------------------------------------------------------
   1145 void doIterTest() {
   1146     doForwardIterTest(opt_uselen);
   1147     doBackwardIterTest(opt_uselen);
   1148 }
   1149 
   1150 
   1151 //----------------------------------------------------------------------------------------
   1152 //
   1153 //   UnixConvert   -- Convert the lines of the file to the encoding for UNIX
   1154 //                    Since it appears that Unicode support is going in the general
   1155 //                    direction of the use of UTF-8 locales, that is the approach
   1156 //                    that is used here.
   1157 //
   1158 //----------------------------------------------------------------------------------------
   1159 void  UnixConvert() {
   1160     int    line;
   1161 
   1162     UConverter   *cvrtr;    // An ICU code page converter.
   1163     UErrorCode    status = U_ZERO_ERROR;
   1164 
   1165 
   1166     cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
   1167     if (U_FAILURE(status)) {
   1168         fprintf(stderr, "ICU Converter open failed.: %s\n", u_errorName(status));
   1169         exit(-1);
   1170     }
   1171 
   1172     for (line=0; line < gNumFileLines; line++) {
   1173         int sizeNeeded = ucnv_fromUChars(cvrtr,
   1174                                          0,            // ptr to target buffer.
   1175                                          0,            // length of target buffer.
   1176                                          gFileLines[line].name,
   1177                                          -1,           //  source is null terminated
   1178                                          &status);
   1179         if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
   1180             //fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
   1181             //exit(-1);
   1182         }
   1183         status = U_ZERO_ERROR;
   1184         gFileLines[line].unixName = new char[sizeNeeded+1];
   1185         sizeNeeded = ucnv_fromUChars(cvrtr,
   1186                                          gFileLines[line].unixName, // ptr to target buffer.
   1187                                          sizeNeeded+1, // length of target buffer.
   1188                                          gFileLines[line].name,
   1189                                          -1,           //  source is null terminated
   1190                                          &status);
   1191         if (U_FAILURE(status)) {
   1192             fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
   1193             exit(-1);
   1194         }
   1195         gFileLines[line].unixName[sizeNeeded] = 0;
   1196     };
   1197     ucnv_close(cvrtr);
   1198 }
   1199 
   1200 
   1201 //----------------------------------------------------------------------------------------
   1202 //
   1203 //  class UCharFile   Class to hide all the gorp to read a file in
   1204 //                    and produce a stream of UChars.
   1205 //
   1206 //----------------------------------------------------------------------------------------
   1207 class UCharFile {
   1208 public:
   1209     UCharFile(const char *fileName);
   1210     ~UCharFile();
   1211     UChar   get();
   1212     UBool   eof() {return fEof;};
   1213     UBool   error() {return fError;};
   1214 
   1215 private:
   1216     UCharFile (const UCharFile & /*other*/) {};                         // No copy constructor.
   1217     UCharFile & operator = (const UCharFile &/*other*/) {return *this;};   // No assignment op
   1218 
   1219     FILE         *fFile;
   1220     const char   *fName;
   1221     UBool        fEof;
   1222     UBool        fError;
   1223     UChar        fPending2ndSurrogate;
   1224 
   1225     enum {UTF16LE, UTF16BE, UTF8} fEncoding;
   1226 };
   1227 
   1228 UCharFile::UCharFile(const char * fileName) {
   1229     fEof                 = FALSE;
   1230     fError               = FALSE;
   1231     fName                = fileName;
   1232     fFile                = fopen(fName, "rb");
   1233     fPending2ndSurrogate = 0;
   1234     if (fFile == NULL) {
   1235         fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
   1236         fError = TRUE;
   1237         return;
   1238     }
   1239     //
   1240     //  Look for the byte order mark at the start of the file.
   1241     //
   1242     int BOMC1, BOMC2, BOMC3;
   1243     BOMC1 = fgetc(fFile);
   1244     BOMC2 = fgetc(fFile);
   1245 
   1246     if (BOMC1 == 0xff && BOMC2 == 0xfe) {
   1247         fEncoding = UTF16LE; }
   1248     else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
   1249         fEncoding = UTF16BE; }
   1250     else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
   1251         fEncoding = UTF8; }
   1252     else
   1253     {
   1254         fprintf(stderr, "collperf:  file \"%s\" encoding must be UTF-8 or UTF-16, and "
   1255             "must include a BOM.\n", fileName);
   1256         fError = true;
   1257         return;
   1258     }
   1259 }
   1260 
   1261 
   1262 UCharFile::~UCharFile() {
   1263     fclose(fFile);
   1264 }
   1265 
   1266 
   1267 
   1268 UChar UCharFile::get() {
   1269     UChar   c;
   1270     switch (fEncoding) {
   1271     case UTF16LE:
   1272         {
   1273             int  cL, cH;
   1274             cL = fgetc(fFile);
   1275             cH = fgetc(fFile);
   1276             c  = cL  | (cH << 8);
   1277             if (cH == EOF) {
   1278                 c   = 0;
   1279                 fEof = TRUE;
   1280             }
   1281             break;
   1282         }
   1283     case UTF16BE:
   1284         {
   1285             int  cL, cH;
   1286             cH = fgetc(fFile);
   1287             cL = fgetc(fFile);
   1288             c  = cL  | (cH << 8);
   1289             if (cL == EOF) {
   1290                 c   = 0;
   1291                 fEof = TRUE;
   1292             }
   1293             break;
   1294         }
   1295     case UTF8:
   1296         {
   1297             if (fPending2ndSurrogate != 0) {
   1298                 c = fPending2ndSurrogate;
   1299                 fPending2ndSurrogate = 0;
   1300                 break;
   1301             }
   1302 
   1303             int ch = fgetc(fFile);   // Note:  c and ch are separate cause eof test doesn't work on UChar type.
   1304             if (ch == EOF) {
   1305                 c = 0;
   1306                 fEof = TRUE;
   1307                 break;
   1308             }
   1309 
   1310             if (ch <= 0x7f) {
   1311                 // It's ascii.  No further utf-8 conversion.
   1312                 c = ch;
   1313                 break;
   1314             }
   1315 
   1316             // Figure out the lenght of the char and read the rest of the bytes
   1317             //   into a temp array.
   1318             int nBytes;
   1319             if (ch >= 0xF0) {nBytes=4;}
   1320             else if (ch >= 0xE0) {nBytes=3;}
   1321             else if (ch >= 0xC0) {nBytes=2;}
   1322             else {
   1323                 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
   1324                 fError = TRUE;
   1325                 return 0;
   1326             }
   1327 
   1328             unsigned char  bytes[10];
   1329             bytes[0] = (unsigned char)ch;
   1330             int i;
   1331             for (i=1; i<nBytes; i++) {
   1332                 bytes[i] = fgetc(fFile);
   1333                 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
   1334                     fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
   1335                     fError = TRUE;
   1336                     return 0;
   1337                 }
   1338             }
   1339 
   1340             // Convert the bytes from the temp array to a Unicode char.
   1341             i = 0;
   1342             uint32_t  cp;
   1343             UTF8_NEXT_CHAR_UNSAFE(bytes, i, cp);
   1344             c = (UChar)cp;
   1345 
   1346             if (cp >= 0x10000) {
   1347                 // The code point needs to be broken up into a utf-16 surrogate pair.
   1348                 //  Process first half this time through the main loop, and
   1349                 //   remember the other half for the next time through.
   1350                 UChar utf16Buf[3];
   1351                 i = 0;
   1352                 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
   1353                 fPending2ndSurrogate = utf16Buf[1];
   1354                 c = utf16Buf[0];
   1355             }
   1356             break;
   1357         };
   1358     default:
   1359         c = 0xFFFD; /* Error, unspecified codepage*/
   1360         fprintf(stderr, "UCharFile: Error: unknown fEncoding\n");
   1361         exit(1);
   1362     }
   1363     return c;
   1364 }
   1365 
   1366 //----------------------------------------------------------------------------------------
   1367 //
   1368 //   openRulesCollator  - Command line specified a rules file.  Read it in
   1369 //                        and open a collator with it.
   1370 //
   1371 //----------------------------------------------------------------------------------------
   1372 UCollator *openRulesCollator() {
   1373     UCharFile f(opt_rules);
   1374     if (f.error()) {
   1375         return 0;
   1376     }
   1377 
   1378     int  bufLen = 10000;
   1379     UChar *buf = (UChar *)malloc(bufLen * sizeof(UChar));
   1380     int i = 0;
   1381 
   1382     for(;;) {
   1383         buf[i] = f.get();
   1384         if (f.eof()) {
   1385             break;
   1386         }
   1387         if (f.error()) {
   1388             return 0;
   1389         }
   1390         i++;
   1391         if (i >= bufLen) {
   1392             bufLen += 10000;
   1393             buf = (UChar *)realloc(buf, bufLen);
   1394         }
   1395     }
   1396     buf[i] = 0;
   1397 
   1398     UErrorCode    status = U_ZERO_ERROR;
   1399     UCollator *coll = ucol_openRules(buf, u_strlen(buf), UCOL_OFF,
   1400                                          UCOL_DEFAULT_STRENGTH, NULL, &status);
   1401     if (U_FAILURE(status)) {
   1402         fprintf(stderr, "ICU ucol_openRules() open failed.: %d\n", status);
   1403         return 0;
   1404     }
   1405     free(buf);
   1406     return coll;
   1407 }
   1408 
   1409 
   1410 
   1411 
   1412 
   1413 //----------------------------------------------------------------------------------------
   1414 //
   1415 //    Main   --  process command line, read in and pre-process the test file,
   1416 //                 call other functions to do the actual tests.
   1417 //
   1418 //----------------------------------------------------------------------------------------
   1419 int main(int argc, const char** argv) {
   1420     if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
   1421         printf(gUsageString);
   1422         exit (1);
   1423     }
   1424 
   1425     // Make sure that we've only got one API selected.
   1426     if (opt_unix || opt_win) opt_icu = FALSE;
   1427     if (opt_unix) opt_win = FALSE;
   1428 
   1429     //
   1430     //  Set up an ICU collator
   1431     //
   1432     UErrorCode          status = U_ZERO_ERROR;
   1433 
   1434     if (opt_rules != 0) {
   1435         gCol = openRulesCollator();
   1436         if (gCol == 0) {return -1;}
   1437     }
   1438     else {
   1439         gCol = ucol_open(opt_locale, &status);
   1440         if (U_FAILURE(status)) {
   1441             fprintf(stderr, "Collator creation failed.: %d\n", status);
   1442             return -1;
   1443         }
   1444     }
   1445     if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
   1446         fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
   1447     }
   1448     if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
   1449         fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
   1450     }
   1451 
   1452     if (opt_norm) {
   1453         ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
   1454     }
   1455     if (opt_french && opt_frenchoff) {
   1456         fprintf(stderr, "collperf:  Error, specified both -french and -frenchoff options.");
   1457         exit(-1);
   1458     }
   1459     if (opt_french) {
   1460         ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status);
   1461     }
   1462     if (opt_frenchoff) {
   1463         ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status);
   1464     }
   1465     if (opt_lower) {
   1466         ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status);
   1467     }
   1468     if (opt_upper) {
   1469         ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status);
   1470     }
   1471     if (opt_case) {
   1472         ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status);
   1473     }
   1474     if (opt_shifted) {
   1475         ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status);
   1476     }
   1477     if (opt_level != 0) {
   1478         switch (opt_level) {
   1479         case 1:
   1480             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status);
   1481             break;
   1482         case 2:
   1483             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status);
   1484             break;
   1485         case 3:
   1486             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status);
   1487             break;
   1488         case 4:
   1489             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status);
   1490             break;
   1491         case 5:
   1492             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status);
   1493             break;
   1494         default:
   1495             fprintf(stderr, "-level param must be between 1 and 5\n");
   1496             exit(-1);
   1497         }
   1498     }
   1499 
   1500     if (U_FAILURE(status)) {
   1501         fprintf(stderr, "Collator attribute setting failed.: %d\n", status);
   1502         return -1;
   1503     }
   1504 
   1505 
   1506     //
   1507     //  Set up a Windows LCID
   1508     //
   1509     if (opt_langid != 0) {
   1510         gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
   1511     }
   1512     else {
   1513         gWinLCID = uloc_getLCID(opt_locale);
   1514     }
   1515 
   1516 
   1517     //
   1518     //  Set the UNIX locale
   1519     //
   1520     if (opt_unix) {
   1521         if (setlocale(LC_ALL, opt_locale) == 0) {
   1522             fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
   1523             exit(-1);
   1524         }
   1525     }
   1526 
   1527     // Read in  the input file.
   1528     //   File assumed to be utf-16.
   1529     //   Lines go onto heap buffers.  Global index array to line starts is created.
   1530     //   Lines themselves are null terminated.
   1531     //
   1532 
   1533     UCharFile f(opt_fName);
   1534     if (f.error()) {
   1535         exit(-1);
   1536     }
   1537 
   1538     const int MAXLINES = 100000;
   1539     gFileLines = new Line[MAXLINES];
   1540     UChar buf[1024];
   1541     int   column = 0;
   1542 
   1543     //  Read the file, split into lines, and save in memory.
   1544     //  Loop runs once per utf-16 value from the input file,
   1545     //    (The number of bytes read from file per loop iteration depends on external encoding.)
   1546     for (;;) {
   1547 
   1548         UChar c = f.get();
   1549         if (f.error()){
   1550             exit(-1);
   1551         }
   1552 
   1553 
   1554         // We now have a good UTF-16 value in c.
   1555 
   1556         // Watch for CR, LF, EOF; these finish off a line.
   1557         if (c == 0xd) {
   1558             continue;
   1559         }
   1560 
   1561         if (f.eof() || c == 0x0a || c==0x2028) {  // Unipad inserts 2028 line separators!
   1562             buf[column++] = 0;
   1563             if (column > 1) {
   1564                 gFileLines[gNumFileLines].name  = new UChar[column];
   1565                 gFileLines[gNumFileLines].len   = column-1;
   1566                 memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UChar));
   1567                 gNumFileLines++;
   1568                 column = 0;
   1569                 if (gNumFileLines >= MAXLINES) {
   1570                     fprintf(stderr, "File too big.  Max number of lines is %d\n", MAXLINES);
   1571                     exit(-1);
   1572                 }
   1573 
   1574             }
   1575             if (c == 0xa || c == 0x2028)
   1576                 continue;
   1577             else
   1578                 break;  // EOF
   1579         }
   1580         buf[column++] = c;
   1581         if (column >= 1023)
   1582         {
   1583             static UBool warnFlag = TRUE;
   1584             if (warnFlag) {
   1585                 fprintf(stderr, "Warning - file line longer than 1023 chars truncated.\n");
   1586                 warnFlag = FALSE;
   1587             }
   1588             column--;
   1589         }
   1590     }
   1591 
   1592     if (opt_terse == FALSE) {
   1593         printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines);
   1594     }
   1595 
   1596 
   1597     // Convert the lines to the UNIX encoding.
   1598     if (opt_unix) {
   1599         UnixConvert();
   1600     }
   1601 
   1602     //
   1603     //  Pre-compute ICU sort keys for the lines of the file.
   1604     //
   1605     int line;
   1606     int32_t t;
   1607 
   1608     for (line=0; line<gNumFileLines; line++) {
   1609          t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)buf, sizeof(buf));
   1610          gFileLines[line].icuSortKey  = new char[t];
   1611 
   1612          if (t > (int32_t)sizeof(buf)) {
   1613              t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)gFileLines[line].icuSortKey , t);
   1614          }
   1615          else
   1616          {
   1617              memcpy(gFileLines[line].icuSortKey, buf, t);
   1618          }
   1619     }
   1620 
   1621 
   1622 
   1623     //
   1624     //  Pre-compute Windows sort keys for the lines of the file.
   1625     //
   1626     for (line=0; line<gNumFileLines; line++) {
   1627          t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf, sizeof(buf));
   1628          gFileLines[line].winSortKey  = new char[t];
   1629          if (t > (int32_t)sizeof(buf)) {
   1630              t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, (unsigned short *)(gFileLines[line].winSortKey), t);
   1631          }
   1632          else
   1633          {
   1634              memcpy(gFileLines[line].winSortKey, buf, t);
   1635          }
   1636     }
   1637 
   1638     //
   1639     //  Pre-compute UNIX sort keys for the lines of the file.
   1640     //
   1641     if (opt_unix) {
   1642         for (line=0; line<gNumFileLines; line++) {
   1643             t=strxfrm((char *)buf,  gFileLines[line].unixName,  sizeof(buf));
   1644             gFileLines[line].unixSortKey  = new char[t];
   1645             if (t > (int32_t)sizeof(buf)) {
   1646                 t = strxfrm(gFileLines[line].unixSortKey,  gFileLines[line].unixName,  sizeof(buf));
   1647             }
   1648             else
   1649             {
   1650                 memcpy(gFileLines[line].unixSortKey, buf, t);
   1651             }
   1652         }
   1653     }
   1654 
   1655 
   1656     //
   1657     //  Dump file lines, CEs, Sort Keys if requested.
   1658     //
   1659     if (opt_dump) {
   1660         int  i;
   1661         for (line=0; line<gNumFileLines; line++) {
   1662             for (i=0;;i++) {
   1663                 UChar  c = gFileLines[line].name[i];
   1664                 if (c == 0)
   1665                     break;
   1666                 if (c < 0x20 || c > 0x7e) {
   1667                     printf("\\u%.4x", c);
   1668                 }
   1669                 else {
   1670                     printf("%c", c);
   1671                 }
   1672             }
   1673             printf("\n");
   1674 
   1675             printf("   CEs: ");
   1676             UCollationElements *CEiter = ucol_openElements(gCol, gFileLines[line].name, -1, &status);
   1677             int32_t ce;
   1678             i = 0;
   1679             for (;;) {
   1680                 ce = ucol_next(CEiter, &status);
   1681                 if (ce == UCOL_NULLORDER) {
   1682                     break;
   1683                 }
   1684                 printf(" %.8x", ce);
   1685                 if (++i > 8) {
   1686                     printf("\n        ");
   1687                     i = 0;
   1688                 }
   1689             }
   1690             printf("\n");
   1691             ucol_closeElements(CEiter);
   1692 
   1693 
   1694             printf("   ICU Sort Key: ");
   1695             for (i=0; ; i++) {
   1696                 unsigned char c = gFileLines[line].icuSortKey[i];
   1697                 printf("%02x ", c);
   1698                 if (c == 0) {
   1699                     break;
   1700                 }
   1701                 if (i > 0 && i % 20 == 0) {
   1702                     printf("\n                 ");
   1703                 }
   1704            }
   1705             printf("\n");
   1706         }
   1707     }
   1708 
   1709 
   1710     //
   1711     //  Pre-sort the lines.
   1712     //
   1713     int i;
   1714     gSortedLines = new Line *[gNumFileLines];
   1715     for (i=0; i<gNumFileLines; i++) {
   1716         gSortedLines[i] = &gFileLines[i];
   1717     }
   1718 
   1719     if (opt_win) {
   1720         qsort(gSortedLines, gNumFileLines, sizeof(Line *), Winstrcmp);
   1721     }
   1722     else if (opt_unix) {
   1723         qsort(gSortedLines, gNumFileLines, sizeof(Line *), UNIXstrcmp);
   1724     }
   1725     else   /* ICU */
   1726     {
   1727         qsort(gSortedLines, gNumFileLines, sizeof(Line *), ICUstrcmp);
   1728     }
   1729 
   1730 
   1731     //
   1732     //  Make up a randomized order, will be used for sorting tests.
   1733     //
   1734     gRandomLines = new Line *[gNumFileLines];
   1735     for (i=0; i<gNumFileLines; i++) {
   1736         gRandomLines[i] = &gFileLines[i];
   1737     }
   1738     qsort(gRandomLines, gNumFileLines, sizeof(Line *), ICURandomCmp);
   1739 
   1740 
   1741 
   1742 
   1743     //
   1744     //  We've got the file read into memory.  Go do something with it.
   1745     //
   1746 
   1747     if (opt_qsort)     doQSort();
   1748     if (opt_binsearch) doBinarySearch();
   1749     if (opt_keygen)    doKeyGen();
   1750     if (opt_keyhist)   doKeyHist();
   1751     if (opt_itertest)  doIterTest();
   1752 
   1753     return 0;
   1754 
   1755 }
   1756