Home | History | Annotate | Download | only in ubrkperf
      1 /***********************************************************************
      2  * © 2016 and later: Unicode, Inc. and others.
      3  * License & terms of use: http://www.unicode.org/copyright.html#License
      4  *
      5  ***********************************************************************
      6  ***********************************************************************
      7  * COPYRIGHT:
      8  * Copyright (C) 2001-2012 IBM, Inc.   All Rights Reserved.
      9  *
     10  ***********************************************************************/
     11 /********************************************************************************
     12 *
     13 * File ubrkperf.cpp
     14 *
     15 * Modification History:
     16 *        Name                     Description
     17 *     Vladimir Weinstein          First Version, based on collperf
     18 *
     19 *********************************************************************************
     20 */
     21 
     22 //
     23 //  This program tests break iterator performance
     24 //      Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs
     25 //      (if any)
     26 //      A text file is required as input.  It must be in utf-8 or utf-16 format,
     27 //      and include a byte order mark.  Either LE or BE format is OK.
     28 //
     29 
     30 const char gUsageString[] =
     31  "usage:  ubrkperf options...\n"
     32     "-help                      Display this message.\n"
     33     "-file file_name            utf-16/utf-8 format file.\n"
     34     "-locale name               ICU locale to use.  Default is en_US\n"
     35     "-langid 0x1234             Windows Language ID number.  Default to value for -locale option\n"
     36     "                              see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
     37     "-win                       Run test using Windows native services. (currently not working) (ICU is default)\n"
     38     "-unix                      Run test using Unix word breaking services. (currently not working) \n"
     39     "-mac                       Run test using MacOSX word breaking services.\n"
     40     "-uselen                    Use API with string lengths.  Default is null-terminated strings\n"
     41     "-char                      Use character break iterator\n"
     42     "-word                      Use word break iterator\n"
     43     "-line                      Use line break iterator\n"
     44     "-sentence                  Use sentence break iterator\n"
     45     "-loop nnnn                 Loopcount for test.  Adjust for reasonable total running time.\n"
     46     "-iloop n                   Inner Loop Count.  Default = 1.  Number of calls to function\n"
     47     "                               under test at each call point.  For measuring test overhead.\n"
     48     "-terse                     Terse numbers-only output.  Intended for use by scripts.\n"
     49     "-dump                      Display stuff.\n"
     50     "-capi                      Use C APIs instead of C++ APIs (currently not working)\n"
     51     "-next                      Do the next test\n"
     52     "-isBound                   Do the isBound test\n"
     53     ;
     54 
     55 
     56 #include <stdio.h>
     57 #include <string.h>
     58 #include <stdlib.h>
     59 #include <math.h>
     60 #include <locale.h>
     61 #include <errno.h>
     62 #include <sys/stat.h>
     63 
     64 #include <unicode/utypes.h>
     65 #include <unicode/ucol.h>
     66 #include <unicode/ucoleitr.h>
     67 #include <unicode/uloc.h>
     68 #include <unicode/ustring.h>
     69 #include <unicode/ures.h>
     70 #include <unicode/uchar.h>
     71 #include <unicode/ucnv.h>
     72 #include <unicode/utf8.h>
     73 
     74 #include <unicode/brkiter.h>
     75 
     76 
     77 #if U_PLATFORM_HAS_WIN32_API
     78 #include <windows.h>
     79 #else
     80 //
     81 //  Stubs for Windows API functions when building on UNIXes.
     82 //
     83 #include <sys/time.h>
     84 unsigned long timeGetTime() {
     85     struct timeval t;
     86     gettimeofday(&t, 0);
     87     unsigned long val = t.tv_sec * 1000;  // Let it overflow.  Who cares.
     88     val += t.tv_usec / 1000;
     89     return val;
     90 };
     91 #define MAKELCID(a,b) 0
     92 #endif
     93 
     94 
     95 //
     96 //  Command line option variables
     97 //     These global variables are set according to the options specified
     98 //     on the command line by the user.
     99 char * opt_fName      = 0;
    100 char * opt_locale     = "en_US";
    101 int    opt_langid     = 0;         // Defaults to value corresponding to opt_locale.
    102 char * opt_rules      = 0;
    103 UBool  opt_help       = FALSE;
    104 int    opt_time       = 0;
    105 int    opt_loopCount  = 0;
    106 int    opt_passesCount= 1;
    107 UBool  opt_terse      = FALSE;
    108 UBool  opt_icu        = TRUE;
    109 UBool  opt_win        = FALSE;      // Run with Windows native functions.
    110 UBool  opt_unix       = FALSE;      // Run with UNIX strcoll, strxfrm functions.
    111 UBool  opt_mac        = FALSE;      // Run with MacOSX word break services.
    112 UBool  opt_uselen     = FALSE;
    113 UBool  opt_dump       = FALSE;
    114 UBool  opt_char       = FALSE;
    115 UBool  opt_word       = FALSE;
    116 UBool  opt_line       = FALSE;
    117 UBool  opt_sentence   = FALSE;
    118 UBool  opt_capi       = FALSE;
    119 
    120 UBool  opt_next       = FALSE;
    121 UBool  opt_isBound    = FALSE;
    122 
    123 
    124 
    125 //
    126 //   Definitions for the command line options
    127 //
    128 struct OptSpec {
    129     const char *name;
    130     enum {FLAG, NUM, STRING} type;
    131     void *pVar;
    132 };
    133 
    134 OptSpec opts[] = {
    135     {"-file",        OptSpec::STRING, &opt_fName},
    136     {"-locale",      OptSpec::STRING, &opt_locale},
    137     {"-langid",      OptSpec::NUM,    &opt_langid},
    138     {"-win",         OptSpec::FLAG,   &opt_win},
    139     {"-unix",        OptSpec::FLAG,   &opt_unix},
    140     {"-mac",         OptSpec::FLAG,   &opt_mac},
    141     {"-uselen",      OptSpec::FLAG,   &opt_uselen},
    142     {"-loop",        OptSpec::NUM,    &opt_loopCount},
    143     {"-time",        OptSpec::NUM,    &opt_time},
    144     {"-passes",      OptSpec::NUM,    &opt_passesCount},
    145     {"-char",        OptSpec::FLAG,   &opt_char},
    146     {"-word",        OptSpec::FLAG,   &opt_word},
    147     {"-line",        OptSpec::FLAG,   &opt_line},
    148     {"-sentence",    OptSpec::FLAG,   &opt_sentence},
    149     {"-terse",       OptSpec::FLAG,   &opt_terse},
    150     {"-dump",        OptSpec::FLAG,   &opt_dump},
    151     {"-capi",        OptSpec::FLAG,   &opt_capi},
    152     {"-next",        OptSpec::FLAG,   &opt_next},
    153     {"-isBound",     OptSpec::FLAG,   &opt_isBound},
    154     {"-help",        OptSpec::FLAG,   &opt_help},
    155     {"-?",           OptSpec::FLAG,   &opt_help},
    156     {0, OptSpec::FLAG, 0}
    157 };
    158 
    159 
    160 //---------------------------------------------------------------------------
    161 //
    162 //  Global variables pointing to and describing the test file
    163 //
    164 //---------------------------------------------------------------------------
    165 
    166 //DWORD          gWinLCID;
    167 BreakIterator *brkit = NULL;
    168 UChar *text = NULL;
    169 int32_t textSize = 0;
    170 
    171 
    172 
    173 #if U_PLATFORM_IS_DARWIN_BASED
    174 #include <ApplicationServices/ApplicationServices.h>
    175 enum{
    176   kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask)
    177     };
    178 UCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask};
    179 TextBreakLocatorRef breakRef;
    180 UCTextBreakType macBreakType;
    181 
    182 void createMACBrkIt() {
    183   OSStatus status = noErr;
    184   LocaleRef lref;
    185   status = LocaleRefFromLocaleString(opt_locale, &lref);
    186   status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, (TextBreakLocatorRef*)&breakRef);
    187   if(opt_char == TRUE) {
    188     macBreakType = kUCTextBreakClusterMask;
    189   } else if(opt_word == TRUE) {
    190     macBreakType = kUCTextBreakWordMask;
    191   } else if(opt_line == TRUE) {
    192     macBreakType = kUCTextBreakLineMask;
    193   } else if(opt_sentence == TRUE) {
    194     // error
    195     // brkit = BreakIterator::createSentenceInstance(opt_locale, status);
    196   } else {
    197     // default is character iterator
    198     macBreakType = kUCTextBreakClusterMask;
    199       }
    200 }
    201 #endif
    202 
    203 void createICUBrkIt() {
    204   //
    205   //  Set up an ICU break iterator
    206   //
    207   UErrorCode          status = U_ZERO_ERROR;
    208   if(opt_char == TRUE) {
    209     brkit = BreakIterator::createCharacterInstance(opt_locale, status);
    210   } else if(opt_word == TRUE) {
    211     brkit = BreakIterator::createWordInstance(opt_locale, status);
    212   } else if(opt_line == TRUE) {
    213     brkit = BreakIterator::createLineInstance(opt_locale, status);
    214   } else if(opt_sentence == TRUE) {
    215     brkit = BreakIterator::createSentenceInstance(opt_locale, status);
    216   } else {
    217     // default is character iterator
    218     brkit = BreakIterator::createCharacterInstance(opt_locale, status);
    219   }
    220   if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
    221     fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
    222   }
    223   if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
    224     fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
    225   }
    226 
    227 }
    228 
    229 //---------------------------------------------------------------------------
    230 //
    231 //  ProcessOptions()    Function to read the command line options.
    232 //
    233 //---------------------------------------------------------------------------
    234 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
    235 {
    236     int         i;
    237     int         argNum;
    238     const char  *pArgName;
    239     OptSpec    *pOpt;
    240 
    241     for (argNum=1; argNum<argc; argNum++) {
    242         pArgName = argv[argNum];
    243         for (pOpt = opts;  pOpt->name != 0; pOpt++) {
    244             if (strcmp(pOpt->name, pArgName) == 0) {
    245                 switch (pOpt->type) {
    246                 case OptSpec::FLAG:
    247                     *(UBool *)(pOpt->pVar) = TRUE;
    248                     break;
    249                 case OptSpec::STRING:
    250                     argNum ++;
    251                     if (argNum >= argc) {
    252                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
    253                         return FALSE;
    254                     }
    255                     *(const char **)(pOpt->pVar)  = argv[argNum];
    256                     break;
    257                 case OptSpec::NUM:
    258                     argNum ++;
    259                     if (argNum >= argc) {
    260                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
    261                         return FALSE;
    262                     }
    263                     char *endp;
    264                     i = strtol(argv[argNum], &endp, 0);
    265                     if (endp == argv[argNum]) {
    266                         fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
    267                         return FALSE;
    268                     }
    269                     *(int *)(pOpt->pVar) = i;
    270                 }
    271                 break;
    272             }
    273         }
    274         if (pOpt->name == 0)
    275         {
    276             fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
    277             return FALSE;
    278         }
    279     }
    280 return TRUE;
    281 }
    282 
    283 
    284 void doForwardTest() {
    285   if (opt_terse == FALSE) {
    286     printf("Doing the forward test\n");
    287   }
    288   int32_t noBreaks = 0;
    289   int32_t i = 0;
    290   unsigned long startTime = timeGetTime();
    291   unsigned long elapsedTime = 0;
    292   if(opt_icu) {
    293     createICUBrkIt();
    294     brkit->setText(UnicodeString(text, textSize));
    295     brkit->first();
    296     if (opt_terse == FALSE) {
    297       printf("Warmup\n");
    298     }
    299     int j;
    300     while((j = brkit->next()) != BreakIterator::DONE) {
    301       noBreaks++;
    302       //fprintf(stderr, "%d ", j);
    303     }
    304 
    305     if (opt_terse == FALSE) {
    306       printf("Measure\n");
    307     }
    308     startTime = timeGetTime();
    309     for(i = 0; i < opt_loopCount; i++) {
    310       brkit->first();
    311       while(brkit->next() != BreakIterator::DONE) {
    312       }
    313     }
    314 
    315     elapsedTime = timeGetTime()-startTime;
    316   } else if(opt_mac) {
    317 #if U_PLATFORM_IS_DARWIN_BASED
    318     createMACBrkIt();
    319     UniChar* filePtr = text;
    320     OSStatus status = noErr;
    321     UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize;
    322     startOffset = 0;
    323     //printf("\t---Search forward--\n");
    324 
    325     while (startOffset < numUniChars)
    326     {
    327 	status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
    328                                startOffset, &breakOffset);
    329       //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status));
    330       //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset));
    331 
    332       // Output break
    333       //printf("\t%d\n", (int)breakOffset);
    334 
    335       // Increment counters
    336 	noBreaks++;
    337       startOffset = breakOffset;
    338     }
    339     startTime = timeGetTime();
    340     for(i = 0; i < opt_loopCount; i++) {
    341       startOffset = 0;
    342 
    343       while (startOffset < numUniChars)
    344 	{
    345 	  status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
    346 				   startOffset, &breakOffset);
    347 	  // Increment counters
    348 	  startOffset = breakOffset;
    349 	}
    350     }
    351     elapsedTime = timeGetTime()-startTime;
    352     UCDisposeTextBreakLocator(&breakRef);
    353 #endif
    354 
    355 
    356   }
    357 
    358 
    359   if (opt_terse == FALSE) {
    360   int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
    361       int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
    362       int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
    363       printf("forward break iteration average loop time %d\n", loopTime);
    364       printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
    365       printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
    366   } else {
    367       printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
    368   }
    369 
    370 
    371 }
    372 
    373 void doIsBoundTest() {
    374   int32_t noBreaks = 0, hit = 0;
    375   int32_t i = 0, j = 0;
    376   unsigned long startTime = timeGetTime();
    377   unsigned long elapsedTime = 0;
    378   createICUBrkIt();
    379   brkit->setText(UnicodeString(text, textSize));
    380   brkit->first();
    381   for(j = 0; j < textSize; j++) {
    382     if(brkit->isBoundary(j)) {
    383       noBreaks++;
    384       //fprintf(stderr, "%d ", j);
    385     }
    386   }
    387   /*
    388   while(brkit->next() != BreakIterator::DONE) {
    389     noBreaks++;
    390   }
    391   */
    392 
    393   startTime = timeGetTime();
    394   for(i = 0; i < opt_loopCount; i++) {
    395     for(j = 0; j < textSize; j++) {
    396       if(brkit->isBoundary(j)) {
    397         hit++;
    398       }
    399     }
    400   }
    401 
    402   elapsedTime = timeGetTime()-startTime;
    403   int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
    404   if (opt_terse == FALSE) {
    405       int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
    406       int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
    407       printf("forward break iteration average loop time %d\n", loopTime);
    408       printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
    409       printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
    410   } else {
    411       printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
    412   }
    413 }
    414 
    415 //----------------------------------------------------------------------------------------
    416 //
    417 //   UnixConvert   -- Convert the lines of the file to the encoding for UNIX
    418 //                    Since it appears that Unicode support is going in the general
    419 //                    direction of the use of UTF-8 locales, that is the approach
    420 //                    that is used here.
    421 //
    422 //----------------------------------------------------------------------------------------
    423 void  UnixConvert() {
    424 #if 0
    425     int    line;
    426 
    427     UConverter   *cvrtr;    // An ICU code page converter.
    428     UErrorCode    status = U_ZERO_ERROR;
    429 
    430 
    431     cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
    432     if (U_FAILURE(status)) {
    433         fprintf(stderr, "ICU Converter open failed.: %d\n", &status);
    434         exit(-1);
    435     }
    436     // redo for unix
    437     for (line=0; line < gNumFileLines; line++) {
    438         int sizeNeeded = ucnv_fromUChars(cvrtr,
    439                                          0,            // ptr to target buffer.
    440                                          0,            // length of target buffer.
    441                                          gFileLines[line].name,
    442                                          -1,           //  source is null terminated
    443                                          &status);
    444         if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
    445             fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
    446             exit(-1);
    447         }
    448         status = U_ZERO_ERROR;
    449         gFileLines[line].unixName = new char[sizeNeeded+1];
    450         sizeNeeded = ucnv_fromUChars(cvrtr,
    451                                          gFileLines[line].unixName, // ptr to target buffer.
    452                                          sizeNeeded+1, // length of target buffer.
    453                                          gFileLines[line].name,
    454                                          -1,           //  source is null terminated
    455                                          &status);
    456         if (U_FAILURE(status)) {
    457             fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
    458             exit(-1);
    459         }
    460         gFileLines[line].unixName[sizeNeeded] = 0;
    461     };
    462     ucnv_close(cvrtr);
    463 #endif
    464 }
    465 
    466 
    467 //----------------------------------------------------------------------------------------
    468 //
    469 //  class UCharFile   Class to hide all the gorp to read a file in
    470 //                    and produce a stream of UChars.
    471 //
    472 //----------------------------------------------------------------------------------------
    473 class UCharFile {
    474 public:
    475     UCharFile(const char *fileName);
    476     ~UCharFile();
    477     UChar   get();
    478     UBool   eof() {return fEof;};
    479     UBool   error() {return fError;};
    480     int32_t size() { return fFileSize; };
    481 
    482 private:
    483     UCharFile (const UCharFile &other) {};                         // No copy constructor.
    484     UCharFile & operator = (const UCharFile &other) {return *this;};   // No assignment op
    485 
    486     FILE         *fFile;
    487     const char   *fName;
    488     UBool        fEof;
    489     UBool        fError;
    490     UChar        fPending2ndSurrogate;
    491     int32_t      fFileSize;
    492 
    493     enum {UTF16LE, UTF16BE, UTF8} fEncoding;
    494 };
    495 
    496 UCharFile::UCharFile(const char * fileName) {
    497     fEof                 = FALSE;
    498     fError               = FALSE;
    499     fName                = fileName;
    500     struct stat buf;
    501     int32_t result = stat(fileName, &buf);
    502     if(result != 0) {
    503       fprintf(stderr, "Error getting info\n");
    504       fFileSize = -1;
    505     } else {
    506       fFileSize = buf.st_size;
    507     }
    508     fFile                = fopen(fName, "rb");
    509     fPending2ndSurrogate = 0;
    510     if (fFile == NULL) {
    511         fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
    512         fError = TRUE;
    513         return;
    514     }
    515     //
    516     //  Look for the byte order mark at the start of the file.
    517     //
    518     int BOMC1, BOMC2, BOMC3;
    519     BOMC1 = fgetc(fFile);
    520     BOMC2 = fgetc(fFile);
    521 
    522     if (BOMC1 == 0xff && BOMC2 == 0xfe) {
    523         fEncoding = UTF16LE; }
    524     else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
    525         fEncoding = UTF16BE; }
    526     else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
    527         fEncoding = UTF8; }
    528     else
    529     {
    530         fprintf(stderr, "collperf:  file \"%s\" encoding must be UTF-8 or UTF-16, and "
    531             "must include a BOM.\n", fileName);
    532         fError = true;
    533         return;
    534     }
    535 }
    536 
    537 
    538 UCharFile::~UCharFile() {
    539     fclose(fFile);
    540 }
    541 
    542 
    543 
    544 UChar UCharFile::get() {
    545     UChar   c;
    546     switch (fEncoding) {
    547     case UTF16LE:
    548         {
    549             int  cL, cH;
    550             cL = fgetc(fFile);
    551             cH = fgetc(fFile);
    552             c  = cL  | (cH << 8);
    553             if (cH == EOF) {
    554                 c   = 0;
    555                 fEof = TRUE;
    556             }
    557             break;
    558         }
    559     case UTF16BE:
    560         {
    561             int  cL, cH;
    562             cH = fgetc(fFile);
    563             cL = fgetc(fFile);
    564             c  = cL  | (cH << 8);
    565             if (cL == EOF) {
    566                 c   = 0;
    567                 fEof = TRUE;
    568             }
    569             break;
    570         }
    571     case UTF8:
    572         {
    573             if (fPending2ndSurrogate != 0) {
    574                 c = fPending2ndSurrogate;
    575                 fPending2ndSurrogate = 0;
    576                 break;
    577             }
    578 
    579             int ch = fgetc(fFile);   // Note:  c and ch are separate cause eof test doesn't work on UChar type.
    580             if (ch == EOF) {
    581                 c = 0;
    582                 fEof = TRUE;
    583                 break;
    584             }
    585 
    586             if (ch <= 0x7f) {
    587                 // It's ascii.  No further utf-8 conversion.
    588                 c = ch;
    589                 break;
    590             }
    591 
    592             // Figure out the lenght of the char and read the rest of the bytes
    593             //   into a temp array.
    594             int nBytes;
    595             if (ch >= 0xF0) {nBytes=4;}
    596             else if (ch >= 0xE0) {nBytes=3;}
    597             else if (ch >= 0xC0) {nBytes=2;}
    598             else {
    599                 fprintf(stderr, "not likely utf-8 encoded file %s contains corrupt data at offset %d.\n", fName, ftell(fFile));
    600                 fError = TRUE;
    601                 return 0;
    602             }
    603 
    604             unsigned char  bytes[10];
    605             bytes[0] = (unsigned char)ch;
    606             int i;
    607             for (i=1; i<nBytes; i++) {
    608                 bytes[i] = fgetc(fFile);
    609                 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
    610                     fprintf(stderr, "utf-8 encoded file %s contains corrupt data at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName, ftell(fFile), nBytes, i, ch);
    611                     fError = TRUE;
    612                     return 0;
    613                 }
    614             }
    615 
    616             // Convert the bytes from the temp array to a Unicode char.
    617             i = 0;
    618             uint32_t  cp;
    619             U8_NEXT_UNSAFE(bytes, i, cp);
    620             c = (UChar)cp;
    621 
    622             if (cp >= 0x10000) {
    623                 // The code point needs to be broken up into a utf-16 surrogate pair.
    624                 //  Process first half this time through the main loop, and
    625                 //   remember the other half for the next time through.
    626                 UChar utf16Buf[3];
    627                 i = 0;
    628                 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
    629                 fPending2ndSurrogate = utf16Buf[1];
    630                 c = utf16Buf[0];
    631             }
    632             break;
    633         };
    634     }
    635     return c;
    636 }
    637 
    638 
    639 //----------------------------------------------------------------------------------------
    640 //
    641 //    Main   --  process command line, read in and pre-process the test file,
    642 //                 call other functions to do the actual tests.
    643 //
    644 //----------------------------------------------------------------------------------------
    645 int main(int argc, const char** argv) {
    646     if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
    647         printf(gUsageString);
    648         exit (1);
    649     }
    650     // Make sure that we've only got one API selected.
    651     if (opt_mac || opt_unix || opt_win) opt_icu = FALSE;
    652     if (opt_mac || opt_unix) opt_win = FALSE;
    653     if (opt_mac) opt_unix = FALSE;
    654 
    655     UErrorCode          status = U_ZERO_ERROR;
    656 
    657 
    658 
    659     //
    660     //  Set up a Windows LCID
    661     //
    662   /*
    663     if (opt_langid != 0) {
    664         gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
    665     }
    666     else {
    667         gWinLCID = uloc_getLCID(opt_locale);
    668     }
    669   */
    670 
    671     //
    672     //  Set the UNIX locale
    673     //
    674     if (opt_unix) {
    675         if (setlocale(LC_ALL, opt_locale) == 0) {
    676             fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
    677             exit(-1);
    678         }
    679     }
    680 
    681     // Read in  the input file.
    682     //   File assumed to be utf-16.
    683     //   Lines go onto heap buffers.  Global index array to line starts is created.
    684     //   Lines themselves are null terminated.
    685     //
    686 
    687     UCharFile f(opt_fName);
    688     if (f.error()) {
    689         exit(-1);
    690     }
    691     int32_t fileSize = f.size();
    692     const int STARTSIZE = 70000;
    693     int32_t bufSize = 0;
    694     int32_t charCount = 0;
    695     if(fileSize != -1) {
    696       text = (UChar *)malloc(fileSize*sizeof(UChar));
    697       bufSize = fileSize;
    698     } else {
    699       text = (UChar *)malloc(STARTSIZE*sizeof(UChar));
    700       bufSize = STARTSIZE;
    701     }
    702     if(text == NULL) {
    703       fprintf(stderr, "Allocating buffer failed\n");
    704       exit(-1);
    705     }
    706 
    707 
    708     //  Read the file, split into lines, and save in memory.
    709     //  Loop runs once per utf-16 value from the input file,
    710     //    (The number of bytes read from file per loop iteration depends on external encoding.)
    711     for (;;) {
    712 
    713         UChar c = f.get();
    714         if(f.eof()) {
    715           break;
    716         }
    717         if (f.error()){
    718           exit(-1);
    719         }
    720         // We now have a good UTF-16 value in c.
    721         text[charCount++] = c;
    722         if(charCount == bufSize) {
    723           text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar));
    724           if(text == NULL) {
    725             fprintf(stderr, "Reallocating buffer failed\n");
    726             exit(-1);
    727           }
    728           bufSize *= 2;
    729         }
    730     }
    731 
    732 
    733     if (opt_terse == FALSE) {
    734         printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount);
    735     }
    736 
    737     textSize = charCount;
    738 
    739 
    740 
    741 
    742     //
    743     //  Dump file contents if requested.
    744     //
    745     if (opt_dump) {
    746       // dump file, etc... possibly
    747     }
    748 
    749 
    750     //
    751     //  We've got the file read into memory.  Go do something with it.
    752     //
    753     int32_t i = 0;
    754     for(i = 0; i < opt_passesCount; i++) {
    755       if(opt_loopCount != 0) {
    756         if(opt_next) {
    757           doForwardTest();
    758         } else if(opt_isBound) {
    759           doIsBoundTest();
    760         } else {
    761           doForwardTest();
    762         }
    763       } else if(opt_time != 0) {
    764 
    765       }
    766     }
    767 
    768   if(text != NULL) {
    769     free(text);
    770   }
    771     if(brkit != NULL) {
    772       delete brkit;
    773     }
    774 
    775     return 0;
    776 }
    777