Home | History | Annotate | Download | only in ubrkperf
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (C) 2001-2012 IBM, Inc.   All Rights Reserved.
      4  *
      5  ********************************************************************/
      6 /********************************************************************************
      7 *
      8 * File ubrkperf.cpp
      9 *
     10 * Modification History:
     11 *        Name                     Description
     12 *     Vladimir Weinstein          First Version, based on collperf
     13 *
     14 *********************************************************************************
     15 */
     16 
     17 //
     18 //  This program tests break iterator performance
     19 //      Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs
     20 //      (if any)
     21 //      A text file is required as input.  It must be in utf-8 or utf-16 format,
     22 //      and include a byte order mark.  Either LE or BE format is OK.
     23 //
     24 
     25 const char gUsageString[] =
     26  "usage:  ubrkperf options...\n"
     27     "-help                      Display this message.\n"
     28     "-file file_name            utf-16/utf-8 format file.\n"
     29     "-locale name               ICU locale to use.  Default is en_US\n"
     30     "-langid 0x1234             Windows Language ID number.  Default to value for -locale option\n"
     31     "                              see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
     32     "-win                       Run test using Windows native services. (currently not working) (ICU is default)\n"
     33     "-unix                      Run test using Unix word breaking services. (currently not working) \n"
     34     "-mac                       Run test using MacOSX word breaking services.\n"
     35     "-uselen                    Use API with string lengths.  Default is null-terminated strings\n"
     36     "-char                      Use character break iterator\n"
     37     "-word                      Use word break iterator\n"
     38     "-line                      Use line break iterator\n"
     39     "-sentence                  Use sentence break iterator\n"
     40     "-loop nnnn                 Loopcount for test.  Adjust for reasonable total running time.\n"
     41     "-iloop n                   Inner Loop Count.  Default = 1.  Number of calls to function\n"
     42     "                               under test at each call point.  For measuring test overhead.\n"
     43     "-terse                     Terse numbers-only output.  Intended for use by scripts.\n"
     44     "-dump                      Display stuff.\n"
     45     "-capi                      Use C APIs instead of C++ APIs (currently not working)\n"
     46     "-next                      Do the next test\n"
     47     "-isBound                   Do the isBound test\n"
     48     ;
     49 
     50 
     51 #include <stdio.h>
     52 #include <string.h>
     53 #include <stdlib.h>
     54 #include <math.h>
     55 #include <locale.h>
     56 #include <errno.h>
     57 #include <sys/stat.h>
     58 
     59 #include <unicode/utypes.h>
     60 #include <unicode/ucol.h>
     61 #include <unicode/ucoleitr.h>
     62 #include <unicode/uloc.h>
     63 #include <unicode/ustring.h>
     64 #include <unicode/ures.h>
     65 #include <unicode/uchar.h>
     66 #include <unicode/ucnv.h>
     67 #include <unicode/utf8.h>
     68 
     69 #include <unicode/brkiter.h>
     70 
     71 
     72 #if U_PLATFORM_HAS_WIN32_API
     73 #include <windows.h>
     74 #else
     75 //
     76 //  Stubs for Windows API functions when building on UNIXes.
     77 //
     78 #include <sys/time.h>
     79 unsigned long timeGetTime() {
     80     struct timeval t;
     81     gettimeofday(&t, 0);
     82     unsigned long val = t.tv_sec * 1000;  // Let it overflow.  Who cares.
     83     val += t.tv_usec / 1000;
     84     return val;
     85 };
     86 #define MAKELCID(a,b) 0
     87 #endif
     88 
     89 
     90 //
     91 //  Command line option variables
     92 //     These global variables are set according to the options specified
     93 //     on the command line by the user.
     94 char * opt_fName      = 0;
     95 char * opt_locale     = "en_US";
     96 int    opt_langid     = 0;         // Defaults to value corresponding to opt_locale.
     97 char * opt_rules      = 0;
     98 UBool  opt_help       = FALSE;
     99 int    opt_time       = 0;
    100 int    opt_loopCount  = 0;
    101 int    opt_passesCount= 1;
    102 UBool  opt_terse      = FALSE;
    103 UBool  opt_icu        = TRUE;
    104 UBool  opt_win        = FALSE;      // Run with Windows native functions.
    105 UBool  opt_unix       = FALSE;      // Run with UNIX strcoll, strxfrm functions.
    106 UBool  opt_mac        = FALSE;      // Run with MacOSX word break services.
    107 UBool  opt_uselen     = FALSE;
    108 UBool  opt_dump       = FALSE;
    109 UBool  opt_char       = FALSE;
    110 UBool  opt_word       = FALSE;
    111 UBool  opt_line       = FALSE;
    112 UBool  opt_sentence   = FALSE;
    113 UBool  opt_capi       = FALSE;
    114 
    115 UBool  opt_next       = FALSE;
    116 UBool  opt_isBound    = FALSE;
    117 
    118 
    119 
    120 //
    121 //   Definitions for the command line options
    122 //
    123 struct OptSpec {
    124     const char *name;
    125     enum {FLAG, NUM, STRING} type;
    126     void *pVar;
    127 };
    128 
    129 OptSpec opts[] = {
    130     {"-file",        OptSpec::STRING, &opt_fName},
    131     {"-locale",      OptSpec::STRING, &opt_locale},
    132     {"-langid",      OptSpec::NUM,    &opt_langid},
    133     {"-win",         OptSpec::FLAG,   &opt_win},
    134     {"-unix",        OptSpec::FLAG,   &opt_unix},
    135     {"-mac",         OptSpec::FLAG,   &opt_mac},
    136     {"-uselen",      OptSpec::FLAG,   &opt_uselen},
    137     {"-loop",        OptSpec::NUM,    &opt_loopCount},
    138     {"-time",        OptSpec::NUM,    &opt_time},
    139     {"-passes",      OptSpec::NUM,    &opt_passesCount},
    140     {"-char",        OptSpec::FLAG,   &opt_char},
    141     {"-word",        OptSpec::FLAG,   &opt_word},
    142     {"-line",        OptSpec::FLAG,   &opt_line},
    143     {"-sentence",    OptSpec::FLAG,   &opt_sentence},
    144     {"-terse",       OptSpec::FLAG,   &opt_terse},
    145     {"-dump",        OptSpec::FLAG,   &opt_dump},
    146     {"-capi",        OptSpec::FLAG,   &opt_capi},
    147     {"-next",        OptSpec::FLAG,   &opt_next},
    148     {"-isBound",     OptSpec::FLAG,   &opt_isBound},
    149     {"-help",        OptSpec::FLAG,   &opt_help},
    150     {"-?",           OptSpec::FLAG,   &opt_help},
    151     {0, OptSpec::FLAG, 0}
    152 };
    153 
    154 
    155 //---------------------------------------------------------------------------
    156 //
    157 //  Global variables pointing to and describing the test file
    158 //
    159 //---------------------------------------------------------------------------
    160 
    161 //DWORD          gWinLCID;
    162 BreakIterator *brkit = NULL;
    163 UChar *text = NULL;
    164 int32_t textSize = 0;
    165 
    166 
    167 
    168 #if U_PLATFORM_IS_DARWIN_BASED
    169 #include <ApplicationServices/ApplicationServices.h>
    170 enum{
    171   kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask)
    172     };
    173 UCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask};
    174 TextBreakLocatorRef breakRef;
    175 UCTextBreakType macBreakType;
    176 
    177 void createMACBrkIt() {
    178   OSStatus status = noErr;
    179   LocaleRef lref;
    180   status = LocaleRefFromLocaleString(opt_locale, &lref);
    181   status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, (TextBreakLocatorRef*)&breakRef);
    182   if(opt_char == TRUE) {
    183     macBreakType = kUCTextBreakClusterMask;
    184   } else if(opt_word == TRUE) {
    185     macBreakType = kUCTextBreakWordMask;
    186   } else if(opt_line == TRUE) {
    187     macBreakType = kUCTextBreakLineMask;
    188   } else if(opt_sentence == TRUE) {
    189     // error
    190     // brkit = BreakIterator::createSentenceInstance(opt_locale, status);
    191   } else {
    192     // default is character iterator
    193     macBreakType = kUCTextBreakClusterMask;
    194       }
    195 }
    196 #endif
    197 
    198 void createICUBrkIt() {
    199   //
    200   //  Set up an ICU break iterator
    201   //
    202   UErrorCode          status = U_ZERO_ERROR;
    203   if(opt_char == TRUE) {
    204     brkit = BreakIterator::createCharacterInstance(opt_locale, status);
    205   } else if(opt_word == TRUE) {
    206     brkit = BreakIterator::createWordInstance(opt_locale, status);
    207   } else if(opt_line == TRUE) {
    208     brkit = BreakIterator::createLineInstance(opt_locale, status);
    209   } else if(opt_sentence == TRUE) {
    210     brkit = BreakIterator::createSentenceInstance(opt_locale, status);
    211   } else {
    212     // default is character iterator
    213     brkit = BreakIterator::createCharacterInstance(opt_locale, status);
    214   }
    215   if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
    216     fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
    217   }
    218   if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
    219     fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
    220   }
    221 
    222 }
    223 
    224 //---------------------------------------------------------------------------
    225 //
    226 //  ProcessOptions()    Function to read the command line options.
    227 //
    228 //---------------------------------------------------------------------------
    229 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
    230 {
    231     int         i;
    232     int         argNum;
    233     const char  *pArgName;
    234     OptSpec    *pOpt;
    235 
    236     for (argNum=1; argNum<argc; argNum++) {
    237         pArgName = argv[argNum];
    238         for (pOpt = opts;  pOpt->name != 0; pOpt++) {
    239             if (strcmp(pOpt->name, pArgName) == 0) {
    240                 switch (pOpt->type) {
    241                 case OptSpec::FLAG:
    242                     *(UBool *)(pOpt->pVar) = TRUE;
    243                     break;
    244                 case OptSpec::STRING:
    245                     argNum ++;
    246                     if (argNum >= argc) {
    247                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
    248                         return FALSE;
    249                     }
    250                     *(const char **)(pOpt->pVar)  = argv[argNum];
    251                     break;
    252                 case OptSpec::NUM:
    253                     argNum ++;
    254                     if (argNum >= argc) {
    255                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
    256                         return FALSE;
    257                     }
    258                     char *endp;
    259                     i = strtol(argv[argNum], &endp, 0);
    260                     if (endp == argv[argNum]) {
    261                         fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
    262                         return FALSE;
    263                     }
    264                     *(int *)(pOpt->pVar) = i;
    265                 }
    266                 break;
    267             }
    268         }
    269         if (pOpt->name == 0)
    270         {
    271             fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
    272             return FALSE;
    273         }
    274     }
    275 return TRUE;
    276 }
    277 
    278 
    279 void doForwardTest() {
    280   if (opt_terse == FALSE) {
    281     printf("Doing the forward test\n");
    282   }
    283   int32_t noBreaks = 0;
    284   int32_t i = 0;
    285   unsigned long startTime = timeGetTime();
    286   unsigned long elapsedTime = 0;
    287   if(opt_icu) {
    288     createICUBrkIt();
    289     brkit->setText(UnicodeString(text, textSize));
    290     brkit->first();
    291     if (opt_terse == FALSE) {
    292       printf("Warmup\n");
    293     }
    294     int j;
    295     while((j = brkit->next()) != BreakIterator::DONE) {
    296       noBreaks++;
    297       //fprintf(stderr, "%d ", j);
    298     }
    299 
    300     if (opt_terse == FALSE) {
    301       printf("Measure\n");
    302     }
    303     startTime = timeGetTime();
    304     for(i = 0; i < opt_loopCount; i++) {
    305       brkit->first();
    306       while(brkit->next() != BreakIterator::DONE) {
    307       }
    308     }
    309 
    310     elapsedTime = timeGetTime()-startTime;
    311   } else if(opt_mac) {
    312 #if U_PLATFORM_IS_DARWIN_BASED
    313     createMACBrkIt();
    314     UniChar* filePtr = text;
    315     OSStatus status = noErr;
    316     UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize;
    317     startOffset = 0;
    318     //printf("\t---Search forward--\n");
    319 
    320     while (startOffset < numUniChars)
    321     {
    322 	status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
    323                                startOffset, &breakOffset);
    324       //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status));
    325       //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset));
    326 
    327       // Output break
    328       //printf("\t%d\n", (int)breakOffset);
    329 
    330       // Increment counters
    331 	noBreaks++;
    332       startOffset = breakOffset;
    333     }
    334     startTime = timeGetTime();
    335     for(i = 0; i < opt_loopCount; i++) {
    336       startOffset = 0;
    337 
    338       while (startOffset < numUniChars)
    339 	{
    340 	  status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
    341 				   startOffset, &breakOffset);
    342 	  // Increment counters
    343 	  startOffset = breakOffset;
    344 	}
    345     }
    346     elapsedTime = timeGetTime()-startTime;
    347     UCDisposeTextBreakLocator(&breakRef);
    348 #endif
    349 
    350 
    351   }
    352 
    353 
    354   if (opt_terse == FALSE) {
    355   int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
    356       int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
    357       int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
    358       printf("forward break iteration average loop time %d\n", loopTime);
    359       printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
    360       printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
    361   } else {
    362       printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
    363   }
    364 
    365 
    366 }
    367 
    368 void doIsBoundTest() {
    369   int32_t noBreaks = 0, hit = 0;
    370   int32_t i = 0, j = 0;
    371   unsigned long startTime = timeGetTime();
    372   unsigned long elapsedTime = 0;
    373   createICUBrkIt();
    374   brkit->setText(UnicodeString(text, textSize));
    375   brkit->first();
    376   for(j = 0; j < textSize; j++) {
    377     if(brkit->isBoundary(j)) {
    378       noBreaks++;
    379       //fprintf(stderr, "%d ", j);
    380     }
    381   }
    382   /*
    383   while(brkit->next() != BreakIterator::DONE) {
    384     noBreaks++;
    385   }
    386   */
    387 
    388   startTime = timeGetTime();
    389   for(i = 0; i < opt_loopCount; i++) {
    390     for(j = 0; j < textSize; j++) {
    391       if(brkit->isBoundary(j)) {
    392         hit++;
    393       }
    394     }
    395   }
    396 
    397   elapsedTime = timeGetTime()-startTime;
    398   int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
    399   if (opt_terse == FALSE) {
    400       int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
    401       int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
    402       printf("forward break iteration average loop time %d\n", loopTime);
    403       printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
    404       printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
    405   } else {
    406       printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
    407   }
    408 }
    409 
    410 //----------------------------------------------------------------------------------------
    411 //
    412 //   UnixConvert   -- Convert the lines of the file to the encoding for UNIX
    413 //                    Since it appears that Unicode support is going in the general
    414 //                    direction of the use of UTF-8 locales, that is the approach
    415 //                    that is used here.
    416 //
    417 //----------------------------------------------------------------------------------------
    418 void  UnixConvert() {
    419 #if 0
    420     int    line;
    421 
    422     UConverter   *cvrtr;    // An ICU code page converter.
    423     UErrorCode    status = U_ZERO_ERROR;
    424 
    425 
    426     cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
    427     if (U_FAILURE(status)) {
    428         fprintf(stderr, "ICU Converter open failed.: %d\n", &status);
    429         exit(-1);
    430     }
    431     // redo for unix
    432     for (line=0; line < gNumFileLines; line++) {
    433         int sizeNeeded = ucnv_fromUChars(cvrtr,
    434                                          0,            // ptr to target buffer.
    435                                          0,            // length of target buffer.
    436                                          gFileLines[line].name,
    437                                          -1,           //  source is null terminated
    438                                          &status);
    439         if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
    440             fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
    441             exit(-1);
    442         }
    443         status = U_ZERO_ERROR;
    444         gFileLines[line].unixName = new char[sizeNeeded+1];
    445         sizeNeeded = ucnv_fromUChars(cvrtr,
    446                                          gFileLines[line].unixName, // ptr to target buffer.
    447                                          sizeNeeded+1, // length of target buffer.
    448                                          gFileLines[line].name,
    449                                          -1,           //  source is null terminated
    450                                          &status);
    451         if (U_FAILURE(status)) {
    452             fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
    453             exit(-1);
    454         }
    455         gFileLines[line].unixName[sizeNeeded] = 0;
    456     };
    457     ucnv_close(cvrtr);
    458 #endif
    459 }
    460 
    461 
    462 //----------------------------------------------------------------------------------------
    463 //
    464 //  class UCharFile   Class to hide all the gorp to read a file in
    465 //                    and produce a stream of UChars.
    466 //
    467 //----------------------------------------------------------------------------------------
    468 class UCharFile {
    469 public:
    470     UCharFile(const char *fileName);
    471     ~UCharFile();
    472     UChar   get();
    473     UBool   eof() {return fEof;};
    474     UBool   error() {return fError;};
    475     int32_t size() { return fFileSize; };
    476 
    477 private:
    478     UCharFile (const UCharFile &other) {};                         // No copy constructor.
    479     UCharFile & operator = (const UCharFile &other) {return *this;};   // No assignment op
    480 
    481     FILE         *fFile;
    482     const char   *fName;
    483     UBool        fEof;
    484     UBool        fError;
    485     UChar        fPending2ndSurrogate;
    486     int32_t      fFileSize;
    487 
    488     enum {UTF16LE, UTF16BE, UTF8} fEncoding;
    489 };
    490 
    491 UCharFile::UCharFile(const char * fileName) {
    492     fEof                 = FALSE;
    493     fError               = FALSE;
    494     fName                = fileName;
    495     struct stat buf;
    496     int32_t result = stat(fileName, &buf);
    497     if(result != 0) {
    498       fprintf(stderr, "Error getting info\n");
    499       fFileSize = -1;
    500     } else {
    501       fFileSize = buf.st_size;
    502     }
    503     fFile                = fopen(fName, "rb");
    504     fPending2ndSurrogate = 0;
    505     if (fFile == NULL) {
    506         fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
    507         fError = TRUE;
    508         return;
    509     }
    510     //
    511     //  Look for the byte order mark at the start of the file.
    512     //
    513     int BOMC1, BOMC2, BOMC3;
    514     BOMC1 = fgetc(fFile);
    515     BOMC2 = fgetc(fFile);
    516 
    517     if (BOMC1 == 0xff && BOMC2 == 0xfe) {
    518         fEncoding = UTF16LE; }
    519     else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
    520         fEncoding = UTF16BE; }
    521     else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
    522         fEncoding = UTF8; }
    523     else
    524     {
    525         fprintf(stderr, "collperf:  file \"%s\" encoding must be UTF-8 or UTF-16, and "
    526             "must include a BOM.\n", fileName);
    527         fError = true;
    528         return;
    529     }
    530 }
    531 
    532 
    533 UCharFile::~UCharFile() {
    534     fclose(fFile);
    535 }
    536 
    537 
    538 
    539 UChar UCharFile::get() {
    540     UChar   c;
    541     switch (fEncoding) {
    542     case UTF16LE:
    543         {
    544             int  cL, cH;
    545             cL = fgetc(fFile);
    546             cH = fgetc(fFile);
    547             c  = cL  | (cH << 8);
    548             if (cH == EOF) {
    549                 c   = 0;
    550                 fEof = TRUE;
    551             }
    552             break;
    553         }
    554     case UTF16BE:
    555         {
    556             int  cL, cH;
    557             cH = fgetc(fFile);
    558             cL = fgetc(fFile);
    559             c  = cL  | (cH << 8);
    560             if (cL == EOF) {
    561                 c   = 0;
    562                 fEof = TRUE;
    563             }
    564             break;
    565         }
    566     case UTF8:
    567         {
    568             if (fPending2ndSurrogate != 0) {
    569                 c = fPending2ndSurrogate;
    570                 fPending2ndSurrogate = 0;
    571                 break;
    572             }
    573 
    574             int ch = fgetc(fFile);   // Note:  c and ch are separate cause eof test doesn't work on UChar type.
    575             if (ch == EOF) {
    576                 c = 0;
    577                 fEof = TRUE;
    578                 break;
    579             }
    580 
    581             if (ch <= 0x7f) {
    582                 // It's ascii.  No further utf-8 conversion.
    583                 c = ch;
    584                 break;
    585             }
    586 
    587             // Figure out the lenght of the char and read the rest of the bytes
    588             //   into a temp array.
    589             int nBytes;
    590             if (ch >= 0xF0) {nBytes=4;}
    591             else if (ch >= 0xE0) {nBytes=3;}
    592             else if (ch >= 0xC0) {nBytes=2;}
    593             else {
    594                 fprintf(stderr, "not likely utf-8 encoded file %s contains corrupt data at offset %d.\n", fName, ftell(fFile));
    595                 fError = TRUE;
    596                 return 0;
    597             }
    598 
    599             unsigned char  bytes[10];
    600             bytes[0] = (unsigned char)ch;
    601             int i;
    602             for (i=1; i<nBytes; i++) {
    603                 bytes[i] = fgetc(fFile);
    604                 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
    605                     fprintf(stderr, "utf-8 encoded file %s contains corrupt data at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName, ftell(fFile), nBytes, i, ch);
    606                     fError = TRUE;
    607                     return 0;
    608                 }
    609             }
    610 
    611             // Convert the bytes from the temp array to a Unicode char.
    612             i = 0;
    613             uint32_t  cp;
    614             U8_NEXT_UNSAFE(bytes, i, cp);
    615             c = (UChar)cp;
    616 
    617             if (cp >= 0x10000) {
    618                 // The code point needs to be broken up into a utf-16 surrogate pair.
    619                 //  Process first half this time through the main loop, and
    620                 //   remember the other half for the next time through.
    621                 UChar utf16Buf[3];
    622                 i = 0;
    623                 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
    624                 fPending2ndSurrogate = utf16Buf[1];
    625                 c = utf16Buf[0];
    626             }
    627             break;
    628         };
    629     }
    630     return c;
    631 }
    632 
    633 
    634 //----------------------------------------------------------------------------------------
    635 //
    636 //    Main   --  process command line, read in and pre-process the test file,
    637 //                 call other functions to do the actual tests.
    638 //
    639 //----------------------------------------------------------------------------------------
    640 int main(int argc, const char** argv) {
    641     if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
    642         printf(gUsageString);
    643         exit (1);
    644     }
    645     // Make sure that we've only got one API selected.
    646     if (opt_mac || opt_unix || opt_win) opt_icu = FALSE;
    647     if (opt_mac || opt_unix) opt_win = FALSE;
    648     if (opt_mac) opt_unix = FALSE;
    649 
    650     UErrorCode          status = U_ZERO_ERROR;
    651 
    652 
    653 
    654     //
    655     //  Set up a Windows LCID
    656     //
    657   /*
    658     if (opt_langid != 0) {
    659         gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
    660     }
    661     else {
    662         gWinLCID = uloc_getLCID(opt_locale);
    663     }
    664   */
    665 
    666     //
    667     //  Set the UNIX locale
    668     //
    669     if (opt_unix) {
    670         if (setlocale(LC_ALL, opt_locale) == 0) {
    671             fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
    672             exit(-1);
    673         }
    674     }
    675 
    676     // Read in  the input file.
    677     //   File assumed to be utf-16.
    678     //   Lines go onto heap buffers.  Global index array to line starts is created.
    679     //   Lines themselves are null terminated.
    680     //
    681 
    682     UCharFile f(opt_fName);
    683     if (f.error()) {
    684         exit(-1);
    685     }
    686     int32_t fileSize = f.size();
    687     const int STARTSIZE = 70000;
    688     int32_t bufSize = 0;
    689     int32_t charCount = 0;
    690     if(fileSize != -1) {
    691       text = (UChar *)malloc(fileSize*sizeof(UChar));
    692       bufSize = fileSize;
    693     } else {
    694       text = (UChar *)malloc(STARTSIZE*sizeof(UChar));
    695       bufSize = STARTSIZE;
    696     }
    697     if(text == NULL) {
    698       fprintf(stderr, "Allocating buffer failed\n");
    699       exit(-1);
    700     }
    701 
    702 
    703     //  Read the file, split into lines, and save in memory.
    704     //  Loop runs once per utf-16 value from the input file,
    705     //    (The number of bytes read from file per loop iteration depends on external encoding.)
    706     for (;;) {
    707 
    708         UChar c = f.get();
    709         if(f.eof()) {
    710           break;
    711         }
    712         if (f.error()){
    713           exit(-1);
    714         }
    715         // We now have a good UTF-16 value in c.
    716         text[charCount++] = c;
    717         if(charCount == bufSize) {
    718           text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar));
    719           if(text == NULL) {
    720             fprintf(stderr, "Reallocating buffer failed\n");
    721             exit(-1);
    722           }
    723           bufSize *= 2;
    724         }
    725     }
    726 
    727 
    728     if (opt_terse == FALSE) {
    729         printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount);
    730     }
    731 
    732     textSize = charCount;
    733 
    734 
    735 
    736 
    737     //
    738     //  Dump file contents if requested.
    739     //
    740     if (opt_dump) {
    741       // dump file, etc... possibly
    742     }
    743 
    744 
    745     //
    746     //  We've got the file read into memory.  Go do something with it.
    747     //
    748     int32_t i = 0;
    749     for(i = 0; i < opt_passesCount; i++) {
    750       if(opt_loopCount != 0) {
    751         if(opt_next) {
    752           doForwardTest();
    753         } else if(opt_isBound) {
    754           doIsBoundTest();
    755         } else {
    756           doForwardTest();
    757         }
    758       } else if(opt_time != 0) {
    759 
    760       }
    761     }
    762 
    763   if(text != NULL) {
    764     free(text);
    765   }
    766     if(brkit != NULL) {
    767       delete brkit;
    768     }
    769 
    770     return 0;
    771 }
    772