1 /*********************************************************************** 2 * 2016 and later: Unicode, Inc. and others. 3 * License & terms of use: http://www.unicode.org/copyright.html#License 4 * 5 *********************************************************************** 6 *********************************************************************** 7 * COPYRIGHT: 8 * Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved. 9 * 10 ***********************************************************************/ 11 /******************************************************************************** 12 * 13 * File ubrkperf.cpp 14 * 15 * Modification History: 16 * Name Description 17 * Vladimir Weinstein First Version, based on collperf 18 * 19 ********************************************************************************* 20 */ 21 22 // 23 // This program tests break iterator performance 24 // Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs 25 // (if any) 26 // A text file is required as input. It must be in utf-8 or utf-16 format, 27 // and include a byte order mark. Either LE or BE format is OK. 28 // 29 30 const char gUsageString[] = 31 "usage: ubrkperf options...\n" 32 "-help Display this message.\n" 33 "-file file_name utf-16/utf-8 format file.\n" 34 "-locale name ICU locale to use. Default is en_US\n" 35 "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n" 36 " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n" 37 "-win Run test using Windows native services. (currently not working) (ICU is default)\n" 38 "-unix Run test using Unix word breaking services. (currently not working) \n" 39 "-mac Run test using MacOSX word breaking services.\n" 40 "-uselen Use API with string lengths. 
Default is null-terminated strings\n" 41 "-char Use character break iterator\n" 42 "-word Use word break iterator\n" 43 "-line Use line break iterator\n" 44 "-sentence Use sentence break iterator\n" 45 "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n" 46 "-iloop n Inner Loop Count. Default = 1. Number of calls to function\n" 47 " under test at each call point. For measuring test overhead.\n" 48 "-terse Terse numbers-only output. Intended for use by scripts.\n" 49 "-dump Display stuff.\n" 50 "-capi Use C APIs instead of C++ APIs (currently not working)\n" 51 "-next Do the next test\n" 52 "-isBound Do the isBound test\n" 53 ; 54 55 56 #include <stdio.h> 57 #include <string.h> 58 #include <stdlib.h> 59 #include <math.h> 60 #include <locale.h> 61 #include <errno.h> 62 #include <sys/stat.h> 63 64 #include <unicode/utypes.h> 65 #include <unicode/ucol.h> 66 #include <unicode/ucoleitr.h> 67 #include <unicode/uloc.h> 68 #include <unicode/ustring.h> 69 #include <unicode/ures.h> 70 #include <unicode/uchar.h> 71 #include <unicode/ucnv.h> 72 #include <unicode/utf8.h> 73 74 #include <unicode/brkiter.h> 75 76 77 #if U_PLATFORM_HAS_WIN32_API 78 #include <windows.h> 79 #else 80 // 81 // Stubs for Windows API functions when building on UNIXes. 82 // 83 #include <sys/time.h> 84 unsigned long timeGetTime() { 85 struct timeval t; 86 gettimeofday(&t, 0); 87 unsigned long val = t.tv_sec * 1000; // Let it overflow. Who cares. 88 val += t.tv_usec / 1000; 89 return val; 90 }; 91 #define MAKELCID(a,b) 0 92 #endif 93 94 95 // 96 // Command line option variables 97 // These global variables are set according to the options specified 98 // on the command line by the user. 99 char * opt_fName = 0; 100 char * opt_locale = "en_US"; 101 int opt_langid = 0; // Defaults to value corresponding to opt_locale. 
102 char * opt_rules = 0; 103 UBool opt_help = FALSE; 104 int opt_time = 0; 105 int opt_loopCount = 0; 106 int opt_passesCount= 1; 107 UBool opt_terse = FALSE; 108 UBool opt_icu = TRUE; 109 UBool opt_win = FALSE; // Run with Windows native functions. 110 UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions. 111 UBool opt_mac = FALSE; // Run with MacOSX word break services. 112 UBool opt_uselen = FALSE; 113 UBool opt_dump = FALSE; 114 UBool opt_char = FALSE; 115 UBool opt_word = FALSE; 116 UBool opt_line = FALSE; 117 UBool opt_sentence = FALSE; 118 UBool opt_capi = FALSE; 119 120 UBool opt_next = FALSE; 121 UBool opt_isBound = FALSE; 122 123 124 125 // 126 // Definitions for the command line options 127 // 128 struct OptSpec { 129 const char *name; 130 enum {FLAG, NUM, STRING} type; 131 void *pVar; 132 }; 133 134 OptSpec opts[] = { 135 {"-file", OptSpec::STRING, &opt_fName}, 136 {"-locale", OptSpec::STRING, &opt_locale}, 137 {"-langid", OptSpec::NUM, &opt_langid}, 138 {"-win", OptSpec::FLAG, &opt_win}, 139 {"-unix", OptSpec::FLAG, &opt_unix}, 140 {"-mac", OptSpec::FLAG, &opt_mac}, 141 {"-uselen", OptSpec::FLAG, &opt_uselen}, 142 {"-loop", OptSpec::NUM, &opt_loopCount}, 143 {"-time", OptSpec::NUM, &opt_time}, 144 {"-passes", OptSpec::NUM, &opt_passesCount}, 145 {"-char", OptSpec::FLAG, &opt_char}, 146 {"-word", OptSpec::FLAG, &opt_word}, 147 {"-line", OptSpec::FLAG, &opt_line}, 148 {"-sentence", OptSpec::FLAG, &opt_sentence}, 149 {"-terse", OptSpec::FLAG, &opt_terse}, 150 {"-dump", OptSpec::FLAG, &opt_dump}, 151 {"-capi", OptSpec::FLAG, &opt_capi}, 152 {"-next", OptSpec::FLAG, &opt_next}, 153 {"-isBound", OptSpec::FLAG, &opt_isBound}, 154 {"-help", OptSpec::FLAG, &opt_help}, 155 {"-?", OptSpec::FLAG, &opt_help}, 156 {0, OptSpec::FLAG, 0} 157 }; 158 159 160 //--------------------------------------------------------------------------- 161 // 162 // Global variables pointing to and describing the test file 163 // 164 
//--------------------------------------------------------------------------- 165 166 //DWORD gWinLCID; 167 BreakIterator *brkit = NULL; 168 UChar *text = NULL; 169 int32_t textSize = 0; 170 171 172 173 #if U_PLATFORM_IS_DARWIN_BASED 174 #include <ApplicationServices/ApplicationServices.h> 175 enum{ 176 kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask) 177 }; 178 UCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask}; 179 TextBreakLocatorRef breakRef; 180 UCTextBreakType macBreakType; 181 182 void createMACBrkIt() { 183 OSStatus status = noErr; 184 LocaleRef lref; 185 status = LocaleRefFromLocaleString(opt_locale, &lref); 186 status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, (TextBreakLocatorRef*)&breakRef); 187 if(opt_char == TRUE) { 188 macBreakType = kUCTextBreakClusterMask; 189 } else if(opt_word == TRUE) { 190 macBreakType = kUCTextBreakWordMask; 191 } else if(opt_line == TRUE) { 192 macBreakType = kUCTextBreakLineMask; 193 } else if(opt_sentence == TRUE) { 194 // error 195 // brkit = BreakIterator::createSentenceInstance(opt_locale, status); 196 } else { 197 // default is character iterator 198 macBreakType = kUCTextBreakClusterMask; 199 } 200 } 201 #endif 202 203 void createICUBrkIt() { 204 // 205 // Set up an ICU break iterator 206 // 207 UErrorCode status = U_ZERO_ERROR; 208 if(opt_char == TRUE) { 209 brkit = BreakIterator::createCharacterInstance(opt_locale, status); 210 } else if(opt_word == TRUE) { 211 brkit = BreakIterator::createWordInstance(opt_locale, status); 212 } else if(opt_line == TRUE) { 213 brkit = BreakIterator::createLineInstance(opt_locale, status); 214 } else if(opt_sentence == TRUE) { 215 brkit = BreakIterator::createSentenceInstance(opt_locale, status); 216 } else { 217 // default is character iterator 218 brkit = BreakIterator::createCharacterInstance(opt_locale, status); 219 } 220 if 
(status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) { 221 fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale); 222 } 223 if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) { 224 fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale); 225 } 226 227 } 228 229 //--------------------------------------------------------------------------- 230 // 231 // ProcessOptions() Function to read the command line options. 232 // 233 //--------------------------------------------------------------------------- 234 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[]) 235 { 236 int i; 237 int argNum; 238 const char *pArgName; 239 OptSpec *pOpt; 240 241 for (argNum=1; argNum<argc; argNum++) { 242 pArgName = argv[argNum]; 243 for (pOpt = opts; pOpt->name != 0; pOpt++) { 244 if (strcmp(pOpt->name, pArgName) == 0) { 245 switch (pOpt->type) { 246 case OptSpec::FLAG: 247 *(UBool *)(pOpt->pVar) = TRUE; 248 break; 249 case OptSpec::STRING: 250 argNum ++; 251 if (argNum >= argc) { 252 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); 253 return FALSE; 254 } 255 *(const char **)(pOpt->pVar) = argv[argNum]; 256 break; 257 case OptSpec::NUM: 258 argNum ++; 259 if (argNum >= argc) { 260 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); 261 return FALSE; 262 } 263 char *endp; 264 i = strtol(argv[argNum], &endp, 0); 265 if (endp == argv[argNum]) { 266 fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name); 267 return FALSE; 268 } 269 *(int *)(pOpt->pVar) = i; 270 } 271 break; 272 } 273 } 274 if (pOpt->name == 0) 275 { 276 fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName); 277 return FALSE; 278 } 279 } 280 return TRUE; 281 } 282 283 284 void doForwardTest() { 285 if (opt_terse == FALSE) { 286 printf("Doing the forward test\n"); 287 } 288 int32_t noBreaks = 0; 289 int32_t i = 0; 290 unsigned long startTime = timeGetTime(); 291 unsigned long elapsedTime = 0; 292 if(opt_icu) { 293 
createICUBrkIt(); 294 brkit->setText(UnicodeString(text, textSize)); 295 brkit->first(); 296 if (opt_terse == FALSE) { 297 printf("Warmup\n"); 298 } 299 int j; 300 while((j = brkit->next()) != BreakIterator::DONE) { 301 noBreaks++; 302 //fprintf(stderr, "%d ", j); 303 } 304 305 if (opt_terse == FALSE) { 306 printf("Measure\n"); 307 } 308 startTime = timeGetTime(); 309 for(i = 0; i < opt_loopCount; i++) { 310 brkit->first(); 311 while(brkit->next() != BreakIterator::DONE) { 312 } 313 } 314 315 elapsedTime = timeGetTime()-startTime; 316 } else if(opt_mac) { 317 #if U_PLATFORM_IS_DARWIN_BASED 318 createMACBrkIt(); 319 UniChar* filePtr = text; 320 OSStatus status = noErr; 321 UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize; 322 startOffset = 0; 323 //printf("\t---Search forward--\n"); 324 325 while (startOffset < numUniChars) 326 { 327 status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars, 328 startOffset, &breakOffset); 329 //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status)); 330 //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset)); 331 332 // Output break 333 //printf("\t%d\n", (int)breakOffset); 334 335 // Increment counters 336 noBreaks++; 337 startOffset = breakOffset; 338 } 339 startTime = timeGetTime(); 340 for(i = 0; i < opt_loopCount; i++) { 341 startOffset = 0; 342 343 while (startOffset < numUniChars) 344 { 345 status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars, 346 startOffset, &breakOffset); 347 // Increment counters 348 startOffset = breakOffset; 349 } 350 } 351 elapsedTime = timeGetTime()-startTime; 352 UCDisposeTextBreakLocator(&breakRef); 353 #endif 354 355 356 } 357 358 359 if (opt_terse == FALSE) { 360 int32_t loopTime = (int)(float(1000) 
* ((float)elapsedTime/(float)opt_loopCount)); 361 int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize)); 362 int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks)); 363 printf("forward break iteration average loop time %d\n", loopTime); 364 printf("number of code units %d average time per code unit %d\n", textSize, timePerCU); 365 printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak); 366 } else { 367 printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize); 368 } 369 370 371 } 372 373 void doIsBoundTest() { 374 int32_t noBreaks = 0, hit = 0; 375 int32_t i = 0, j = 0; 376 unsigned long startTime = timeGetTime(); 377 unsigned long elapsedTime = 0; 378 createICUBrkIt(); 379 brkit->setText(UnicodeString(text, textSize)); 380 brkit->first(); 381 for(j = 0; j < textSize; j++) { 382 if(brkit->isBoundary(j)) { 383 noBreaks++; 384 //fprintf(stderr, "%d ", j); 385 } 386 } 387 /* 388 while(brkit->next() != BreakIterator::DONE) { 389 noBreaks++; 390 } 391 */ 392 393 startTime = timeGetTime(); 394 for(i = 0; i < opt_loopCount; i++) { 395 for(j = 0; j < textSize; j++) { 396 if(brkit->isBoundary(j)) { 397 hit++; 398 } 399 } 400 } 401 402 elapsedTime = timeGetTime()-startTime; 403 int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount)); 404 if (opt_terse == FALSE) { 405 int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize)); 406 int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks)); 407 printf("forward break iteration average loop time %d\n", loopTime); 408 printf("number of code units %d average time per code unit %d\n", textSize, timePerCU); 409 printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak); 410 } else { 411 printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize); 412 } 413 } 414 415 
//----------------------------------------------------------------------------------------
//
//   UnixConvert   -- Convert the lines of the file to the encoding for UNIX
//                    Since it appears that Unicode support is going in the general
//                    direction of the use of UTF-8 locales, that is the approach
//                    that is used here.
//
//                    The whole body is compiled out with #if 0, so calling this
//                    function currently does nothing.  The disabled code refers
//                    to gNumFileLines and gFileLines, which are not defined in
//                    the visible part of this file.
//
//----------------------------------------------------------------------------------------
void UnixConvert() {
#if 0
    int    line;

    UConverter   *cvrtr;    // An ICU code page converter.
    UErrorCode    status = U_ZERO_ERROR;


    cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
    if (U_FAILURE(status)) {
        // Print the error code itself, not the address of the variable.
        fprintf(stderr, "ICU Converter open failed.: %d\n", status);
        exit(-1);
    }
    // redo for unix
    for (line=0; line < gNumFileLines; line++) {
        // First call sizes the output (preflight), second call converts.
        int sizeNeeded = ucnv_fromUChars(cvrtr,
                                         0,            // ptr to target buffer.
                                         0,            // length of target buffer.
                                         gFileLines[line].name,
                                         -1,           //  source is null terminated
                                         &status);
        if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
            fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
            exit(-1);
        }
        status = U_ZERO_ERROR;
        gFileLines[line].unixName = new char[sizeNeeded+1];
        sizeNeeded = ucnv_fromUChars(cvrtr,
                                     gFileLines[line].unixName, // ptr to target buffer.
                                     sizeNeeded+1, // length of target buffer.
                                     gFileLines[line].name,
                                     -1,           //  source is null terminated
                                     &status);
        if (U_FAILURE(status)) {
            fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
            exit(-1);
        }
        gFileLines[line].unixName[sizeNeeded] = 0;
    }
    ucnv_close(cvrtr);
#endif
}


//----------------------------------------------------------------------------------------
//
//    class UCharFile   Class to hide all the gorp to read a file in
//                      and produce a stream of UChars.
471 // 472 //---------------------------------------------------------------------------------------- 473 class UCharFile { 474 public: 475 UCharFile(const char *fileName); 476 ~UCharFile(); 477 UChar get(); 478 UBool eof() {return fEof;}; 479 UBool error() {return fError;}; 480 int32_t size() { return fFileSize; }; 481 482 private: 483 UCharFile (const UCharFile &other) {}; // No copy constructor. 484 UCharFile & operator = (const UCharFile &other) {return *this;}; // No assignment op 485 486 FILE *fFile; 487 const char *fName; 488 UBool fEof; 489 UBool fError; 490 UChar fPending2ndSurrogate; 491 int32_t fFileSize; 492 493 enum {UTF16LE, UTF16BE, UTF8} fEncoding; 494 }; 495 496 UCharFile::UCharFile(const char * fileName) { 497 fEof = FALSE; 498 fError = FALSE; 499 fName = fileName; 500 struct stat buf; 501 int32_t result = stat(fileName, &buf); 502 if(result != 0) { 503 fprintf(stderr, "Error getting info\n"); 504 fFileSize = -1; 505 } else { 506 fFileSize = buf.st_size; 507 } 508 fFile = fopen(fName, "rb"); 509 fPending2ndSurrogate = 0; 510 if (fFile == NULL) { 511 fprintf(stderr, "Can not open file \"%s\"\n", opt_fName); 512 fError = TRUE; 513 return; 514 } 515 // 516 // Look for the byte order mark at the start of the file. 
517 // 518 int BOMC1, BOMC2, BOMC3; 519 BOMC1 = fgetc(fFile); 520 BOMC2 = fgetc(fFile); 521 522 if (BOMC1 == 0xff && BOMC2 == 0xfe) { 523 fEncoding = UTF16LE; } 524 else if (BOMC1 == 0xfe && BOMC2 == 0xff) { 525 fEncoding = UTF16BE; } 526 else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) { 527 fEncoding = UTF8; } 528 else 529 { 530 fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and " 531 "must include a BOM.\n", fileName); 532 fError = true; 533 return; 534 } 535 } 536 537 538 UCharFile::~UCharFile() { 539 fclose(fFile); 540 } 541 542 543 544 UChar UCharFile::get() { 545 UChar c; 546 switch (fEncoding) { 547 case UTF16LE: 548 { 549 int cL, cH; 550 cL = fgetc(fFile); 551 cH = fgetc(fFile); 552 c = cL | (cH << 8); 553 if (cH == EOF) { 554 c = 0; 555 fEof = TRUE; 556 } 557 break; 558 } 559 case UTF16BE: 560 { 561 int cL, cH; 562 cH = fgetc(fFile); 563 cL = fgetc(fFile); 564 c = cL | (cH << 8); 565 if (cL == EOF) { 566 c = 0; 567 fEof = TRUE; 568 } 569 break; 570 } 571 case UTF8: 572 { 573 if (fPending2ndSurrogate != 0) { 574 c = fPending2ndSurrogate; 575 fPending2ndSurrogate = 0; 576 break; 577 } 578 579 int ch = fgetc(fFile); // Note: c and ch are separate cause eof test doesn't work on UChar type. 580 if (ch == EOF) { 581 c = 0; 582 fEof = TRUE; 583 break; 584 } 585 586 if (ch <= 0x7f) { 587 // It's ascii. No further utf-8 conversion. 588 c = ch; 589 break; 590 } 591 592 // Figure out the lenght of the char and read the rest of the bytes 593 // into a temp array. 
594 int nBytes; 595 if (ch >= 0xF0) {nBytes=4;} 596 else if (ch >= 0xE0) {nBytes=3;} 597 else if (ch >= 0xC0) {nBytes=2;} 598 else { 599 fprintf(stderr, "not likely utf-8 encoded file %s contains corrupt data at offset %d.\n", fName, ftell(fFile)); 600 fError = TRUE; 601 return 0; 602 } 603 604 unsigned char bytes[10]; 605 bytes[0] = (unsigned char)ch; 606 int i; 607 for (i=1; i<nBytes; i++) { 608 bytes[i] = fgetc(fFile); 609 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) { 610 fprintf(stderr, "utf-8 encoded file %s contains corrupt data at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName, ftell(fFile), nBytes, i, ch); 611 fError = TRUE; 612 return 0; 613 } 614 } 615 616 // Convert the bytes from the temp array to a Unicode char. 617 i = 0; 618 uint32_t cp; 619 U8_NEXT_UNSAFE(bytes, i, cp); 620 c = (UChar)cp; 621 622 if (cp >= 0x10000) { 623 // The code point needs to be broken up into a utf-16 surrogate pair. 624 // Process first half this time through the main loop, and 625 // remember the other half for the next time through. 626 UChar utf16Buf[3]; 627 i = 0; 628 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp); 629 fPending2ndSurrogate = utf16Buf[1]; 630 c = utf16Buf[0]; 631 } 632 break; 633 }; 634 } 635 return c; 636 } 637 638 639 //---------------------------------------------------------------------------------------- 640 // 641 // Main -- process command line, read in and pre-process the test file, 642 // call other functions to do the actual tests. 643 // 644 //---------------------------------------------------------------------------------------- 645 int main(int argc, const char** argv) { 646 if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) { 647 printf(gUsageString); 648 exit (1); 649 } 650 // Make sure that we've only got one API selected. 
651 if (opt_mac || opt_unix || opt_win) opt_icu = FALSE; 652 if (opt_mac || opt_unix) opt_win = FALSE; 653 if (opt_mac) opt_unix = FALSE; 654 655 UErrorCode status = U_ZERO_ERROR; 656 657 658 659 // 660 // Set up a Windows LCID 661 // 662 /* 663 if (opt_langid != 0) { 664 gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT); 665 } 666 else { 667 gWinLCID = uloc_getLCID(opt_locale); 668 } 669 */ 670 671 // 672 // Set the UNIX locale 673 // 674 if (opt_unix) { 675 if (setlocale(LC_ALL, opt_locale) == 0) { 676 fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale); 677 exit(-1); 678 } 679 } 680 681 // Read in the input file. 682 // File assumed to be utf-16. 683 // Lines go onto heap buffers. Global index array to line starts is created. 684 // Lines themselves are null terminated. 685 // 686 687 UCharFile f(opt_fName); 688 if (f.error()) { 689 exit(-1); 690 } 691 int32_t fileSize = f.size(); 692 const int STARTSIZE = 70000; 693 int32_t bufSize = 0; 694 int32_t charCount = 0; 695 if(fileSize != -1) { 696 text = (UChar *)malloc(fileSize*sizeof(UChar)); 697 bufSize = fileSize; 698 } else { 699 text = (UChar *)malloc(STARTSIZE*sizeof(UChar)); 700 bufSize = STARTSIZE; 701 } 702 if(text == NULL) { 703 fprintf(stderr, "Allocating buffer failed\n"); 704 exit(-1); 705 } 706 707 708 // Read the file, split into lines, and save in memory. 709 // Loop runs once per utf-16 value from the input file, 710 // (The number of bytes read from file per loop iteration depends on external encoding.) 711 for (;;) { 712 713 UChar c = f.get(); 714 if(f.eof()) { 715 break; 716 } 717 if (f.error()){ 718 exit(-1); 719 } 720 // We now have a good UTF-16 value in c. 
721 text[charCount++] = c; 722 if(charCount == bufSize) { 723 text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar)); 724 if(text == NULL) { 725 fprintf(stderr, "Reallocating buffer failed\n"); 726 exit(-1); 727 } 728 bufSize *= 2; 729 } 730 } 731 732 733 if (opt_terse == FALSE) { 734 printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount); 735 } 736 737 textSize = charCount; 738 739 740 741 742 // 743 // Dump file contents if requested. 744 // 745 if (opt_dump) { 746 // dump file, etc... possibly 747 } 748 749 750 // 751 // We've got the file read into memory. Go do something with it. 752 // 753 int32_t i = 0; 754 for(i = 0; i < opt_passesCount; i++) { 755 if(opt_loopCount != 0) { 756 if(opt_next) { 757 doForwardTest(); 758 } else if(opt_isBound) { 759 doIsBoundTest(); 760 } else { 761 doForwardTest(); 762 } 763 } else if(opt_time != 0) { 764 765 } 766 } 767 768 if(text != NULL) { 769 free(text); 770 } 771 if(brkit != NULL) { 772 delete brkit; 773 } 774 775 return 0; 776 } 777