1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved. 4 * 5 ********************************************************************/ 6 /******************************************************************************** 7 * 8 * File ubrkperf.cpp 9 * 10 * Modification History: 11 * Name Description 12 * Vladimir Weinstein First Version, based on collperf 13 * 14 ********************************************************************************* 15 */ 16 17 // 18 // This program tests break iterator performance 19 // Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs 20 // (if any) 21 // A text file is required as input. It must be in utf-8 or utf-16 format, 22 // and include a byte order mark. Either LE or BE format is OK. 23 // 24 25 const char gUsageString[] = 26 "usage: ubrkperf options...\n" 27 "-help Display this message.\n" 28 "-file file_name utf-16/utf-8 format file.\n" 29 "-locale name ICU locale to use. Default is en_US\n" 30 "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n" 31 " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n" 32 "-win Run test using Windows native services. (currently not working) (ICU is default)\n" 33 "-unix Run test using Unix word breaking services. (currently not working) \n" 34 "-mac Run test using MacOSX word breaking services.\n" 35 "-uselen Use API with string lengths. Default is null-terminated strings\n" 36 "-char Use character break iterator\n" 37 "-word Use word break iterator\n" 38 "-line Use line break iterator\n" 39 "-sentence Use sentence break iterator\n" 40 "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n" 41 "-iloop n Inner Loop Count. Default = 1. Number of calls to function\n" 42 " under test at each call point. For measuring test overhead.\n" 43 "-terse Terse numbers-only output. 
Intended for use by scripts.\n" 44 "-dump Display stuff.\n" 45 "-capi Use C APIs instead of C++ APIs (currently not working)\n" 46 "-next Do the next test\n" 47 "-isBound Do the isBound test\n" 48 ; 49 50 51 #include <stdio.h> 52 #include <string.h> 53 #include <stdlib.h> 54 #include <math.h> 55 #include <locale.h> 56 #include <errno.h> 57 #include <sys/stat.h> 58 59 #include <unicode/utypes.h> 60 #include <unicode/ucol.h> 61 #include <unicode/ucoleitr.h> 62 #include <unicode/uloc.h> 63 #include <unicode/ustring.h> 64 #include <unicode/ures.h> 65 #include <unicode/uchar.h> 66 #include <unicode/ucnv.h> 67 #include <unicode/utf8.h> 68 69 #include <unicode/brkiter.h> 70 71 72 #if U_PLATFORM_HAS_WIN32_API 73 #include <windows.h> 74 #else 75 // 76 // Stubs for Windows API functions when building on UNIXes. 77 // 78 #include <sys/time.h> 79 unsigned long timeGetTime() { 80 struct timeval t; 81 gettimeofday(&t, 0); 82 unsigned long val = t.tv_sec * 1000; // Let it overflow. Who cares. 83 val += t.tv_usec / 1000; 84 return val; 85 }; 86 #define MAKELCID(a,b) 0 87 #endif 88 89 90 // 91 // Command line option variables 92 // These global variables are set according to the options specified 93 // on the command line by the user. 94 char * opt_fName = 0; 95 char * opt_locale = "en_US"; 96 int opt_langid = 0; // Defaults to value corresponding to opt_locale. 97 char * opt_rules = 0; 98 UBool opt_help = FALSE; 99 int opt_time = 0; 100 int opt_loopCount = 0; 101 int opt_passesCount= 1; 102 UBool opt_terse = FALSE; 103 UBool opt_icu = TRUE; 104 UBool opt_win = FALSE; // Run with Windows native functions. 105 UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions. 106 UBool opt_mac = FALSE; // Run with MacOSX word break services. 
107 UBool opt_uselen = FALSE; 108 UBool opt_dump = FALSE; 109 UBool opt_char = FALSE; 110 UBool opt_word = FALSE; 111 UBool opt_line = FALSE; 112 UBool opt_sentence = FALSE; 113 UBool opt_capi = FALSE; 114 115 UBool opt_next = FALSE; 116 UBool opt_isBound = FALSE; 117 118 119 120 // 121 // Definitions for the command line options 122 // 123 struct OptSpec { 124 const char *name; 125 enum {FLAG, NUM, STRING} type; 126 void *pVar; 127 }; 128 129 OptSpec opts[] = { 130 {"-file", OptSpec::STRING, &opt_fName}, 131 {"-locale", OptSpec::STRING, &opt_locale}, 132 {"-langid", OptSpec::NUM, &opt_langid}, 133 {"-win", OptSpec::FLAG, &opt_win}, 134 {"-unix", OptSpec::FLAG, &opt_unix}, 135 {"-mac", OptSpec::FLAG, &opt_mac}, 136 {"-uselen", OptSpec::FLAG, &opt_uselen}, 137 {"-loop", OptSpec::NUM, &opt_loopCount}, 138 {"-time", OptSpec::NUM, &opt_time}, 139 {"-passes", OptSpec::NUM, &opt_passesCount}, 140 {"-char", OptSpec::FLAG, &opt_char}, 141 {"-word", OptSpec::FLAG, &opt_word}, 142 {"-line", OptSpec::FLAG, &opt_line}, 143 {"-sentence", OptSpec::FLAG, &opt_sentence}, 144 {"-terse", OptSpec::FLAG, &opt_terse}, 145 {"-dump", OptSpec::FLAG, &opt_dump}, 146 {"-capi", OptSpec::FLAG, &opt_capi}, 147 {"-next", OptSpec::FLAG, &opt_next}, 148 {"-isBound", OptSpec::FLAG, &opt_isBound}, 149 {"-help", OptSpec::FLAG, &opt_help}, 150 {"-?", OptSpec::FLAG, &opt_help}, 151 {0, OptSpec::FLAG, 0} 152 }; 153 154 155 //--------------------------------------------------------------------------- 156 // 157 // Global variables pointing to and describing the test file 158 // 159 //--------------------------------------------------------------------------- 160 161 //DWORD gWinLCID; 162 BreakIterator *brkit = NULL; 163 UChar *text = NULL; 164 int32_t textSize = 0; 165 166 167 168 #if U_PLATFORM_IS_DARWIN_BASED 169 #include <ApplicationServices/ApplicationServices.h> 170 enum{ 171 kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask) 172 }; 173 UCTextBreakType 
breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask}; 174 TextBreakLocatorRef breakRef; 175 UCTextBreakType macBreakType; 176 177 void createMACBrkIt() { 178 OSStatus status = noErr; 179 LocaleRef lref; 180 status = LocaleRefFromLocaleString(opt_locale, &lref); 181 status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, (TextBreakLocatorRef*)&breakRef); 182 if(opt_char == TRUE) { 183 macBreakType = kUCTextBreakClusterMask; 184 } else if(opt_word == TRUE) { 185 macBreakType = kUCTextBreakWordMask; 186 } else if(opt_line == TRUE) { 187 macBreakType = kUCTextBreakLineMask; 188 } else if(opt_sentence == TRUE) { 189 // error 190 // brkit = BreakIterator::createSentenceInstance(opt_locale, status); 191 } else { 192 // default is character iterator 193 macBreakType = kUCTextBreakClusterMask; 194 } 195 } 196 #endif 197 198 void createICUBrkIt() { 199 // 200 // Set up an ICU break iterator 201 // 202 UErrorCode status = U_ZERO_ERROR; 203 if(opt_char == TRUE) { 204 brkit = BreakIterator::createCharacterInstance(opt_locale, status); 205 } else if(opt_word == TRUE) { 206 brkit = BreakIterator::createWordInstance(opt_locale, status); 207 } else if(opt_line == TRUE) { 208 brkit = BreakIterator::createLineInstance(opt_locale, status); 209 } else if(opt_sentence == TRUE) { 210 brkit = BreakIterator::createSentenceInstance(opt_locale, status); 211 } else { 212 // default is character iterator 213 brkit = BreakIterator::createCharacterInstance(opt_locale, status); 214 } 215 if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) { 216 fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale); 217 } 218 if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) { 219 fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale); 220 } 221 222 } 223 224 //--------------------------------------------------------------------------- 225 // 226 // ProcessOptions() Function to read the command line 
options. 227 // 228 //--------------------------------------------------------------------------- 229 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[]) 230 { 231 int i; 232 int argNum; 233 const char *pArgName; 234 OptSpec *pOpt; 235 236 for (argNum=1; argNum<argc; argNum++) { 237 pArgName = argv[argNum]; 238 for (pOpt = opts; pOpt->name != 0; pOpt++) { 239 if (strcmp(pOpt->name, pArgName) == 0) { 240 switch (pOpt->type) { 241 case OptSpec::FLAG: 242 *(UBool *)(pOpt->pVar) = TRUE; 243 break; 244 case OptSpec::STRING: 245 argNum ++; 246 if (argNum >= argc) { 247 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); 248 return FALSE; 249 } 250 *(const char **)(pOpt->pVar) = argv[argNum]; 251 break; 252 case OptSpec::NUM: 253 argNum ++; 254 if (argNum >= argc) { 255 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); 256 return FALSE; 257 } 258 char *endp; 259 i = strtol(argv[argNum], &endp, 0); 260 if (endp == argv[argNum]) { 261 fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name); 262 return FALSE; 263 } 264 *(int *)(pOpt->pVar) = i; 265 } 266 break; 267 } 268 } 269 if (pOpt->name == 0) 270 { 271 fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName); 272 return FALSE; 273 } 274 } 275 return TRUE; 276 } 277 278 279 void doForwardTest() { 280 if (opt_terse == FALSE) { 281 printf("Doing the forward test\n"); 282 } 283 int32_t noBreaks = 0; 284 int32_t i = 0; 285 unsigned long startTime = timeGetTime(); 286 unsigned long elapsedTime = 0; 287 if(opt_icu) { 288 createICUBrkIt(); 289 brkit->setText(UnicodeString(text, textSize)); 290 brkit->first(); 291 if (opt_terse == FALSE) { 292 printf("Warmup\n"); 293 } 294 int j; 295 while((j = brkit->next()) != BreakIterator::DONE) { 296 noBreaks++; 297 //fprintf(stderr, "%d ", j); 298 } 299 300 if (opt_terse == FALSE) { 301 printf("Measure\n"); 302 } 303 startTime = timeGetTime(); 304 for(i = 0; i < opt_loopCount; i++) { 305 brkit->first(); 306 
while(brkit->next() != BreakIterator::DONE) { 307 } 308 } 309 310 elapsedTime = timeGetTime()-startTime; 311 } else if(opt_mac) { 312 #if U_PLATFORM_IS_DARWIN_BASED 313 createMACBrkIt(); 314 UniChar* filePtr = text; 315 OSStatus status = noErr; 316 UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize; 317 startOffset = 0; 318 //printf("\t---Search forward--\n"); 319 320 while (startOffset < numUniChars) 321 { 322 status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars, 323 startOffset, &breakOffset); 324 //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status)); 325 //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset)); 326 327 // Output break 328 //printf("\t%d\n", (int)breakOffset); 329 330 // Increment counters 331 noBreaks++; 332 startOffset = breakOffset; 333 } 334 startTime = timeGetTime(); 335 for(i = 0; i < opt_loopCount; i++) { 336 startOffset = 0; 337 338 while (startOffset < numUniChars) 339 { 340 status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars, 341 startOffset, &breakOffset); 342 // Increment counters 343 startOffset = breakOffset; 344 } 345 } 346 elapsedTime = timeGetTime()-startTime; 347 UCDisposeTextBreakLocator(&breakRef); 348 #endif 349 350 351 } 352 353 354 if (opt_terse == FALSE) { 355 int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount)); 356 int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize)); 357 int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks)); 358 printf("forward break iteration average loop time %d\n", loopTime); 359 printf("number of code units %d average time per code unit %d\n", textSize, timePerCU); 360 printf("number of breaks %d average time per break 
%d\n", noBreaks, timePerBreak); 361 } else { 362 printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize); 363 } 364 365 366 } 367 368 void doIsBoundTest() { 369 int32_t noBreaks = 0, hit = 0; 370 int32_t i = 0, j = 0; 371 unsigned long startTime = timeGetTime(); 372 unsigned long elapsedTime = 0; 373 createICUBrkIt(); 374 brkit->setText(UnicodeString(text, textSize)); 375 brkit->first(); 376 for(j = 0; j < textSize; j++) { 377 if(brkit->isBoundary(j)) { 378 noBreaks++; 379 //fprintf(stderr, "%d ", j); 380 } 381 } 382 /* 383 while(brkit->next() != BreakIterator::DONE) { 384 noBreaks++; 385 } 386 */ 387 388 startTime = timeGetTime(); 389 for(i = 0; i < opt_loopCount; i++) { 390 for(j = 0; j < textSize; j++) { 391 if(brkit->isBoundary(j)) { 392 hit++; 393 } 394 } 395 } 396 397 elapsedTime = timeGetTime()-startTime; 398 int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount)); 399 if (opt_terse == FALSE) { 400 int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize)); 401 int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks)); 402 printf("forward break iteration average loop time %d\n", loopTime); 403 printf("number of code units %d average time per code unit %d\n", textSize, timePerCU); 404 printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak); 405 } else { 406 printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize); 407 } 408 } 409 410 //---------------------------------------------------------------------------------------- 411 // 412 // UnixConvert -- Convert the lines of the file to the encoding for UNIX 413 // Since it appears that Unicode support is going in the general 414 // direction of the use of UTF-8 locales, that is the approach 415 // that is used here. 
//
//                NOTE: the whole body is compiled out with #if 0, so this
//                function currently does nothing. The disabled code refers
//                to gNumFileLines / gFileLines, which are not defined in
//                this file.
//
//----------------------------------------------------------------------------------------
void UnixConvert() {
#if 0
    int line;

    UConverter *cvrtr;    // An ICU code page converter.
    UErrorCode status = U_ZERO_ERROR;


    cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
    if (U_FAILURE(status)) {
        // Print the error code itself, not the address of the variable.
        fprintf(stderr, "ICU Converter open failed.: %d\n", status);
        exit(-1);
    }
    // redo for unix
    for (line=0; line < gNumFileLines; line++) {
        // First call with a null target only computes the required size.
        int sizeNeeded = ucnv_fromUChars(cvrtr,
                                         0,    // ptr to target buffer.
                                         0,    // length of target buffer.
                                         gFileLines[line].name,
                                         -1,   // source is null terminated
                                         &status);
        if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
            fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
            exit(-1);
        }
        status = U_ZERO_ERROR;
        gFileLines[line].unixName = new char[sizeNeeded+1];
        sizeNeeded = ucnv_fromUChars(cvrtr,
                                     gFileLines[line].unixName,    // ptr to target buffer.
                                     sizeNeeded+1,                 // length of target buffer.
                                     gFileLines[line].name,
                                     -1,                           // source is null terminated
                                     &status);
        if (U_FAILURE(status)) {
            fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
            exit(-1);
        }
        gFileLines[line].unixName[sizeNeeded] = 0;
    };
    ucnv_close(cvrtr);
#endif
}


//----------------------------------------------------------------------------------------
//
// class UCharFile Class to hide all the gorp to read a file in
//                 and produce a stream of UChars.
466 // 467 //---------------------------------------------------------------------------------------- 468 class UCharFile { 469 public: 470 UCharFile(const char *fileName); 471 ~UCharFile(); 472 UChar get(); 473 UBool eof() {return fEof;}; 474 UBool error() {return fError;}; 475 int32_t size() { return fFileSize; }; 476 477 private: 478 UCharFile (const UCharFile &other) {}; // No copy constructor. 479 UCharFile & operator = (const UCharFile &other) {return *this;}; // No assignment op 480 481 FILE *fFile; 482 const char *fName; 483 UBool fEof; 484 UBool fError; 485 UChar fPending2ndSurrogate; 486 int32_t fFileSize; 487 488 enum {UTF16LE, UTF16BE, UTF8} fEncoding; 489 }; 490 491 UCharFile::UCharFile(const char * fileName) { 492 fEof = FALSE; 493 fError = FALSE; 494 fName = fileName; 495 struct stat buf; 496 int32_t result = stat(fileName, &buf); 497 if(result != 0) { 498 fprintf(stderr, "Error getting info\n"); 499 fFileSize = -1; 500 } else { 501 fFileSize = buf.st_size; 502 } 503 fFile = fopen(fName, "rb"); 504 fPending2ndSurrogate = 0; 505 if (fFile == NULL) { 506 fprintf(stderr, "Can not open file \"%s\"\n", opt_fName); 507 fError = TRUE; 508 return; 509 } 510 // 511 // Look for the byte order mark at the start of the file. 
512 // 513 int BOMC1, BOMC2, BOMC3; 514 BOMC1 = fgetc(fFile); 515 BOMC2 = fgetc(fFile); 516 517 if (BOMC1 == 0xff && BOMC2 == 0xfe) { 518 fEncoding = UTF16LE; } 519 else if (BOMC1 == 0xfe && BOMC2 == 0xff) { 520 fEncoding = UTF16BE; } 521 else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) { 522 fEncoding = UTF8; } 523 else 524 { 525 fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and " 526 "must include a BOM.\n", fileName); 527 fError = true; 528 return; 529 } 530 } 531 532 533 UCharFile::~UCharFile() { 534 fclose(fFile); 535 } 536 537 538 539 UChar UCharFile::get() { 540 UChar c; 541 switch (fEncoding) { 542 case UTF16LE: 543 { 544 int cL, cH; 545 cL = fgetc(fFile); 546 cH = fgetc(fFile); 547 c = cL | (cH << 8); 548 if (cH == EOF) { 549 c = 0; 550 fEof = TRUE; 551 } 552 break; 553 } 554 case UTF16BE: 555 { 556 int cL, cH; 557 cH = fgetc(fFile); 558 cL = fgetc(fFile); 559 c = cL | (cH << 8); 560 if (cL == EOF) { 561 c = 0; 562 fEof = TRUE; 563 } 564 break; 565 } 566 case UTF8: 567 { 568 if (fPending2ndSurrogate != 0) { 569 c = fPending2ndSurrogate; 570 fPending2ndSurrogate = 0; 571 break; 572 } 573 574 int ch = fgetc(fFile); // Note: c and ch are separate cause eof test doesn't work on UChar type. 575 if (ch == EOF) { 576 c = 0; 577 fEof = TRUE; 578 break; 579 } 580 581 if (ch <= 0x7f) { 582 // It's ascii. No further utf-8 conversion. 583 c = ch; 584 break; 585 } 586 587 // Figure out the lenght of the char and read the rest of the bytes 588 // into a temp array. 
589 int nBytes; 590 if (ch >= 0xF0) {nBytes=4;} 591 else if (ch >= 0xE0) {nBytes=3;} 592 else if (ch >= 0xC0) {nBytes=2;} 593 else { 594 fprintf(stderr, "not likely utf-8 encoded file %s contains corrupt data at offset %d.\n", fName, ftell(fFile)); 595 fError = TRUE; 596 return 0; 597 } 598 599 unsigned char bytes[10]; 600 bytes[0] = (unsigned char)ch; 601 int i; 602 for (i=1; i<nBytes; i++) { 603 bytes[i] = fgetc(fFile); 604 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) { 605 fprintf(stderr, "utf-8 encoded file %s contains corrupt data at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName, ftell(fFile), nBytes, i, ch); 606 fError = TRUE; 607 return 0; 608 } 609 } 610 611 // Convert the bytes from the temp array to a Unicode char. 612 i = 0; 613 uint32_t cp; 614 U8_NEXT_UNSAFE(bytes, i, cp); 615 c = (UChar)cp; 616 617 if (cp >= 0x10000) { 618 // The code point needs to be broken up into a utf-16 surrogate pair. 619 // Process first half this time through the main loop, and 620 // remember the other half for the next time through. 621 UChar utf16Buf[3]; 622 i = 0; 623 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp); 624 fPending2ndSurrogate = utf16Buf[1]; 625 c = utf16Buf[0]; 626 } 627 break; 628 }; 629 } 630 return c; 631 } 632 633 634 //---------------------------------------------------------------------------------------- 635 // 636 // Main -- process command line, read in and pre-process the test file, 637 // call other functions to do the actual tests. 638 // 639 //---------------------------------------------------------------------------------------- 640 int main(int argc, const char** argv) { 641 if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) { 642 printf(gUsageString); 643 exit (1); 644 } 645 // Make sure that we've only got one API selected. 
646 if (opt_mac || opt_unix || opt_win) opt_icu = FALSE; 647 if (opt_mac || opt_unix) opt_win = FALSE; 648 if (opt_mac) opt_unix = FALSE; 649 650 UErrorCode status = U_ZERO_ERROR; 651 652 653 654 // 655 // Set up a Windows LCID 656 // 657 /* 658 if (opt_langid != 0) { 659 gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT); 660 } 661 else { 662 gWinLCID = uloc_getLCID(opt_locale); 663 } 664 */ 665 666 // 667 // Set the UNIX locale 668 // 669 if (opt_unix) { 670 if (setlocale(LC_ALL, opt_locale) == 0) { 671 fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale); 672 exit(-1); 673 } 674 } 675 676 // Read in the input file. 677 // File assumed to be utf-16. 678 // Lines go onto heap buffers. Global index array to line starts is created. 679 // Lines themselves are null terminated. 680 // 681 682 UCharFile f(opt_fName); 683 if (f.error()) { 684 exit(-1); 685 } 686 int32_t fileSize = f.size(); 687 const int STARTSIZE = 70000; 688 int32_t bufSize = 0; 689 int32_t charCount = 0; 690 if(fileSize != -1) { 691 text = (UChar *)malloc(fileSize*sizeof(UChar)); 692 bufSize = fileSize; 693 } else { 694 text = (UChar *)malloc(STARTSIZE*sizeof(UChar)); 695 bufSize = STARTSIZE; 696 } 697 if(text == NULL) { 698 fprintf(stderr, "Allocating buffer failed\n"); 699 exit(-1); 700 } 701 702 703 // Read the file, split into lines, and save in memory. 704 // Loop runs once per utf-16 value from the input file, 705 // (The number of bytes read from file per loop iteration depends on external encoding.) 706 for (;;) { 707 708 UChar c = f.get(); 709 if(f.eof()) { 710 break; 711 } 712 if (f.error()){ 713 exit(-1); 714 } 715 // We now have a good UTF-16 value in c. 
716 text[charCount++] = c; 717 if(charCount == bufSize) { 718 text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar)); 719 if(text == NULL) { 720 fprintf(stderr, "Reallocating buffer failed\n"); 721 exit(-1); 722 } 723 bufSize *= 2; 724 } 725 } 726 727 728 if (opt_terse == FALSE) { 729 printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount); 730 } 731 732 textSize = charCount; 733 734 735 736 737 // 738 // Dump file contents if requested. 739 // 740 if (opt_dump) { 741 // dump file, etc... possibly 742 } 743 744 745 // 746 // We've got the file read into memory. Go do something with it. 747 // 748 int32_t i = 0; 749 for(i = 0; i < opt_passesCount; i++) { 750 if(opt_loopCount != 0) { 751 if(opt_next) { 752 doForwardTest(); 753 } else if(opt_isBound) { 754 doIsBoundTest(); 755 } else { 756 doForwardTest(); 757 } 758 } else if(opt_time != 0) { 759 760 } 761 } 762 763 if(text != NULL) { 764 free(text); 765 } 766 if(brkit != NULL) { 767 delete brkit; 768 } 769 770 return 0; 771 } 772