1 /* 2 ********************************************************************** 3 * Copyright (C) 2010-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * file name: dicttrieperf.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2010dec09 12 * created by: Markus W. Scherer 13 * 14 * Performance test program for dictionary-type tries. 15 * 16 * Usage from within <ICU build tree>/test/perf/dicttrieperf/ : 17 * (Linux) 18 * make 19 * export LD_LIBRARY_PATH=../../../lib:../../../stubdata:../../../tools/ctestfw 20 * ./dicttrieperf --sourcedir <ICU build tree>/data/out/tmp --passes 3 --iterations 1000 21 * or 22 * ./dicttrieperf -f <ICU source tree>/source/data/brkitr/thaidict.txt --passes 3 --iterations 250 23 */ 24 25 #include <stdio.h> 26 #include <stdlib.h> 27 #include "unicode/bytestrie.h" 28 #include "unicode/bytestriebuilder.h" 29 #include "unicode/localpointer.h" 30 #include "unicode/ucharstrie.h" 31 #include "unicode/ucharstriebuilder.h" 32 #include "unicode/uperf.h" 33 #include "unicode/utext.h" 34 #include "charstr.h" 35 #include "package.h" 36 #include "toolutil.h" 37 #include "triedict.h" 38 #include "ucbuf.h" // struct ULine 39 #include "uoptions.h" 40 #include "uvectr32.h" 41 42 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 43 44 // Test object. 45 class DictionaryTriePerfTest : public UPerfTest { 46 public: 47 DictionaryTriePerfTest(int32_t argc, const char *argv[], UErrorCode &status) 48 : UPerfTest(argc, argv, NULL, 0, "", status), numTextLines(0) { 49 if(hasFile()) { 50 getLines(status); 51 for(int32_t i=0; i<numLines; ++i) { 52 // Skip comment lines (start with a character below 'A'). 53 if(lines[i].name[0]>=0x41) { 54 ++numTextLines; 55 // Remove trailing CR LF. 56 int32_t len=lines[i].len; 57 UChar c; 58 while(len>0 && ((c=lines[i].name[len-1])==0xa || c==0xd)) { 59 --len; 60 } 61 lines[i].len=len; 62 } 63 } 64 } 65 } 66 67 virtual UPerfFunction *runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL); 68 69 const char *getSourceDir() const { return sourceDir; } 70 71 UBool hasFile() const { return ucharBuf!=NULL; } 72 const ULine *getCachedLines() const { return lines; } 73 int32_t getNumLines() const { return numLines; } 74 int32_t numTextLines; // excluding comment lines 75 }; 76 77 // Performance test function object. 78 // Loads icudt46l.dat (or whatever its current versioned filename) 79 // from the -s or --sourcedir path. 80 class PackageLookup : public UPerfFunction { 81 protected: 82 PackageLookup(const DictionaryTriePerfTest &perf) { 83 IcuToolErrorCode errorCode("PackageLookup()"); 84 CharString filename(perf.getSourceDir(), errorCode); 85 int32_t filenameLength=filename.length(); 86 if(filenameLength>0 && filename[filenameLength-1]!=U_FILE_SEP_CHAR && 87 filename[filenameLength-1]!=U_FILE_ALT_SEP_CHAR) { 88 filename.append(U_FILE_SEP_CHAR, errorCode); 89 } 90 filename.append(U_ICUDATA_NAME, errorCode); 91 filename.append(".dat", errorCode); 92 pkg.readPackage(filename.data()); 93 } 94 95 public: 96 virtual ~PackageLookup() {} 97 98 // virtual void call(UErrorCode* pErrorCode) { ... } 99 100 virtual long getOperationsPerIteration() { 101 return pkg.getItemCount(); 102 } 103 104 // virtual long getEventsPerIteration(); 105 106 protected: 107 Package pkg; 108 }; 109 110 struct TOCEntry { 111 int32_t nameOffset, dataOffset; 112 }; 113 114 // Similar to ICU 4.6 offsetTOCLookupFn() (in ucmndata.c). 115 static int32_t simpleBinarySearch(const char *s, const char *names, const TOCEntry *toc, int32_t count) { 116 int32_t start=0; 117 int32_t limit=count; 118 int32_t lastNumber=limit; 119 for(;;) { 120 int32_t number=(start+limit)/2; 121 if(lastNumber==number) { // have we moved? 122 return -1; // not found 123 } 124 lastNumber=number; 125 int32_t cmp=strcmp(s, names+toc[number].nameOffset); 126 if(cmp<0) { 127 limit=number; 128 } else if(cmp>0) { 129 start=number; 130 } else { // found s 131 return number; 132 } 133 } 134 } 135 136 class BinarySearchPackageLookup : public PackageLookup { 137 public: 138 BinarySearchPackageLookup(const DictionaryTriePerfTest &perf) 139 : PackageLookup(perf) { 140 IcuToolErrorCode errorCode("BinarySearchPackageLookup()"); 141 int32_t count=pkg.getItemCount(); 142 toc=new TOCEntry[count]; 143 for(int32_t i=0; i<count; ++i) { 144 toc[i].nameOffset=itemNames.length(); 145 toc[i].dataOffset=i; // arbitrary value, see toc comment below 146 // The Package class removes the "icudt46l/" prefix. 147 // We restore that here for a fair performance test. 148 const char *name=pkg.getItem(i)->name; 149 itemNames.append("icudt46l/", errorCode); 150 itemNames.append(name, strlen(name)+1, errorCode); 151 } 152 printf("size of item names: %6ld\n", (long)itemNames.length()); 153 printf("size of TOC: %6ld\n", (long)(count*8)); 154 printf("total index size: %6ld\n", (long)(itemNames.length()+count*8)); 155 } 156 virtual ~BinarySearchPackageLookup() { 157 delete[] toc; 158 } 159 160 virtual void call(UErrorCode * /*pErrorCode*/) { 161 int32_t count=pkg.getItemCount(); 162 const char *itemNameChars=itemNames.data(); 163 const char *name=itemNameChars; 164 for(int32_t i=0; i<count; ++i) { 165 if(simpleBinarySearch(name, itemNameChars, toc, count)<0) { 166 fprintf(stderr, "item not found: %s\n", name); 167 } 168 name=strchr(name, 0)+1; 169 } 170 } 171 172 protected: 173 CharString itemNames; 174 // toc imitates a .dat file's array of UDataOffsetTOCEntry 175 // with nameOffset and dataOffset. 176 // We don't need the dataOffsets, but we want to imitate the real 177 // memory density, to measure equivalent CPU cache usage. 178 TOCEntry *toc; 179 }; 180 181 #ifndef MIN 182 #define MIN(a,b) (((a)<(b)) ? (a) : (b)) 183 #endif 184 185 // Compare strings where we know the shared prefix length, 186 // and advance the prefix length as we find that the strings share even more characters. 187 static int32_t strcmpAfterPrefix(const char *s1, const char *s2, int32_t &prefixLength) { 188 int32_t pl=prefixLength; 189 s1+=pl; 190 s2+=pl; 191 int32_t cmp=0; 192 for(;;) { 193 int32_t c1=(uint8_t)*s1++; 194 int32_t c2=(uint8_t)*s2++; 195 cmp=c1-c2; 196 if(cmp!=0 || c1==0) { // different or done 197 break; 198 } 199 ++pl; // increment shared same-prefix length 200 } 201 prefixLength=pl; 202 return cmp; 203 } 204 205 static int32_t prefixBinarySearch(const char *s, const char *names, const TOCEntry *toc, int32_t count) { 206 if(count==0) { 207 return -1; 208 } 209 int32_t start=0; 210 int32_t limit=count; 211 // Remember the shared prefix between s, start and limit, 212 // and don't compare that shared prefix again. 213 // The shared prefix should get longer as we narrow the [start, limit[ range. 214 int32_t startPrefixLength=0; 215 int32_t limitPrefixLength=0; 216 // Prime the prefix lengths so that we don't keep prefixLength at 0 until 217 // both the start and limit indexes have moved. 218 // At the same time, we find if s is one of the start and (limit-1) names, 219 // and if not, exclude them from the actual binary search. 220 if(0==strcmpAfterPrefix(s, names+toc[0].nameOffset, startPrefixLength)) { 221 return 0; 222 } 223 ++start; 224 --limit; 225 if(0==strcmpAfterPrefix(s, names+toc[limit].nameOffset, limitPrefixLength)) { 226 return limit; 227 } 228 while(start<limit) { 229 int32_t i=(start+limit)/2; 230 int32_t prefixLength=MIN(startPrefixLength, limitPrefixLength); 231 int32_t cmp=strcmpAfterPrefix(s, names+toc[i].nameOffset, prefixLength); 232 if(cmp<0) { 233 limit=i; 234 limitPrefixLength=prefixLength; 235 } else if(cmp==0) { 236 return i; 237 } else { 238 start=i+1; 239 startPrefixLength=prefixLength; 240 } 241 } 242 return -1; 243 } 244 245 class PrefixBinarySearchPackageLookup : public BinarySearchPackageLookup { 246 public: 247 PrefixBinarySearchPackageLookup(const DictionaryTriePerfTest &perf) 248 : BinarySearchPackageLookup(perf) {} 249 250 virtual void call(UErrorCode * /*pErrorCode*/) { 251 int32_t count=pkg.getItemCount(); 252 const char *itemNameChars=itemNames.data(); 253 const char *name=itemNameChars; 254 for(int32_t i=0; i<count; ++i) { 255 if(prefixBinarySearch(name, itemNameChars, toc, count)<0) { 256 fprintf(stderr, "item not found: %s\n", name); 257 } 258 name=strchr(name, 0)+1; 259 } 260 } 261 }; 262 263 static int32_t bytesTrieLookup(const char *s, const char *nameTrieBytes) { 264 BytesTrie trie(nameTrieBytes); 265 if(USTRINGTRIE_HAS_VALUE(trie.next(s, -1))) { 266 return trie.getValue(); 267 } else { 268 return -1; 269 } 270 } 271 272 class BytesTriePackageLookup : public PackageLookup { 273 public: 274 BytesTriePackageLookup(const DictionaryTriePerfTest &perf) 275 : PackageLookup(perf) { 276 IcuToolErrorCode errorCode("BinarySearchPackageLookup()"); 277 builder=new BytesTrieBuilder(errorCode); 278 int32_t count=pkg.getItemCount(); 279 for(int32_t i=0; i<count; ++i) { 280 // The Package class removes the "icudt46l/" prefix. 281 // We restore that here for a fair performance test. 282 // We store all full names so that we do not have to reconstruct them 283 // in the call() function. 284 const char *name=pkg.getItem(i)->name; 285 int32_t offset=itemNames.length(); 286 itemNames.append("icudt46l/", errorCode); 287 itemNames.append(name, -1, errorCode); 288 // As value, set the data item index. 289 // In a real implementation, we would use that to get the 290 // start and limit offset of the data item. 291 StringPiece fullName(itemNames.toStringPiece()); 292 fullName.remove_prefix(offset); 293 builder->add(fullName, i, errorCode); 294 // NUL-terminate the name for call() to find the next one. 295 itemNames.append(0, errorCode); 296 } 297 int32_t length=builder->buildStringPiece(USTRINGTRIE_BUILD_SMALL, errorCode).length(); 298 printf("size of BytesTrie: %6ld\n", (long)length); 299 // count+1: +1 for the last-item limit offset which we should have always had 300 printf("size of dataOffsets:%6ld\n", (long)((count+1)*4)); 301 printf("total index size: %6ld\n", (long)(length+(count+1)*4)); 302 } 303 virtual ~BytesTriePackageLookup() { 304 delete builder; 305 } 306 307 virtual void call(UErrorCode *pErrorCode) { 308 int32_t count=pkg.getItemCount(); 309 const char *nameTrieBytes=builder->buildStringPiece(USTRINGTRIE_BUILD_SMALL, *pErrorCode).data(); 310 const char *name=itemNames.data(); 311 for(int32_t i=0; i<count; ++i) { 312 if(bytesTrieLookup(name, nameTrieBytes)<0) { 313 fprintf(stderr, "item not found: %s\n", name); 314 } 315 name=strchr(name, 0)+1; 316 } 317 } 318 319 protected: 320 BytesTrieBuilder *builder; 321 CharString itemNames; 322 }; 323 324 // Performance test function object. 325 // Each subclass loads a dictionary text file 326 // from the -s or --sourcedir path plus -f or --file-name. 327 // For example, <ICU source dir>/source/data/brkitr/thaidict.txt. 328 class DictLookup : public UPerfFunction { 329 public: 330 DictLookup(const DictionaryTriePerfTest &perfTest) : perf(perfTest) {} 331 332 virtual long getOperationsPerIteration() { 333 return perf.numTextLines; 334 } 335 336 protected: 337 const DictionaryTriePerfTest &perf; 338 }; 339 340 class CompactTrieDictLookup : public DictLookup { 341 public: 342 CompactTrieDictLookup(const DictionaryTriePerfTest &perfTest) 343 : DictLookup(perfTest), ctd(NULL) { 344 IcuToolErrorCode errorCode("UCharsTrieDictLookup()"); 345 // U+0E1C is the median code unit, from 346 // the UCharsTrie root node (split-branch node) for thaidict.txt. 347 MutableTrieDictionary builder(0xe1c, errorCode); 348 const ULine *lines=perf.getCachedLines(); 349 int32_t numLines=perf.getNumLines(); 350 for(int32_t i=0; i<numLines; ++i) { 351 // Skip comment lines (start with a character below 'A'). 352 if(lines[i].name[0]<0x41) { 353 continue; 354 } 355 builder.addWord(lines[i].name, lines[i].len, errorCode); 356 } 357 ctd=new CompactTrieDictionary(builder, errorCode); 358 int32_t length=(int32_t)ctd->dataSize(); 359 printf("size of CompactTrieDict: %6ld bytes\n", (long)length); 360 } 361 362 virtual ~CompactTrieDictLookup() { 363 delete ctd; 364 } 365 366 virtual void call(UErrorCode *pErrorCode) { 367 UText text=UTEXT_INITIALIZER; 368 int32_t lengths[20]; 369 const ULine *lines=perf.getCachedLines(); 370 int32_t numLines=perf.getNumLines(); 371 for(int32_t i=0; i<numLines; ++i) { 372 // Skip comment lines (start with a character below 'A'). 373 if(lines[i].name[0]<0x41) { 374 continue; 375 } 376 utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode); 377 int32_t count; 378 ctd->matches(&text, lines[i].len, 379 lengths, count, LENGTHOF(lengths)); 380 if(count==0 || lengths[count-1]!=lines[i].len) { 381 fprintf(stderr, "word %ld (0-based) not found\n", (long)i); 382 } 383 } 384 } 385 386 protected: 387 CompactTrieDictionary *ctd; 388 }; 389 390 // Closely imitate CompactTrieDictionary::matches(). 391 // Note: CompactTrieDictionary::matches() is part of its trie implementation, 392 // and while it loops over the text, it knows the current state. 393 // By contrast, this implementation uses UCharsTrie API functions that have to 394 // check the trie state each time and load/store state in the object. 395 // (Whether it hasNext() and whether it is in the middle of a linear-match node.) 396 static int32_t 397 ucharsTrieMatches(UCharsTrie &trie, 398 UText *text, int32_t textLimit, 399 int32_t *lengths, int &count, int limit ) { 400 UChar32 c=utext_next32(text); 401 // Notes: 402 // a) CompactTrieDictionary::matches() does not check for U_SENTINEL. 403 // b) It also ignores non-BMP code points by casting to UChar! 404 if(c<0) { 405 return 0; 406 } 407 // Should be firstForCodePoint() but CompactTrieDictionary 408 // handles only code units. 409 UStringTrieResult result=trie.first(c); 410 int32_t numChars=1; 411 count=0; 412 for(;;) { 413 if(USTRINGTRIE_HAS_VALUE(result)) { 414 if(count<limit) { 415 // lengths[count++]=(int32_t)utext_getNativeIndex(text); 416 lengths[count++]=numChars; // CompactTrieDictionary just counts chars too. 417 } 418 if(result==USTRINGTRIE_FINAL_VALUE) { 419 break; 420 } 421 } else if(result==USTRINGTRIE_NO_MATCH) { 422 break; 423 } 424 if(numChars>=textLimit) { 425 // Note: Why do we have both a text limit and a UText that knows its length? 426 break; 427 } 428 UChar32 c=utext_next32(text); 429 // Notes: 430 // a) CompactTrieDictionary::matches() does not check for U_SENTINEL. 431 // b) It also ignores non-BMP code points by casting to UChar! 432 if(c<0) { 433 break; 434 } 435 ++numChars; 436 // Should be nextForCodePoint() but CompactTrieDictionary 437 // handles only code units. 438 result=trie.next(c); 439 } 440 #if 0 441 // Note: CompactTrieDictionary::matches() comments say that it leaves the UText 442 // after the longest prefix match and returns the number of characters 443 // that were matched. 444 if(index!=lastMatch) { 445 utext_setNativeIndex(text, lastMatch); 446 } 447 return lastMatch-start; 448 // However, it does not do either of these, so I am not trying to 449 // imitate it (or its docs) 100%. 450 #endif 451 return numChars; 452 } 453 454 class UCharsTrieDictLookup : public DictLookup { 455 public: 456 UCharsTrieDictLookup(const DictionaryTriePerfTest &perfTest) 457 : DictLookup(perfTest), trie(NULL) { 458 IcuToolErrorCode errorCode("UCharsTrieDictLookup()"); 459 builder=new UCharsTrieBuilder(errorCode); 460 const ULine *lines=perf.getCachedLines(); 461 int32_t numLines=perf.getNumLines(); 462 for(int32_t i=0; i<numLines; ++i) { 463 // Skip comment lines (start with a character below 'A'). 464 if(lines[i].name[0]<0x41) { 465 continue; 466 } 467 builder->add(UnicodeString(FALSE, lines[i].name, lines[i].len), 0, errorCode); 468 } 469 UnicodeString trieUChars; 470 int32_t length=builder->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, trieUChars, errorCode).length(); 471 printf("size of UCharsTrie: %6ld bytes\n", (long)length*2); 472 trie=builder->build(USTRINGTRIE_BUILD_SMALL, errorCode); 473 } 474 475 virtual ~UCharsTrieDictLookup() { 476 delete builder; 477 delete trie; 478 } 479 480 protected: 481 UCharsTrieBuilder *builder; 482 UCharsTrie *trie; 483 }; 484 485 class UCharsTrieDictMatches : public UCharsTrieDictLookup { 486 public: 487 UCharsTrieDictMatches(const DictionaryTriePerfTest &perfTest) 488 : UCharsTrieDictLookup(perfTest) {} 489 490 virtual void call(UErrorCode *pErrorCode) { 491 UText text=UTEXT_INITIALIZER; 492 int32_t lengths[20]; 493 const ULine *lines=perf.getCachedLines(); 494 int32_t numLines=perf.getNumLines(); 495 for(int32_t i=0; i<numLines; ++i) { 496 // Skip comment lines (start with a character below 'A'). 497 if(lines[i].name[0]<0x41) { 498 continue; 499 } 500 utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode); 501 int32_t count=0; 502 ucharsTrieMatches(*trie, &text, lines[i].len, 503 lengths, count, LENGTHOF(lengths)); 504 if(count==0 || lengths[count-1]!=lines[i].len) { 505 fprintf(stderr, "word %ld (0-based) not found\n", (long)i); 506 } 507 } 508 } 509 }; 510 511 class UCharsTrieDictContains : public UCharsTrieDictLookup { 512 public: 513 UCharsTrieDictContains(const DictionaryTriePerfTest &perfTest) 514 : UCharsTrieDictLookup(perfTest) {} 515 516 virtual void call(UErrorCode * /*pErrorCode*/) { 517 const ULine *lines=perf.getCachedLines(); 518 int32_t numLines=perf.getNumLines(); 519 for(int32_t i=0; i<numLines; ++i) { 520 // Skip comment lines (which start with a character below 'A'). 521 if(lines[i].name[0]<0x41) { 522 continue; 523 } 524 if(!USTRINGTRIE_HAS_VALUE(trie->reset().next(lines[i].name, lines[i].len))) { 525 fprintf(stderr, "word %ld (0-based) not found\n", (long)i); 526 } 527 } 528 } 529 }; 530 531 static inline int32_t thaiCharToByte(UChar32 c) { 532 if(0xe00<=c && c<=0xefe) { 533 return c&0xff; 534 } else if(c==0x2e) { 535 return 0xff; 536 } else { 537 return -1; 538 } 539 } 540 541 static UBool thaiWordToBytes(const UChar *s, int32_t length, 542 CharString &str, UErrorCode &errorCode) { 543 for(int32_t i=0; i<length; ++i) { 544 UChar c=s[i]; 545 int32_t b=thaiCharToByte(c); 546 if(b>=0) { 547 str.append((char)b, errorCode); 548 } else { 549 fprintf(stderr, "thaiWordToBytes(): unable to encode U+%04X as a byte\n", c); 550 return FALSE; 551 } 552 } 553 return TRUE; 554 } 555 556 class BytesTrieDictLookup : public DictLookup { 557 public: 558 BytesTrieDictLookup(const DictionaryTriePerfTest &perfTest) 559 : DictLookup(perfTest), trie(NULL), noDict(FALSE) { 560 IcuToolErrorCode errorCode("BytesTrieDictLookup()"); 561 builder=new BytesTrieBuilder(errorCode); 562 CharString str; 563 const ULine *lines=perf.getCachedLines(); 564 int32_t numLines=perf.getNumLines(); 565 for(int32_t i=0; i<numLines; ++i) { 566 // Skip comment lines (start with a character below 'A'). 567 if(lines[i].name[0]<0x41) { 568 continue; 569 } 570 if(!thaiWordToBytes(lines[i].name, lines[i].len, str.clear(), errorCode)) { 571 fprintf(stderr, "thaiWordToBytes(): failed for word %ld (0-based)\n", (long)i); 572 noDict=TRUE; 573 break; 574 } 575 builder->add(str.toStringPiece(), 0, errorCode); 576 } 577 if(!noDict) { 578 int32_t length=builder->buildStringPiece(USTRINGTRIE_BUILD_SMALL, errorCode).length(); 579 printf("size of BytesTrie: %6ld bytes\n", (long)length); 580 trie=builder->build(USTRINGTRIE_BUILD_SMALL, errorCode); 581 } 582 } 583 584 virtual ~BytesTrieDictLookup() { 585 delete builder; 586 delete trie; 587 } 588 589 protected: 590 BytesTrieBuilder *builder; 591 BytesTrie *trie; 592 UBool noDict; 593 }; 594 595 static int32_t 596 bytesTrieMatches(BytesTrie &trie, 597 UText *text, int32_t textLimit, 598 int32_t *lengths, int &count, int limit ) { 599 UChar32 c=utext_next32(text); 600 if(c<0) { 601 return 0; 602 } 603 UStringTrieResult result=trie.first(thaiCharToByte(c)); 604 int32_t numChars=1; 605 count=0; 606 for(;;) { 607 if(USTRINGTRIE_HAS_VALUE(result)) { 608 if(count<limit) { 609 // lengths[count++]=(int32_t)utext_getNativeIndex(text); 610 lengths[count++]=numChars; // CompactTrieDictionary just counts chars too. 611 } 612 if(result==USTRINGTRIE_FINAL_VALUE) { 613 break; 614 } 615 } else if(result==USTRINGTRIE_NO_MATCH) { 616 break; 617 } 618 if(numChars>=textLimit) { 619 break; 620 } 621 UChar32 c=utext_next32(text); 622 if(c<0) { 623 break; 624 } 625 ++numChars; 626 result=trie.next(thaiCharToByte(c)); 627 } 628 return numChars; 629 } 630 631 class BytesTrieDictMatches : public BytesTrieDictLookup { 632 public: 633 BytesTrieDictMatches(const DictionaryTriePerfTest &perfTest) 634 : BytesTrieDictLookup(perfTest) {} 635 636 virtual void call(UErrorCode *pErrorCode) { 637 if(noDict) { 638 return; 639 } 640 UText text=UTEXT_INITIALIZER; 641 int32_t lengths[20]; 642 const ULine *lines=perf.getCachedLines(); 643 int32_t numLines=perf.getNumLines(); 644 for(int32_t i=0; i<numLines; ++i) { 645 // Skip comment lines (start with a character below 'A'). 646 if(lines[i].name[0]<0x41) { 647 continue; 648 } 649 utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode); 650 int32_t count=0; 651 bytesTrieMatches(*trie, &text, lines[i].len, 652 lengths, count, LENGTHOF(lengths)); 653 if(count==0 || lengths[count-1]!=lines[i].len) { 654 fprintf(stderr, "word %ld (0-based) not found\n", (long)i); 655 } 656 } 657 } 658 }; 659 660 class BytesTrieDictContains : public BytesTrieDictLookup { 661 public: 662 BytesTrieDictContains(const DictionaryTriePerfTest &perfTest) 663 : BytesTrieDictLookup(perfTest) {} 664 665 virtual void call(UErrorCode * /*pErrorCode*/) { 666 if(noDict) { 667 return; 668 } 669 const ULine *lines=perf.getCachedLines(); 670 int32_t numLines=perf.getNumLines(); 671 for(int32_t i=0; i<numLines; ++i) { 672 const UChar *line=lines[i].name; 673 // Skip comment lines (start with a character below 'A'). 674 if(line[0]<0x41) { 675 continue; 676 } 677 UStringTrieResult result=trie->first(thaiCharToByte(line[0])); 678 int32_t lineLength=lines[i].len; 679 for(int32_t j=1; j<lineLength; ++j) { 680 if(!USTRINGTRIE_HAS_NEXT(result)) { 681 fprintf(stderr, "word %ld (0-based) not found\n", (long)i); 682 break; 683 } 684 result=trie->next(thaiCharToByte(line[j])); 685 } 686 if(!USTRINGTRIE_HAS_VALUE(result)) { 687 fprintf(stderr, "word %ld (0-based) not found\n", (long)i); 688 } 689 } 690 } 691 }; 692 693 UPerfFunction *DictionaryTriePerfTest::runIndexedTest(int32_t index, UBool exec, 694 const char *&name, char * /*par*/) { 695 if(hasFile()) { 696 switch(index) { 697 case 0: 698 name="compacttriematches"; 699 if(exec) { 700 return new CompactTrieDictLookup(*this); 701 } 702 break; 703 case 1: 704 name="ucharstriematches"; 705 if(exec) { 706 return new UCharsTrieDictMatches(*this); 707 } 708 break; 709 case 2: 710 name="ucharstriecontains"; 711 if(exec) { 712 return new UCharsTrieDictContains(*this); 713 } 714 break; 715 case 3: 716 name="bytestriematches"; 717 if(exec) { 718 return new BytesTrieDictMatches(*this); 719 } 720 break; 721 case 4: 722 name="bytestriecontains"; 723 if(exec) { 724 return new BytesTrieDictContains(*this); 725 } 726 break; 727 default: 728 name=""; 729 break; 730 } 731 } else { 732 if(index==0 && exec) { 733 puts("Running BytesTrie perf tests on the .dat package file from the --sourcedir.\n" 734 "For UCharsTrie perf tests on a dictionary text file, specify the -f or --file-name.\n"); 735 } 736 switch(index) { 737 case 0: 738 name="simplebinarysearch"; 739 if(exec) { 740 return new BinarySearchPackageLookup(*this); 741 } 742 break; 743 case 1: 744 name="prefixbinarysearch"; 745 if(exec) { 746 return new PrefixBinarySearchPackageLookup(*this); 747 } 748 break; 749 case 2: 750 name="bytestrie"; 751 if(exec) { 752 return new BytesTriePackageLookup(*this); 753 } 754 break; 755 default: 756 name=""; 757 break; 758 } 759 } 760 return NULL; 761 } 762 763 int main(int argc, const char *argv[]) { 764 IcuToolErrorCode errorCode("dicttrieperf main()"); 765 DictionaryTriePerfTest test(argc, argv, errorCode); 766 if(errorCode.isFailure()) { 767 fprintf(stderr, "DictionaryTriePerfTest() failed: %s\n", errorCode.errorName()); 768 test.usage(); 769 return errorCode.reset(); 770 } 771 if(!test.run()) { 772 fprintf(stderr, "FAILED: Tests could not be run, please check the arguments.\n"); 773 return -1; 774 } 775 return 0; 776 } 777