1 /** 2 ******************************************************************************* 3 * Copyright (C) 2006-2008, International Business Machines Corporation * 4 * and others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_BREAK_ITERATION 11 12 #include "triedict.h" 13 #include "unicode/chariter.h" 14 #include "unicode/uchriter.h" 15 #include "unicode/strenum.h" 16 #include "unicode/uenum.h" 17 #include "unicode/udata.h" 18 #include "cmemory.h" 19 #include "udataswp.h" 20 #include "uvector.h" 21 #include "uvectr32.h" 22 #include "uarrsort.h" 23 #include "hash.h" 24 25 //#define DEBUG_TRIE_DICT 1 26 27 #ifdef DEBUG_TRIE_DICT 28 #include <sys/times.h> 29 #include <limits.h> 30 #include <stdio.h> 31 #include <time.h> 32 #ifndef CLK_TCK 33 #define CLK_TCK CLOCKS_PER_SEC 34 #endif 35 36 #endif 37 38 U_NAMESPACE_BEGIN 39 40 /******************************************************************* 41 * TrieWordDictionary 42 */ 43 44 TrieWordDictionary::TrieWordDictionary() { 45 } 46 47 TrieWordDictionary::~TrieWordDictionary() { 48 } 49 50 /******************************************************************* 51 * MutableTrieDictionary 52 */ 53 54 //#define MAX_VALUE 65535 55 56 // forward declaration 57 inline uint16_t scaleLogProbabilities(double logprob); 58 59 // Node structure for the ternary, uncompressed trie 60 struct TernaryNode : public UMemory { 61 UChar ch; // UTF-16 code unit 62 uint16_t flags; // Flag word 63 TernaryNode *low; // Less-than link 64 TernaryNode *equal; // Equal link 65 TernaryNode *high; // Greater-than link 66 67 TernaryNode(UChar uc); 68 ~TernaryNode(); 69 }; 70 71 enum MutableTrieNodeFlags { 72 kEndsWord = 0x0001 // This node marks the end of a valid word 73 }; 74 75 inline 76 TernaryNode::TernaryNode(UChar uc) { 77 ch = uc; 78 flags = 0; 79 low = NULL; 80 equal = NULL; 81 high = NULL; 82 } 83 84 // Not inline since it's recursive 85 TernaryNode::~TernaryNode() { 86 delete low; 87 delete equal; 88 delete high; 89 } 90 91 MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status, 92 UBool containsValue /* = FALSE */ ) { 93 // Start the trie off with something. Having the root node already present 94 // cuts a special case out of the search/insertion functions. 95 // Making it a median character cuts the worse case for searches from 96 // 4x a balanced trie to 2x a balanced trie. It's best to choose something 97 // that starts a word that is midway in the list. 98 fTrie = new TernaryNode(median); 99 if (fTrie == NULL) { 100 status = U_MEMORY_ALLOCATION_ERROR; 101 } 102 fIter = utext_openUChars(NULL, NULL, 0, &status); 103 if (U_SUCCESS(status) && fIter == NULL) { 104 status = U_MEMORY_ALLOCATION_ERROR; 105 } 106 107 fValued = containsValue; 108 } 109 110 MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status, 111 UBool containsValue /* = false */ ) { 112 fTrie = NULL; 113 fIter = utext_openUChars(NULL, NULL, 0, &status); 114 if (U_SUCCESS(status) && fIter == NULL) { 115 status = U_MEMORY_ALLOCATION_ERROR; 116 } 117 118 fValued = containsValue; 119 } 120 121 MutableTrieDictionary::~MutableTrieDictionary() { 122 delete fTrie; 123 utext_close(fIter); 124 } 125 126 int32_t 127 MutableTrieDictionary::search( UText *text, 128 int32_t maxLength, 129 int32_t *lengths, 130 int &count, 131 int limit, 132 TernaryNode *&parent, 133 UBool &pMatched, 134 uint16_t *values /*=NULL*/) const { 135 // TODO: current implementation works in UTF-16 space 136 const TernaryNode *up = NULL; 137 const TernaryNode *p = fTrie; 138 int mycount = 0; 139 pMatched = TRUE; 140 int i; 141 142 if (!fValued) { 143 values = NULL; 144 } 145 146 UChar uc = utext_current32(text); 147 for (i = 0; i < maxLength && p != NULL; ++i) { 148 while (p != NULL) { 149 if (uc < p->ch) { 150 up = p; 151 p = p->low; 152 } 153 else if (uc == p->ch) { 154 break; 155 } 156 else { 157 up = p; 158 p = p->high; 159 } 160 } 161 if (p == NULL) { 162 pMatched = FALSE; 163 break; 164 } 165 // Must be equal to get here 166 if (limit > 0 && (p->flags > 0)) { 167 //is there a more efficient way to add values? ie. remove if stmt 168 if(values != NULL) { 169 values[mycount] = p->flags; 170 } 171 lengths[mycount++] = i+1; 172 --limit; 173 } 174 up = p; 175 p = p->equal; 176 uc = utext_next32(text); 177 uc = utext_current32(text); 178 } 179 180 // Note that there is no way to reach here with up == 0 unless 181 // maxLength is 0 coming in. 182 parent = (TernaryNode *)up; 183 count = mycount; 184 return i; 185 } 186 187 void 188 MutableTrieDictionary::addWord( const UChar *word, 189 int32_t length, 190 UErrorCode &status, 191 uint16_t value /* = 0 */ ) { 192 // dictionary cannot store zero values, would interfere with flags 193 if (length <= 0 || (!fValued && value > 0) || (fValued && value == 0)) { 194 status = U_ILLEGAL_ARGUMENT_ERROR; 195 return; 196 } 197 198 TernaryNode *parent; 199 UBool pMatched; 200 int count; 201 fIter = utext_openUChars(fIter, word, length, &status); 202 203 int matched; 204 matched = search(fIter, length, NULL, count, 0, parent, pMatched); 205 206 while (matched++ < length) { 207 UChar32 uc = utext_next32(fIter); // TODO: supplementary support? 208 U_ASSERT(uc != U_SENTINEL); 209 TernaryNode *newNode = new TernaryNode(uc); 210 if (newNode == NULL) { 211 status = U_MEMORY_ALLOCATION_ERROR; 212 return; 213 } 214 if (pMatched) { 215 parent->equal = newNode; 216 } 217 else { 218 pMatched = TRUE; 219 if (uc < parent->ch) { 220 parent->low = newNode; 221 } 222 else { 223 parent->high = newNode; 224 } 225 } 226 parent = newNode; 227 } 228 229 if(fValued && value > 0){ 230 parent->flags = value; 231 } else { 232 parent->flags |= kEndsWord; 233 } 234 } 235 236 int32_t 237 MutableTrieDictionary::matches( UText *text, 238 int32_t maxLength, 239 int32_t *lengths, 240 int &count, 241 int limit, 242 uint16_t *values /*=NULL*/) const { 243 TernaryNode *parent; 244 UBool pMatched; 245 return search(text, maxLength, lengths, count, limit, parent, pMatched, values); 246 } 247 248 // Implementation of iteration for MutableTrieDictionary 249 class MutableTrieEnumeration : public StringEnumeration { 250 private: 251 UStack fNodeStack; // Stack of nodes to process 252 UVector32 fBranchStack; // Stack of which branch we are working on 253 TernaryNode *fRoot; // Root node 254 enum StackBranch { 255 kLessThan, 256 kEqual, 257 kGreaterThan, 258 kDone 259 }; 260 261 public: 262 static UClassID U_EXPORT2 getStaticClassID(void); 263 virtual UClassID getDynamicClassID(void) const; 264 public: 265 MutableTrieEnumeration(TernaryNode *root, UErrorCode &status) 266 : fNodeStack(status), fBranchStack(status) { 267 fRoot = root; 268 fNodeStack.push(root, status); 269 fBranchStack.push(kLessThan, status); 270 unistr.remove(); 271 } 272 273 virtual ~MutableTrieEnumeration() { 274 } 275 276 virtual StringEnumeration *clone() const { 277 UErrorCode status = U_ZERO_ERROR; 278 return new MutableTrieEnumeration(fRoot, status); 279 } 280 281 virtual const UnicodeString *snext(UErrorCode &status) { 282 if (fNodeStack.empty() || U_FAILURE(status)) { 283 return NULL; 284 } 285 TernaryNode *node = (TernaryNode *) fNodeStack.peek(); 286 StackBranch where = (StackBranch) fBranchStack.peeki(); 287 while (!fNodeStack.empty() && U_SUCCESS(status)) { 288 UBool emit; 289 UBool equal; 290 291 switch (where) { 292 case kLessThan: 293 if (node->low != NULL) { 294 fBranchStack.setElementAt(kEqual, fBranchStack.size()-1); 295 node = (TernaryNode *) fNodeStack.push(node->low, status); 296 where = (StackBranch) fBranchStack.push(kLessThan, status); 297 break; 298 } 299 case kEqual: 300 emit = node->flags > 0; 301 equal = (node->equal != NULL); 302 // If this node should be part of the next emitted string, append 303 // the UChar to the string, and make sure we pop it when we come 304 // back to this node. The character should only be in the string 305 // for as long as we're traversing the equal subtree of this node 306 if (equal || emit) { 307 unistr.append(node->ch); 308 fBranchStack.setElementAt(kGreaterThan, fBranchStack.size()-1); 309 } 310 if (equal) { 311 node = (TernaryNode *) fNodeStack.push(node->equal, status); 312 where = (StackBranch) fBranchStack.push(kLessThan, status); 313 } 314 if (emit) { 315 return &unistr; 316 } 317 if (equal) { 318 break; 319 } 320 case kGreaterThan: 321 // If this node's character is in the string, remove it. 322 if (node->equal != NULL || node->flags > 0) { 323 unistr.truncate(unistr.length()-1); 324 } 325 if (node->high != NULL) { 326 fBranchStack.setElementAt(kDone, fBranchStack.size()-1); 327 node = (TernaryNode *) fNodeStack.push(node->high, status); 328 where = (StackBranch) fBranchStack.push(kLessThan, status); 329 break; 330 } 331 case kDone: 332 fNodeStack.pop(); 333 fBranchStack.popi(); 334 node = (TernaryNode *) fNodeStack.peek(); 335 where = (StackBranch) fBranchStack.peeki(); 336 break; 337 default: 338 return NULL; 339 } 340 } 341 return NULL; 342 } 343 344 // Very expensive, but this should never be used. 345 virtual int32_t count(UErrorCode &status) const { 346 MutableTrieEnumeration counter(fRoot, status); 347 int32_t result = 0; 348 while (counter.snext(status) != NULL && U_SUCCESS(status)) { 349 ++result; 350 } 351 return result; 352 } 353 354 virtual void reset(UErrorCode &status) { 355 fNodeStack.removeAllElements(); 356 fBranchStack.removeAllElements(); 357 fNodeStack.push(fRoot, status); 358 fBranchStack.push(kLessThan, status); 359 unistr.remove(); 360 } 361 }; 362 363 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(MutableTrieEnumeration) 364 365 StringEnumeration * 366 MutableTrieDictionary::openWords( UErrorCode &status ) const { 367 if (U_FAILURE(status)) { 368 return NULL; 369 } 370 return new MutableTrieEnumeration(fTrie, status); 371 } 372 373 /******************************************************************* 374 * CompactTrieDictionary 375 */ 376 377 //TODO further optimization: 378 // minimise size of trie with logprobs by storing values 379 // for terminal nodes directly in offsets[] 380 // --> calculating from next offset *might* be simpler, but would have to add 381 // one last offset for logprob of last node 382 // --> if calculate from current offset, need to factor in possible overflow 383 // as well. 384 // idea: store in offset, set first bit to indicate logprob storage-->won't 385 // have to access additional node 386 387 // {'Dic', 1}, version 1: uses old header, no values 388 #define COMPACT_TRIE_MAGIC_1 0x44696301 389 // version 2: uses new header (more than 2^16 nodes), no values 390 #define COMPACT_TRIE_MAGIC_2 0x44696302 391 // version 3: uses new header, includes values 392 #define COMPACT_TRIE_MAGIC_3 0x44696303 393 394 struct CompactTrieHeader { 395 uint32_t size; // Size of the data in bytes 396 uint32_t magic; // Magic number (including version) 397 uint32_t nodeCount; // Number of entries in offsets[] 398 uint32_t root; // Node number of the root node 399 uint32_t offsets[1]; // Offsets to nodes from start of data 400 }; 401 402 // old version of CompactTrieHeader kept for backwards compatibility 403 struct CompactTrieHeaderV1 { 404 uint32_t size; // Size of the data in bytes 405 uint32_t magic; // Magic number (including version) 406 uint16_t nodeCount; // Number of entries in offsets[] 407 uint16_t root; // Node number of the root node 408 uint32_t offsets[1]; // Offsets to nodes from start of data 409 }; 410 411 // Helper class for managing CompactTrieHeader and CompactTrieHeaderV1 412 struct CompactTrieInfo { 413 uint32_t size; // Size of the data in bytes 414 uint32_t magic; // Magic number (including version) 415 uint32_t nodeCount; // Number of entries in offsets[] 416 uint32_t root; // Node number of the root node 417 uint32_t *offsets; // Offsets to nodes from start of data 418 uint8_t *address; // pointer to header bytes in memory 419 420 CompactTrieInfo(const void *data, UErrorCode &status){ 421 CompactTrieHeader *header = (CompactTrieHeader *) data; 422 if (header->magic != COMPACT_TRIE_MAGIC_1 && 423 header->magic != COMPACT_TRIE_MAGIC_2 && 424 header->magic != COMPACT_TRIE_MAGIC_3) { 425 status = U_ILLEGAL_ARGUMENT_ERROR; 426 } else { 427 size = header->size; 428 magic = header->magic; 429 430 if (header->magic == COMPACT_TRIE_MAGIC_1) { 431 CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *) header; 432 nodeCount = headerV1->nodeCount; 433 root = headerV1->root; 434 offsets = &(headerV1->offsets[0]); 435 address = (uint8_t *)headerV1; 436 } else { 437 nodeCount = header->nodeCount; 438 root = header->root; 439 offsets = &(header->offsets[0]); 440 address = (uint8_t *)header; 441 } 442 } 443 } 444 445 ~CompactTrieInfo(){} 446 }; 447 448 // Note that to avoid platform-specific alignment issues, all members of the node 449 // structures should be the same size, or should contain explicit padding to 450 // natural alignment boundaries. 451 452 // We can't use a bitfield for the flags+count field, because the layout of those 453 // is not portable. 12 bits of count allows for up to 4096 entries in a node. 454 struct CompactTrieNode { 455 uint16_t flagscount; // Count of sub-entries, plus flags 456 }; 457 458 enum CompactTrieNodeFlags { 459 kVerticalNode = 0x1000, // This is a vertical node 460 kParentEndsWord = 0x2000, // The node whose equal link points to this ends a word 461 kExceedsCount = 0x4000, // new MSB for count >= 4096, originally kReservedFlag1 462 kEqualOverflows = 0x8000, // Links to nodeIDs > 2^16, orig. kReservedFlag2 463 kCountMask = 0x0FFF, // The count portion of flagscount 464 kFlagMask = 0xF000, // The flags portion of flagscount 465 kRootCountMask = 0x7FFF // The count portion of flagscount in the root node 466 467 //offset flags: 468 //kOffsetContainsValue = 0x80000000 // Offset contains value for parent node 469 }; 470 471 // The two node types are distinguished by the kVerticalNode flag. 472 473 struct CompactTrieHorizontalEntry { 474 uint16_t ch; // UChar 475 uint16_t equal; // Equal link node index 476 }; 477 478 // We don't use inheritance here because C++ does not guarantee that the 479 // base class comes first in memory!! 480 481 struct CompactTrieHorizontalNode { 482 uint16_t flagscount; // Count of sub-entries, plus flags 483 CompactTrieHorizontalEntry entries[1]; 484 }; 485 486 struct CompactTrieVerticalNode { 487 uint16_t flagscount; // Count of sub-entries, plus flags 488 uint16_t equal; // Equal link node index 489 uint16_t chars[1]; // Code units 490 }; 491 492 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj, 493 UErrorCode &status ) 494 : fUData(dataObj) 495 { 496 fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); 497 *fInfo = CompactTrieInfo(udata_getMemory(dataObj), status); 498 fOwnData = FALSE; 499 } 500 501 CompactTrieDictionary::CompactTrieDictionary( const void *data, 502 UErrorCode &status ) 503 : fUData(NULL) 504 { 505 fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); 506 *fInfo = CompactTrieInfo(data, status); 507 fOwnData = FALSE; 508 } 509 510 CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict, 511 UErrorCode &status ) 512 : fUData(NULL) 513 { 514 const CompactTrieHeader* header = compactMutableTrieDictionary(dict, status); 515 if (U_SUCCESS(status)) { 516 fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); 517 *fInfo = CompactTrieInfo(header, status); 518 } 519 520 fOwnData = !U_FAILURE(status); 521 } 522 523 CompactTrieDictionary::~CompactTrieDictionary() { 524 if (fOwnData) { 525 uprv_free((void *)(fInfo->address)); 526 } 527 uprv_free((void *)fInfo); 528 529 if (fUData) { 530 udata_close(fUData); 531 } 532 } 533 534 UBool CompactTrieDictionary::getValued() const{ 535 return fInfo->magic == COMPACT_TRIE_MAGIC_3; 536 } 537 538 uint32_t 539 CompactTrieDictionary::dataSize() const { 540 return fInfo->size; 541 } 542 543 const void * 544 CompactTrieDictionary::data() const { 545 return fInfo->address; 546 } 547 548 //This function finds the address of a node for us, given its node ID 549 static inline const CompactTrieNode * 550 getCompactNode(const CompactTrieInfo *info, uint32_t node) { 551 if(node < info->root-1) { 552 return (const CompactTrieNode *)(&info->offsets[node]); 553 } else { 554 return (const CompactTrieNode *)(info->address + info->offsets[node]); 555 } 556 } 557 558 //this version of getCompactNode is currently only used in compactMutableTrieDictionary() 559 static inline const CompactTrieNode * 560 getCompactNode(const CompactTrieHeader *header, uint32_t node) { 561 if(node < header->root-1) { 562 return (const CompactTrieNode *)(&header->offsets[node]); 563 } else { 564 return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]); 565 } 566 } 567 568 569 /** 570 * Calculates the number of links in a node 571 * @node The specified node 572 */ 573 static inline const uint16_t 574 getCount(const CompactTrieNode *node){ 575 return (node->flagscount & kCountMask); 576 //use the code below if number of links ever exceed 4096 577 //return (node->flagscount & kCountMask) + ((node->flagscount & kExceedsCount) >> 2); 578 } 579 580 /** 581 * calculates an equal link node ID of a horizontal node 582 * @hnode The horizontal node containing the equal link 583 * @param index The index into hnode->entries[] 584 * @param nodeCount The length of hnode->entries[] 585 */ 586 static inline uint32_t calcEqualLink(const CompactTrieVerticalNode *vnode){ 587 if(vnode->flagscount & kEqualOverflows){ 588 // treat overflow bits as an extension of chars[] 589 uint16_t *overflow = (uint16_t *) &vnode->chars[getCount((CompactTrieNode*)vnode)]; 590 return vnode->equal + (((uint32_t)*overflow) << 16); 591 }else{ 592 return vnode->equal; 593 } 594 } 595 596 /** 597 * calculates an equal link node ID of a horizontal node 598 * @hnode The horizontal node containing the equal link 599 * @param index The index into hnode->entries[] 600 * @param nodeCount The length of hnode->entries[] 601 */ 602 static inline uint32_t calcEqualLink(const CompactTrieHorizontalNode *hnode, uint16_t index, uint16_t nodeCount){ 603 if(hnode->flagscount & kEqualOverflows){ 604 //set overflow to point to the uint16_t containing the overflow bits 605 uint16_t *overflow = (uint16_t *) &hnode->entries[nodeCount]; 606 overflow += index/4; 607 uint16_t extraBits = (*overflow >> (3 - (index % 4)) * 4) % 0x10; 608 return hnode->entries[index].equal + (((uint32_t)extraBits) << 16); 609 } else { 610 return hnode->entries[index].equal; 611 } 612 } 613 614 /** 615 * Returns the value stored in the specified node which is associated with its 616 * parent node. 617 * TODO: how to tell that value is stored in node or in offset? check whether 618 * node ID < fInfo->root! 619 */ 620 static inline uint16_t getValue(const CompactTrieHorizontalNode *hnode){ 621 uint16_t count = getCount((CompactTrieNode *)hnode); 622 uint16_t overflowSize = 0; //size of node ID overflow storage in bytes 623 624 if(hnode->flagscount & kEqualOverflows) 625 overflowSize = (count + 3) / 4 * sizeof(uint16_t); 626 return *((uint16_t *)((uint8_t *)&hnode->entries[count] + overflowSize)); 627 } 628 629 static inline uint16_t getValue(const CompactTrieVerticalNode *vnode){ 630 // calculate size of total node ID overflow storage in bytes 631 uint16_t overflowSize = (vnode->flagscount & kEqualOverflows)? sizeof(uint16_t) : 0; 632 return *((uint16_t *)((uint8_t *)&vnode->chars[getCount((CompactTrieNode *)vnode)] + overflowSize)); 633 } 634 635 static inline uint16_t getValue(const CompactTrieNode *node){ 636 if(node->flagscount & kVerticalNode) 637 return getValue((const CompactTrieVerticalNode *)node); 638 else 639 return getValue((const CompactTrieHorizontalNode *)node); 640 } 641 642 //returns index of match in CompactTrieHorizontalNode.entries[] using binary search 643 inline int16_t 644 searchHorizontalEntries(const CompactTrieHorizontalEntry *entries, 645 UChar uc, uint16_t nodeCount){ 646 int low = 0; 647 int high = nodeCount-1; 648 int middle; 649 while (high >= low) { 650 middle = (high+low)/2; 651 if (uc == entries[middle].ch) { 652 return middle; 653 } 654 else if (uc < entries[middle].ch) { 655 high = middle-1; 656 } 657 else { 658 low = middle+1; 659 } 660 } 661 662 return -1; 663 } 664 665 int32_t 666 CompactTrieDictionary::matches( UText *text, 667 int32_t maxLength, 668 int32_t *lengths, 669 int &count, 670 int limit, 671 uint16_t *values /*= NULL*/) const { 672 if (fInfo->magic == COMPACT_TRIE_MAGIC_2) 673 values = NULL; 674 675 // TODO: current implementation works in UTF-16 space 676 const CompactTrieNode *node = getCompactNode(fInfo, fInfo->root); 677 int mycount = 0; 678 679 UChar uc = utext_current32(text); 680 int i = 0; 681 682 // handle root node with only kEqualOverflows flag: assume horizontal node without parent 683 if(node != NULL){ 684 const CompactTrieHorizontalNode *root = (const CompactTrieHorizontalNode *) node; 685 int index = searchHorizontalEntries(root->entries, uc, root->flagscount & kRootCountMask); 686 if(index > -1){ 687 node = getCompactNode(fInfo, calcEqualLink(root, index, root->flagscount & kRootCountMask)); 688 utext_next32(text); 689 uc = utext_current32(text); 690 ++i; 691 }else{ 692 node = NULL; 693 } 694 } 695 696 while (node != NULL) { 697 // Check if the node we just exited ends a word 698 if (limit > 0 && (node->flagscount & kParentEndsWord)) { 699 if(values != NULL){ 700 values[mycount] = getValue(node); 701 } 702 lengths[mycount++] = i; 703 --limit; 704 } 705 // Check that we haven't exceeded the maximum number of input characters. 706 // We have to do that here rather than in the while condition so that 707 // we can check for ending a word, above. 708 if (i >= maxLength) { 709 break; 710 } 711 712 int nodeCount = getCount(node); 713 if (nodeCount == 0) { 714 // Special terminal node; return now 715 break; 716 } 717 if (node->flagscount & kVerticalNode) { 718 // Vertical node; check all the characters in it 719 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node; 720 for (int j = 0; j < nodeCount && i < maxLength; ++j) { 721 if (uc != vnode->chars[j]) { 722 // We hit a non-equal character; return 723 goto exit; 724 } 725 utext_next32(text); 726 uc = utext_current32(text); 727 ++i; 728 } 729 // To get here we must have come through the whole list successfully; 730 // go on to the next node. Note that a word cannot end in the middle 731 // of a vertical node. 732 node = getCompactNode(fInfo, calcEqualLink(vnode)); 733 } 734 else { 735 // Horizontal node; do binary search 736 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node; 737 const CompactTrieHorizontalEntry *entries; 738 entries = hnode->entries; 739 740 int index = searchHorizontalEntries(entries, uc, nodeCount); 741 if(index > -1){ // 742 // We hit a match; get the next node and next character 743 node = getCompactNode(fInfo, calcEqualLink(hnode, index, nodeCount)); 744 utext_next32(text); 745 uc = utext_current32(text); 746 ++i; 747 }else{ 748 node = NULL; // If we don't find a match, we'll fall out of the loop 749 } 750 } 751 } 752 exit: 753 count = mycount; 754 return i; 755 } 756 757 // Implementation of iteration for CompactTrieDictionary 758 class CompactTrieEnumeration : public StringEnumeration { 759 private: 760 UVector32 fNodeStack; // Stack of nodes to process 761 UVector32 fIndexStack; // Stack of where in node we are 762 const CompactTrieInfo *fInfo; // Trie data 763 764 public: 765 static UClassID U_EXPORT2 getStaticClassID(void); 766 virtual UClassID getDynamicClassID(void) const; 767 public: 768 CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status) 769 : fNodeStack(status), fIndexStack(status) { 770 fInfo = info; 771 fNodeStack.push(info->root, status); 772 fIndexStack.push(0, status); 773 unistr.remove(); 774 } 775 776 virtual ~CompactTrieEnumeration() { 777 } 778 779 virtual StringEnumeration *clone() const { 780 UErrorCode status = U_ZERO_ERROR; 781 return new CompactTrieEnumeration(fInfo, status); 782 } 783 784 virtual const UnicodeString * snext(UErrorCode &status); 785 786 // Very expensive, but this should never be used. 787 virtual int32_t count(UErrorCode &status) const { 788 CompactTrieEnumeration counter(fInfo, status); 789 int32_t result = 0; 790 while (counter.snext(status) != NULL && U_SUCCESS(status)) { 791 ++result; 792 } 793 return result; 794 } 795 796 virtual void reset(UErrorCode &status) { 797 fNodeStack.removeAllElements(); 798 fIndexStack.removeAllElements(); 799 fNodeStack.push(fInfo->root, status); 800 fIndexStack.push(0, status); 801 unistr.remove(); 802 } 803 }; 804 805 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompactTrieEnumeration) 806 807 const UnicodeString * 808 CompactTrieEnumeration::snext(UErrorCode &status) { 809 if (fNodeStack.empty() || U_FAILURE(status)) { 810 return NULL; 811 } 812 const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki()); 813 int where = fIndexStack.peeki(); 814 while (!fNodeStack.empty() && U_SUCCESS(status)) { 815 int nodeCount; 816 817 bool isRoot = fNodeStack.peeki() == static_cast<int32_t>(fInfo->root); 818 if(isRoot){ 819 nodeCount = node->flagscount & kRootCountMask; 820 } else { 821 nodeCount = getCount(node); 822 } 823 824 UBool goingDown = FALSE; 825 if (nodeCount == 0) { 826 // Terminal node; go up immediately 827 fNodeStack.popi(); 828 fIndexStack.popi(); 829 node = getCompactNode(fInfo, fNodeStack.peeki()); 830 where = fIndexStack.peeki(); 831 } 832 else if ((node->flagscount & kVerticalNode) && !isRoot) { 833 // Vertical node 834 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node; 835 if (where == 0) { 836 // Going down 837 unistr.append((const UChar *)vnode->chars, nodeCount); 838 fIndexStack.setElementAt(1, fIndexStack.size()-1); 839 node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnode), status)); 840 where = fIndexStack.push(0, status); 841 goingDown = TRUE; 842 } 843 else { 844 // Going up 845 unistr.truncate(unistr.length()-nodeCount); 846 fNodeStack.popi(); 847 fIndexStack.popi(); 848 node = getCompactNode(fInfo, fNodeStack.peeki()); 849 where = fIndexStack.peeki(); 850 } 851 } 852 else { 853 // Horizontal node 854 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node; 855 if (where > 0) { 856 // Pop previous char 857 unistr.truncate(unistr.length()-1); 858 } 859 if (where < nodeCount) { 860 // Push on next node 861 unistr.append((UChar)hnode->entries[where].ch); 862 fIndexStack.setElementAt(where+1, fIndexStack.size()-1); 863 node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnode, where, nodeCount), status)); 864 where = fIndexStack.push(0, status); 865 goingDown = TRUE; 866 } 867 else { 868 // Going up 869 fNodeStack.popi(); 870 fIndexStack.popi(); 871 node = getCompactNode(fInfo, fNodeStack.peeki()); 872 where = fIndexStack.peeki(); 873 } 874 } 875 876 // Check if the parent of the node we've just gone down to ends a 877 // word. If so, return it. 878 // The root node should never end up here. 879 if (goingDown && (node->flagscount & kParentEndsWord)) { 880 return &unistr; 881 } 882 } 883 return NULL; 884 } 885 886 StringEnumeration * 887 CompactTrieDictionary::openWords( UErrorCode &status ) const { 888 if (U_FAILURE(status)) { 889 return NULL; 890 } 891 return new CompactTrieEnumeration(fInfo, status); 892 } 893 894 // 895 // Below here is all code related to converting a ternary trie to a compact trie 896 // and back again 897 // 898 899 enum CompactTrieNodeType { 900 kHorizontalType = 0, 901 kVerticalType = 1, 902 kValueType = 2 903 }; 904 905 /** 906 * The following classes (i.e. BuildCompactTrie*Node) are helper classes to 907 * construct the compact trie by storing information for each node and later 908 * writing the node to memory in a sequential format. 909 */ 910 class BuildCompactTrieNode: public UMemory { 911 public: 912 UBool fParentEndsWord; 913 CompactTrieNodeType fNodeType; 914 UBool fHasDuplicate; 915 UBool fEqualOverflows; 916 int32_t fNodeID; 917 UnicodeString fChars; 918 uint16_t fValue; 919 920 public: 921 BuildCompactTrieNode(UBool parentEndsWord, CompactTrieNodeType nodeType, 922 UStack &nodes, UErrorCode &status, uint16_t value = 0) { 923 fParentEndsWord = parentEndsWord; 924 fHasDuplicate = FALSE; 925 fNodeType = nodeType; 926 fEqualOverflows = FALSE; 927 fNodeID = nodes.size(); 928 fValue = parentEndsWord? value : 0; 929 nodes.push(this, status); 930 } 931 932 virtual ~BuildCompactTrieNode() { 933 } 934 935 virtual uint32_t size() { 936 if(fValue > 0) 937 return sizeof(uint16_t) * 2; 938 else 939 return sizeof(uint16_t); 940 } 941 942 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*translate*/) { 943 // Write flag/count 944 945 // if this ever fails, a flag bit (i.e. kExceedsCount) will need to be 946 // used as a 5th MSB. 947 U_ASSERT(fChars.length() < 4096 || fNodeID == 2); 948 949 *((uint16_t *)(bytes+offset)) = (fEqualOverflows? kEqualOverflows : 0) | 950 ((fNodeID == 2)? (fChars.length() & kRootCountMask): 951 ( 952 (fChars.length() & kCountMask) | 953 //((fChars.length() << 2) & kExceedsCount) | 954 (fNodeType == kVerticalType ? kVerticalNode : 0) | 955 (fParentEndsWord ? kParentEndsWord : 0 ) 956 ) 957 ); 958 offset += sizeof(uint16_t); 959 } 960 961 virtual void writeValue(uint8_t *bytes, uint32_t &offset) { 962 if(fValue > 0){ 963 *((uint16_t *)(bytes+offset)) = fValue; 964 offset += sizeof(uint16_t); 965 } 966 } 967 968 }; 969 970 /** 971 * Stores value of parent terminating nodes that have no more subtries. 972 */ 973 class BuildCompactTrieValueNode: public BuildCompactTrieNode { 974 public: 975 BuildCompactTrieValueNode(UStack &nodes, UErrorCode &status, uint16_t value) 976 : BuildCompactTrieNode(TRUE, kValueType, nodes, status, value){ 977 } 978 979 virtual ~BuildCompactTrieValueNode(){ 980 } 981 982 virtual uint32_t size() { 983 return sizeof(uint16_t) * 2; 984 } 985 986 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) { 987 // don't write value directly to memory but store it in offset to be written later 988 //offset = fValue & kOffsetContainsValue; 989 BuildCompactTrieNode::write(bytes, offset, translate); 990 BuildCompactTrieNode::writeValue(bytes, offset); 991 } 992 }; 993 994 class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode { 995 public: 996 UStack fLinks; 997 UBool fMayOverflow; //intermediate value for fEqualOverflows 998 999 public: 1000 BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0) 1001 : BuildCompactTrieNode(parentEndsWord, kHorizontalType, nodes, status, value), fLinks(status) { 1002 fMayOverflow = FALSE; 1003 } 1004 1005 virtual ~BuildCompactTrieHorizontalNode() { 1006 } 1007 1008 // It is impossible to know beforehand exactly how much space the node will 1009 // need in memory before being written, because the node IDs in the equal 1010 // links may or may not overflow after node coalescing. Therefore, this method 1011 // returns the maximum size possible for the node. 1012 virtual uint32_t size() { 1013 uint32_t estimatedSize = offsetof(CompactTrieHorizontalNode,entries) + 1014 (fChars.length()*sizeof(CompactTrieHorizontalEntry)); 1015 1016 if(fValue > 0) 1017 estimatedSize += sizeof(uint16_t); 1018 1019 //estimate extra space needed to store overflow for node ID links 1020 //may be more than what is actually needed 1021 for(int i=0; i < fChars.length(); i++){ 1022 if(((BuildCompactTrieNode *)fLinks[i])->fNodeID > 0xFFFF){ 1023 fMayOverflow = TRUE; 1024 break; 1025 } 1026 } 1027 if(fMayOverflow) // added space for overflow should be same as ceil(fChars.length()/4) * sizeof(uint16_t) 1028 estimatedSize += (sizeof(uint16_t) * fChars.length() + 2)/4; 1029 1030 return estimatedSize; 1031 } 1032 1033 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) { 1034 int32_t count = fChars.length(); 1035 1036 //if largest nodeID > 2^16, set flag 1037 //large node IDs are more likely to be at the back of the array 1038 for (int32_t i = count-1; i >= 0; --i) { 1039 if(translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) > 0xFFFF){ 1040 fEqualOverflows = TRUE; 1041 break; 1042 } 1043 } 1044 1045 BuildCompactTrieNode::write(bytes, offset, translate); 1046 1047 // write entries[] to memory 1048 for (int32_t i = 0; i < count; ++i) { 1049 CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(bytes+offset); 1050 entry->ch = fChars[i]; 1051 entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID); 1052 #ifdef DEBUG_TRIE_DICT 1053 1054 if ((entry->equal == 0) && !fEqualOverflows) { 1055 fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps to physical node zero\n", 1056 i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID); 1057 } 1058 #endif 1059 offset += sizeof(CompactTrieHorizontalEntry); 1060 } 1061 1062 // append extra bits of equal nodes to end if fEqualOverflows 1063 if (fEqualOverflows) { 1064 uint16_t leftmostBits = 0; 1065 for (int16_t i = 0; i < count; i++) { 1066 leftmostBits = (leftmostBits << 4) | getLeftmostBits(translate, i); 1067 1068 // write filled uint16_t to memory 1069 if(i % 4 == 3){ 1070 *((uint16_t *)(bytes+offset)) = leftmostBits; 1071 leftmostBits = 0; 1072 offset += sizeof(uint16_t); 1073 } 1074 } 1075 1076 // pad last uint16_t with zeroes if necessary 1077 int remainder = count % 4; 1078 if (remainder > 0) { 1079 *((uint16_t *)(bytes+offset)) = (leftmostBits << (16 - 4 * remainder)); 1080 offset += sizeof(uint16_t); 1081 } 1082 } 1083 1084 BuildCompactTrieNode::writeValue(bytes, offset); 1085 } 1086 1087 // returns leftmost bits of physical node link 1088 uint16_t getLeftmostBits(const UVector32 &translate, uint32_t i){ 1089 uint16_t leftmostBits = (uint16_t) (translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) >> 16); 1090 #ifdef DEBUG_TRIE_DICT 1091 if (leftmostBits > 0xF) { 1092 fprintf(stderr, "ERROR: horizontal link %d, logical node %d exceeds maximum possible node ID value\n", 1093 i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID); 1094 } 1095 #endif 1096 return leftmostBits; 1097 } 1098 1099 void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) { 1100 fChars.append(ch); 1101 fLinks.push(link, status); 1102 } 1103 1104 }; 1105 1106 class BuildCompactTrieVerticalNode: public BuildCompactTrieNode { 1107 public: 1108 BuildCompactTrieNode *fEqual; 1109 1110 public: 1111 BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0) 1112 : BuildCompactTrieNode(parentEndsWord, kVerticalType, nodes, status, value) { 1113 fEqual = NULL; 1114 } 1115 1116 virtual ~BuildCompactTrieVerticalNode() { 1117 } 1118 1119 // Returns the maximum possible size of this node. See comment in 1120 // BuildCompactTrieHorizontal node for more information. 1121 virtual uint32_t size() { 1122 uint32_t estimatedSize = offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t)); 1123 if(fValue > 0){ 1124 estimatedSize += sizeof(uint16_t); 1125 } 1126 1127 if(fEqual->fNodeID > 0xFFFF){ 1128 estimatedSize += sizeof(uint16_t); 1129 } 1130 return estimatedSize; 1131 } 1132 1133 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) { 1134 CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset); 1135 fEqualOverflows = (translate.elementAti(fEqual->fNodeID) > 0xFFFF); 1136 BuildCompactTrieNode::write(bytes, offset, translate); 1137 node->equal = translate.elementAti(fEqual->fNodeID); 1138 offset += sizeof(node->equal); 1139 #ifdef DEBUG_TRIE_DICT 1140 if ((node->equal == 0) && !fEqualOverflows) { 1141 fprintf(stderr, "ERROR: vertical link, logical node %d maps to physical node zero\n", 1142 fEqual->fNodeID); 1143 } 1144 #endif 1145 fChars.extract(0, fChars.length(), (UChar *)node->chars); 1146 offset += sizeof(UChar)*fChars.length(); 1147 1148 // append 16 bits of to end for equal node if fEqualOverflows 1149 if (fEqualOverflows) { 1150 *((uint16_t *)(bytes+offset)) = (translate.elementAti(fEqual->fNodeID) >> 16); 1151 offset += sizeof(uint16_t); 1152 } 1153 1154 BuildCompactTrieNode::writeValue(bytes, offset); 1155 } 1156 1157 void addChar(UChar ch) { 1158 fChars.append(ch); 1159 } 1160 1161 void setLink(BuildCompactTrieNode *node) { 1162 fEqual = node; 1163 } 1164 1165 }; 1166 1167 // Forward declaration 1168 static void walkHorizontal(const TernaryNode *node, 1169 BuildCompactTrieHorizontalNode *building, 1170 UStack &nodes, 1171 UErrorCode &status, 1172 Hashtable *values); 1173 1174 // Convert one TernaryNode into a BuildCompactTrieNode. Uses recursion. 1175 1176 static BuildCompactTrieNode * 1177 compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, 1178 UErrorCode &status, Hashtable *values = NULL, uint16_t parentValue = 0) { 1179 if (U_FAILURE(status)) { 1180 return NULL; 1181 } 1182 BuildCompactTrieNode *result = NULL; 1183 UBool horizontal = (node->low != NULL || node->high != NULL); 1184 if (horizontal) { 1185 BuildCompactTrieHorizontalNode *hResult; 1186 if(values != NULL){ 1187 hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status, parentValue); 1188 } else { 1189 hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status); 1190 } 1191 1192 if (hResult == NULL) { 1193 status = U_MEMORY_ALLOCATION_ERROR; 1194 return NULL; 1195 } 1196 if (U_SUCCESS(status)) { 1197 walkHorizontal(node, hResult, nodes, status, values); 1198 result = hResult; 1199 } 1200 } 1201 else { 1202 BuildCompactTrieVerticalNode *vResult; 1203 if(values != NULL){ 1204 vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status, parentValue); 1205 } else { 1206 vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status); 1207 } 1208 1209 if (vResult == NULL) { 1210 status = U_MEMORY_ALLOCATION_ERROR; 1211 return NULL; 1212 } 1213 else if (U_SUCCESS(status)) { 1214 uint16_t value = 0; 1215 UBool endsWord = FALSE; 1216 // Take up nodes until we end a word, or hit a node with < or > links 1217 do { 1218 vResult->addChar(node->ch); 1219 value = node->flags; 1220 endsWord = value > 0; 1221 node = node->equal; 1222 } 1223 while(node != NULL && !endsWord && node->low == NULL && node->high == NULL); 1224 1225 if (node == NULL) { 1226 if (!endsWord) { 1227 status = U_ILLEGAL_ARGUMENT_ERROR; // Corrupt input trie 1228 } 1229 else if(values != NULL){ 1230 UnicodeString key(value); //store value as a single-char UnicodeString 1231 BuildCompactTrieValueNode *link = (BuildCompactTrieValueNode *) values->get(key); 1232 if(link == NULL){ 1233 link = new BuildCompactTrieValueNode(nodes, status, value); //take out nodes? 1234 values->put(key, link, status); 1235 } 1236 vResult->setLink(link); 1237 } else { 1238 vResult->setLink((BuildCompactTrieNode *)nodes[1]); 1239 } 1240 } 1241 else { 1242 vResult->setLink(compactOneNode(node, endsWord, nodes, status, values, value)); 1243 } 1244 result = vResult; 1245 } 1246 } 1247 return result; 1248 } 1249 1250 // Walk the set of peers at the same level, to build a horizontal node. 1251 // Uses recursion. 1252 1253 static void walkHorizontal(const TernaryNode *node, 1254 BuildCompactTrieHorizontalNode *building, 1255 UStack &nodes, 1256 UErrorCode &status, Hashtable *values = NULL) { 1257 while (U_SUCCESS(status) && node != NULL) { 1258 if (node->low != NULL) { 1259 walkHorizontal(node->low, building, nodes, status, values); 1260 } 1261 BuildCompactTrieNode *link = NULL; 1262 if (node->equal != NULL) { 1263 link = compactOneNode(node->equal, node->flags > 0, nodes, status, values, node->flags); 1264 } 1265 else if (node->flags > 0) { 1266 if(values != NULL) { 1267 UnicodeString key(node->flags); //store value as a single-char UnicodeString 1268 link = (BuildCompactTrieValueNode *) values->get(key); 1269 if(link == NULL) { 1270 link = new BuildCompactTrieValueNode(nodes, status, node->flags); //take out nodes? 1271 values->put(key, link, status); 1272 } 1273 } else { 1274 link = (BuildCompactTrieNode *)nodes[1]; 1275 } 1276 } 1277 if (U_SUCCESS(status) && link != NULL) { 1278 building->addNode(node->ch, link, status); 1279 } 1280 // Tail recurse manually instead of leaving it to the compiler. 1281 //if (node->high != NULL) { 1282 // walkHorizontal(node->high, building, nodes, status); 1283 //} 1284 node = node->high; 1285 } 1286 } 1287 1288 U_NAMESPACE_END 1289 U_NAMESPACE_USE 1290 U_CDECL_BEGIN 1291 static int32_t U_CALLCONV 1292 _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) { 1293 BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl; 1294 BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr; 1295 1296 // Check for comparing a node to itself, to avoid spurious duplicates 1297 if (left == right) { 1298 return 0; 1299 } 1300 1301 // Most significant is type of node. Can never coalesce. 1302 if (left->fNodeType != right->fNodeType) { 1303 return left->fNodeType - right->fNodeType; 1304 } 1305 // Next, the "parent ends word" flag. If that differs, we cannot coalesce. 1306 if (left->fParentEndsWord != right->fParentEndsWord) { 1307 return left->fParentEndsWord - right->fParentEndsWord; 1308 } 1309 // Next, the string. If that differs, we can never coalesce. 1310 int32_t result = left->fChars.compare(right->fChars); 1311 if (result != 0) { 1312 return result; 1313 } 1314 1315 // If the node value differs, we should not coalesce. 1316 // If values aren't stored, all fValues should be 0. 1317 if (left->fValue != right->fValue) { 1318 return left->fValue - right->fValue; 1319 } 1320 1321 // We know they're both the same node type, so branch for the two cases. 1322 if (left->fNodeType == kVerticalType) { 1323 result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID 1324 - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID; 1325 } 1326 else if(left->fChars.length() > 0 && right->fChars.length() > 0){ 1327 // We need to compare the links vectors. They should be the 1328 // same size because the strings were equal. 1329 // We compare the node IDs instead of the pointers, to handle 1330 // coalesced nodes. 1331 BuildCompactTrieHorizontalNode *hleft, *hright; 1332 hleft = (BuildCompactTrieHorizontalNode *)left; 1333 hright = (BuildCompactTrieHorizontalNode *)right; 1334 int32_t count = hleft->fLinks.size(); 1335 for (int32_t i = 0; i < count && result == 0; ++i) { 1336 result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID - 1337 ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; 1338 } 1339 } 1340 1341 // If they are equal to each other, mark them (speeds coalescing) 1342 if (result == 0) { 1343 left->fHasDuplicate = TRUE; 1344 right->fHasDuplicate = TRUE; 1345 } 1346 return result; 1347 } 1348 U_CDECL_END 1349 U_NAMESPACE_BEGIN 1350 1351 static void coalesceDuplicates(UStack &nodes, UErrorCode &status) { 1352 // We sort the array of nodes to place duplicates next to each other 1353 if (U_FAILURE(status)) { 1354 return; 1355 } 1356 int32_t size = nodes.size(); 1357 void **array = (void **)uprv_malloc(sizeof(void *)*size); 1358 if (array == NULL) { 1359 status = U_MEMORY_ALLOCATION_ERROR; 1360 return; 1361 } 1362 (void) nodes.toArray(array); 1363 1364 // Now repeatedly identify duplicates until there are no more 1365 int32_t dupes = 0; 1366 long passCount = 0; 1367 #ifdef DEBUG_TRIE_DICT 1368 long totalDupes = 0; 1369 #endif 1370 do { 1371 BuildCompactTrieNode *node; 1372 BuildCompactTrieNode *first = NULL; 1373 BuildCompactTrieNode **p; 1374 BuildCompactTrieNode **pFirst = NULL; 1375 int32_t counter = size - 2; 1376 // Sort the array, skipping nodes 0 and 1. Use quicksort for the first 1377 // pass for speed. For the second and subsequent passes, we use stable 1378 // (insertion) sort for two reasons: 1379 // 1. The array is already mostly ordered, so we get better performance. 1380 // 2. The way we find one and only one instance of a set of duplicates is to 1381 // check that the node ID equals the array index. If we used an unstable 1382 // sort for the second or later passes, it's possible that none of the 1383 // duplicates would wind up with a node ID equal to its array index. 1384 // The sort stability guarantees that, because as we coalesce more and 1385 // more groups, the first element of the resultant group will be one of 1386 // the first elements of the groups being coalesced. 1387 // To use quicksort for the second and subsequent passes, we would have to 1388 // find the minimum of the node numbers in a group, and set all the nodes 1389 // in the group to that node number. 1390 uprv_sortArray(array+2, counter, sizeof(void *), _sortBuildNodes, NULL, (passCount > 0), &status); 1391 dupes = 0; 1392 for (p = (BuildCompactTrieNode **)array + 2; counter > 0; --counter, ++p) { 1393 node = *p; 1394 if (node->fHasDuplicate) { 1395 if (first == NULL) { 1396 first = node; 1397 pFirst = p; 1398 } 1399 else if (_sortBuildNodes(NULL, pFirst, p) != 0) { 1400 // Starting a new run of dupes 1401 first = node; 1402 pFirst = p; 1403 } 1404 else if (node->fNodeID != first->fNodeID) { 1405 // Slave one to the other, note duplicate 1406 node->fNodeID = first->fNodeID; 1407 dupes += 1; 1408 } 1409 } 1410 else { 1411 // This node has no dupes 1412 first = NULL; 1413 pFirst = NULL; 1414 } 1415 } 1416 passCount += 1; 1417 #ifdef DEBUG_TRIE_DICT 1418 totalDupes += dupes; 1419 fprintf(stderr, "Trie node dupe removal, pass %d: %d nodes tagged\n", passCount, dupes); 1420 #endif 1421 } 1422 while (dupes > 0); 1423 #ifdef DEBUG_TRIE_DICT 1424 fprintf(stderr, "Trie node dupe removal complete: %d tagged in %d passes\n", totalDupes, passCount); 1425 #endif 1426 1427 // We no longer need the temporary array, as the nodes have all been marked appropriately. 1428 uprv_free(array); 1429 } 1430 1431 U_NAMESPACE_END 1432 U_CDECL_BEGIN 1433 static void U_CALLCONV _deleteBuildNode(void *obj) { 1434 delete (BuildCompactTrieNode *) obj; 1435 } 1436 U_CDECL_END 1437 U_NAMESPACE_BEGIN 1438 1439 CompactTrieHeader * 1440 CompactTrieDictionary::compactMutableTrieDictionary( const MutableTrieDictionary &dict, 1441 UErrorCode &status ) { 1442 if (U_FAILURE(status)) { 1443 return NULL; 1444 } 1445 #ifdef DEBUG_TRIE_DICT 1446 struct tms timing; 1447 struct tms previous; 1448 (void) ::times(&previous); 1449 #endif 1450 UStack nodes(_deleteBuildNode, NULL, status); // Index of nodes 1451 1452 // Add node 0, used as the NULL pointer/sentinel. 1453 nodes.addElement((int32_t)0, status); 1454 1455 Hashtable *values = NULL; // Index of (unique) values 1456 if (dict.fValued) { 1457 values = new Hashtable(status); 1458 } 1459 1460 // Start by creating the special empty node we use to indicate that the parent 1461 // terminates a word. This must be node 1, because the builder assumes 1462 // that. This node will never be used for tries storing numerical values. 1463 if (U_FAILURE(status)) { 1464 return NULL; 1465 } 1466 BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontalType, nodes, status); 1467 if (terminal == NULL) { 1468 status = U_MEMORY_ALLOCATION_ERROR; 1469 } 1470 1471 // This call does all the work of building the new trie structure. The root 1472 // will have node ID 2 before writing to memory. 1473 BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status, values); 1474 #ifdef DEBUG_TRIE_DICT 1475 (void) ::times(&timing); 1476 fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n", 1477 nodes.size(), (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, 1478 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); 1479 previous = timing; 1480 #endif 1481 1482 // Now coalesce all duplicate nodes. 1483 coalesceDuplicates(nodes, status); 1484 #ifdef DEBUG_TRIE_DICT 1485 (void) ::times(&timing); 1486 fprintf(stderr, "Duplicates coalesced, time user %f system %f\n", 1487 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, 1488 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); 1489 previous = timing; 1490 #endif 1491 1492 // Next, build the output trie. 1493 // First we compute all the sizes and build the node ID translation table. 1494 uint32_t totalSize = offsetof(CompactTrieHeader,offsets); 1495 int32_t count = nodes.size(); 1496 int32_t nodeCount = 1; // The sentinel node we already have 1497 BuildCompactTrieNode *node; 1498 int32_t i; 1499 UVector32 translate(count, status); // Should be no growth needed after this 1500 translate.push(0, status); // The sentinel node 1501 1502 if (U_FAILURE(status)) { 1503 return NULL; 1504 } 1505 1506 //map terminal value nodes 1507 int valueCount = 0; 1508 UVector valueNodes(status); 1509 if(values != NULL) { 1510 valueCount = values->count(); //number of unique terminal value nodes 1511 } 1512 1513 // map non-terminal nodes 1514 int valuePos = 1;//, nodePos = valueCount + valuePos; 1515 nodeCount = valueCount + valuePos; 1516 for (i = 1; i < count; ++i) { 1517 node = (BuildCompactTrieNode *)nodes[i]; 1518 if (node->fNodeID == i) { 1519 // Only one node out of each duplicate set is used 1520 if (node->fNodeID >= translate.size()) { 1521 // Logically extend the mapping table 1522 translate.setSize(i + 1); 1523 } 1524 //translate.setElementAt(object, index)! 1525 if(node->fNodeType == kValueType) { 1526 valueNodes.addElement(node, status); 1527 translate.setElementAt(valuePos++, i); 1528 } else { 1529 translate.setElementAt(nodeCount++, i); 1530 } 1531 totalSize += node->size(); 1532 } 1533 } 1534 1535 // Check for overflowing 20 bits worth of nodes. 1536 if (nodeCount > 0x100000) { 1537 status = U_ILLEGAL_ARGUMENT_ERROR; 1538 return NULL; 1539 } 1540 1541 // Add enough room for the offsets. 1542 totalSize += nodeCount*sizeof(uint32_t); 1543 #ifdef DEBUG_TRIE_DICT 1544 (void) ::times(&timing); 1545 fprintf(stderr, "Sizes/mapping done, time user %f system %f\n", 1546 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, 1547 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); 1548 previous = timing; 1549 fprintf(stderr, "%d nodes, %d unique, %d bytes\n", nodes.size(), nodeCount, totalSize); 1550 #endif 1551 uint8_t *bytes = (uint8_t *)uprv_malloc(totalSize); 1552 if (bytes == NULL) { 1553 status = U_MEMORY_ALLOCATION_ERROR; 1554 return NULL; 1555 } 1556 1557 CompactTrieHeader *header = (CompactTrieHeader *)bytes; 1558 //header->size = totalSize; 1559 if(dict.fValued){ 1560 header->magic = COMPACT_TRIE_MAGIC_3; 1561 } else { 1562 header->magic = COMPACT_TRIE_MAGIC_2; 1563 } 1564 header->nodeCount = nodeCount; 1565 header->offsets[0] = 0; // Sentinel 1566 header->root = translate.elementAti(root->fNodeID); 1567 #ifdef DEBUG_TRIE_DICT 1568 if (header->root == 0) { 1569 fprintf(stderr, "ERROR: root node %d translate to physical zero\n", root->fNodeID); 1570 } 1571 #endif 1572 uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint32_t)); 1573 nodeCount = valueCount + 1; 1574 1575 // Write terminal value nodes to memory 1576 for (i=0; i < valueNodes.size(); i++) { 1577 //header->offsets[i + 1] = offset; 1578 uint32_t tmpOffset = 0; 1579 node = (BuildCompactTrieNode *) valueNodes.elementAt(i); 1580 //header->offsets[i + 1] = (uint32_t)node->fValue; 1581 node->write((uint8_t *)&header->offsets[i+1], tmpOffset, translate); 1582 } 1583 1584 // Now write the data 1585 for (i = 1; i < count; ++i) { 1586 node = (BuildCompactTrieNode *)nodes[i]; 1587 if (node->fNodeID == i && node->fNodeType != kValueType) { 1588 header->offsets[nodeCount++] = offset; 1589 node->write(bytes, offset, translate); 1590 } 1591 } 1592 1593 //free all extra space 1594 uprv_realloc(bytes, offset); 1595 header->size = offset; 1596 1597 #ifdef DEBUG_TRIE_DICT 1598 fprintf(stdout, "Space freed: %d\n", totalSize-offset); 1599 1600 (void) ::times(&timing); 1601 fprintf(stderr, "Trie built, time user %f system %f\n", 1602 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, 1603 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); 1604 previous = timing; 1605 fprintf(stderr, "Final offset is %d\n", offset); 1606 1607 // Collect statistics on node types and sizes 1608 int hCount = 0; 1609 int vCount = 0; 1610 size_t hSize = 0; 1611 size_t vSize = 0; 1612 size_t hItemCount = 0; 1613 size_t vItemCount = 0; 1614 uint32_t previousOff = offset; 1615 uint32_t numOverflow = 0; 1616 uint32_t valueSpace = 0; 1617 for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { 1618 const CompactTrieNode *node = getCompactNode(header, nodeIdx); 1619 int itemCount; 1620 if(nodeIdx == header->root) 1621 itemCount = node->flagscount & kRootCountMask; 1622 else 1623 itemCount = getCount(node); 1624 if(node->flagscount & kEqualOverflows){ 1625 numOverflow++; 1626 } 1627 if (node->flagscount & kVerticalNode && nodeIdx != header->root) { 1628 vCount += 1; 1629 vItemCount += itemCount; 1630 vSize += previousOff-header->offsets[nodeIdx]; 1631 } 1632 else { 1633 hCount += 1; 1634 hItemCount += itemCount; 1635 if(nodeIdx >= header->root) { 1636 hSize += previousOff-header->offsets[nodeIdx]; 1637 } 1638 } 1639 1640 if(header->magic == COMPACT_TRIE_MAGIC_3 && node->flagscount & kParentEndsWord) 1641 valueSpace += sizeof(uint16_t); 1642 previousOff = header->offsets[nodeIdx]; 1643 } 1644 fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\n", hCount, 1645 (double)hSize/hCount, (double)hItemCount/hCount); 1646 fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n", vCount, 1647 (double)vSize/vCount, (double)vItemCount/vCount); 1648 fprintf(stderr, "Number of nodes with overflowing nodeIDs: %d \n", numOverflow); 1649 fprintf(stderr, "Space taken up by values: %d \n", valueSpace); 1650 #endif 1651 1652 if (U_FAILURE(status)) { 1653 uprv_free(bytes); 1654 header = NULL; 1655 } 1656 return header; 1657 } 1658 1659 // Forward declaration 1660 static TernaryNode * 1661 unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status ); 1662 1663 // Convert a horizontal node (or subarray thereof) into a ternary subtrie 1664 static TernaryNode * 1665 unpackHorizontalArray( const CompactTrieInfo *info, const CompactTrieHorizontalNode *hnode, 1666 int low, int high, int nodeCount, UErrorCode &status) { 1667 if (U_FAILURE(status) || low > high) { 1668 return NULL; 1669 } 1670 int middle = (low+high)/2; 1671 TernaryNode *result = new TernaryNode(hnode->entries[middle].ch); 1672 if (result == NULL) { 1673 status = U_MEMORY_ALLOCATION_ERROR; 1674 return NULL; 1675 } 1676 const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(hnode, middle, nodeCount)); 1677 if (equal->flagscount & kParentEndsWord) { 1678 if(info->magic == COMPACT_TRIE_MAGIC_3){ 1679 result->flags = getValue(equal); 1680 }else{ 1681 result->flags |= kEndsWord; 1682 } 1683 } 1684 result->low = unpackHorizontalArray(info, hnode, low, middle-1, nodeCount, status); 1685 result->high = unpackHorizontalArray(info, hnode, middle+1, high, nodeCount, status); 1686 result->equal = unpackOneNode(info, equal, status); 1687 return result; 1688 } 1689 1690 // Convert one compact trie node into a ternary subtrie 1691 static TernaryNode * 1692 unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status ) { 1693 int nodeCount = getCount(node); 1694 if (nodeCount == 0 || U_FAILURE(status)) { 1695 // Failure, or terminal node 1696 return NULL; 1697 } 1698 if (node->flagscount & kVerticalNode) { 1699 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node; 1700 TernaryNode *head = NULL; 1701 TernaryNode *previous = NULL; 1702 TernaryNode *latest = NULL; 1703 for (int i = 0; i < nodeCount; ++i) { 1704 latest = new TernaryNode(vnode->chars[i]); 1705 if (latest == NULL) { 1706 status = U_MEMORY_ALLOCATION_ERROR; 1707 break; 1708 } 1709 if (head == NULL) { 1710 head = latest; 1711 } 1712 if (previous != NULL) { 1713 previous->equal = latest; 1714 } 1715 previous = latest; 1716 } 1717 if (latest != NULL) { 1718 const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(vnode)); 1719 if (equal->flagscount & kParentEndsWord) { 1720 if(info->magic == COMPACT_TRIE_MAGIC_3){ 1721 latest->flags = getValue(equal); 1722 } else { 1723 latest->flags |= kEndsWord; 1724 } 1725 } 1726 latest->equal = unpackOneNode(info, equal, status); 1727 } 1728 return head; 1729 } 1730 else { 1731 // Horizontal node 1732 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node; 1733 return unpackHorizontalArray(info, hnode, 0, nodeCount-1, nodeCount, status); 1734 } 1735 } 1736 1737 // returns a MutableTrieDictionary generated from the CompactTrieDictionary 1738 MutableTrieDictionary * 1739 CompactTrieDictionary::cloneMutable( UErrorCode &status ) const { 1740 MutableTrieDictionary *result = new MutableTrieDictionary( status, fInfo->magic == COMPACT_TRIE_MAGIC_3 ); 1741 if (result == NULL) { 1742 status = U_MEMORY_ALLOCATION_ERROR; 1743 return NULL; 1744 } 1745 // treat root node as special case: don't call unpackOneNode() or unpackHorizontalArray() directly 1746 // because only kEqualOverflows flag should be checked in root's flagscount 1747 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *) 1748 getCompactNode(fInfo, fInfo->root); 1749 uint16_t nodeCount = hnode->flagscount & kRootCountMask; 1750 TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1, 1751 nodeCount, status); 1752 1753 if (U_FAILURE(status)) { 1754 delete root; // Clean up 1755 delete result; 1756 return NULL; 1757 } 1758 result->fTrie = root; 1759 return result; 1760 } 1761 1762 U_NAMESPACE_END 1763 1764 U_CAPI int32_t U_EXPORT2 1765 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, 1766 UErrorCode *status) { 1767 1768 if (status == NULL || U_FAILURE(*status)) { 1769 return 0; 1770 } 1771 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { 1772 *status=U_ILLEGAL_ARGUMENT_ERROR; 1773 return 0; 1774 } 1775 1776 // 1777 // Check that the data header is for for dictionary data. 1778 // (Header contents are defined in genxxx.cpp) 1779 // 1780 const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4); 1781 if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */ 1782 pInfo->dataFormat[1]==0x72 && 1783 pInfo->dataFormat[2]==0x44 && 1784 pInfo->dataFormat[3]==0x63 && 1785 pInfo->formatVersion[0]==1 )) { 1786 udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n", 1787 pInfo->dataFormat[0], pInfo->dataFormat[1], 1788 pInfo->dataFormat[2], pInfo->dataFormat[3], 1789 pInfo->formatVersion[0]); 1790 *status=U_UNSUPPORTED_ERROR; 1791 return 0; 1792 } 1793 1794 // 1795 // Swap the data header. (This is the generic ICU Data Header, not the 1796 // CompactTrieHeader). This swap also conveniently gets us 1797 // the size of the ICU d.h., which lets us locate the start 1798 // of the RBBI specific data. 1799 // 1800 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status); 1801 1802 // 1803 // Get the CompactTrieHeader, and check that it appears to be OK. 1804 // 1805 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; 1806 const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes; 1807 uint32_t magic = ds->readUInt32(header->magic); 1808 if (magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic != COMPACT_TRIE_MAGIC_3 1809 || magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeaderV1) 1810 || magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeader)) 1811 { 1812 udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n"); 1813 *status=U_UNSUPPORTED_ERROR; 1814 return 0; 1815 } 1816 1817 // 1818 // Prefight operation? Just return the size 1819 // 1820 uint32_t totalSize = ds->readUInt32(header->size); 1821 int32_t sizeWithUData = (int32_t)totalSize + headerSize; 1822 if (length < 0) { 1823 return sizeWithUData; 1824 } 1825 1826 // 1827 // Check that length passed in is consistent with length from RBBI data header. 1828 // 1829 if (length < sizeWithUData) { 1830 udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n", 1831 totalSize); 1832 *status=U_INDEX_OUTOFBOUNDS_ERROR; 1833 return 0; 1834 } 1835 1836 // 1837 // Swap the Data. Do the data itself first, then the CompactTrieHeader, because 1838 // we need to reference the header to locate the data, and an 1839 // inplace swap of the header leaves it unusable. 1840 // 1841 uint8_t *outBytes = (uint8_t *)outData + headerSize; 1842 CompactTrieHeader *outputHeader = (CompactTrieHeader *)outBytes; 1843 1844 #if 0 1845 // 1846 // If not swapping in place, zero out the output buffer before starting. 1847 // 1848 if (inBytes != outBytes) { 1849 uprv_memset(outBytes, 0, totalSize); 1850 } 1851 1852 // We need to loop through all the nodes in the offset table, and swap each one. 1853 uint32_t nodeCount, rootId; 1854 if(header->magic == COMPACT_TRIE_MAGIC_1) { 1855 nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount); 1856 rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root); 1857 } else { 1858 nodeCount = ds->readUInt32(header->nodeCount); 1859 rootId = ds->readUInt32(header->root); 1860 } 1861 1862 // Skip node 0, which should always be 0. 1863 for (uint32_t i = 1; i < nodeCount; ++i) { 1864 uint32_t nodeOff = ds->readUInt32(header->offsets[i]); 1865 const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + nodeOff); 1866 CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff); 1867 uint16_t flagscount = ds->readUInt16(inNode->flagscount); 1868 uint16_t itemCount = getCount(inNode); 1869 //uint16_t itemCount = flagscount & kCountMask; 1870 ds->writeUInt16(&outNode->flagscount, flagscount); 1871 if (itemCount > 0) { 1872 uint16_t overflow = 0; //number of extra uint16_ts needed to be swapped 1873 if (flagscount & kVerticalNode && i != rootId) { 1874 if(flagscount & kEqualOverflows){ 1875 // include overflow bits 1876 overflow += 1; 1877 } 1878 if (header->magic == COMPACT_TRIE_MAGIC_3 && flagscount & kEndsParentWord) { 1879 //include values 1880 overflow += 1; 1881 } 1882 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), 1883 (itemCount + overflow)*sizeof(uint16_t), 1884 outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status); 1885 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal); 1886 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal)); 1887 } 1888 else { 1889 const CompactTrieHorizontalNode *inHNode = (const CompactTrieHorizontalNode *)inNode; 1890 CompactTrieHorizontalNode *outHNode = (CompactTrieHorizontalNode *)outNode; 1891 for (int j = 0; j < itemCount; ++j) { 1892 uint16_t word = ds->readUInt16(inHNode->entries[j].ch); 1893 ds->writeUInt16(&outHNode->entries[j].ch, word); 1894 word = ds->readUInt16(inHNode->entries[j].equal); 1895 ds->writeUInt16(&outHNode->entries[j].equal, word); 1896 } 1897 1898 // swap overflow/value information 1899 if(flagscount & kEqualOverflows){ 1900 overflow += (itemCount + 3) / 4; 1901 } 1902 1903 if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && flagscount & kEndsParentWord) { 1904 //include values 1905 overflow += 1; 1906 } 1907 1908 uint16_t *inOverflow = (uint16_t *) &inHNode->entries[itemCount]; 1909 uint16_t *outOverflow = (uint16_t *) &outHNode->entries[itemCount]; 1910 for(int j = 0; j<overflow; j++){ 1911 uint16_t extraInfo = ds->readUInt16(*inOverflow); 1912 ds->writeUInt16(outOverflow, extraInfo); 1913 1914 inOverflow++; 1915 outOverflow++; 1916 } 1917 } 1918 } 1919 } 1920 #endif 1921 1922 // Swap the header 1923 ds->writeUInt32(&outputHeader->size, totalSize); 1924 ds->writeUInt32(&outputHeader->magic, magic); 1925 1926 uint32_t nodeCount; 1927 uint32_t offsetPos; 1928 if (header->magic == COMPACT_TRIE_MAGIC_1) { 1929 CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *)header; 1930 CompactTrieHeaderV1 *outputHeaderV1 = (CompactTrieHeaderV1 *)outputHeader; 1931 1932 nodeCount = ds->readUInt16(headerV1->nodeCount); 1933 ds->writeUInt16(&outputHeaderV1->nodeCount, nodeCount); 1934 uint16_t root = ds->readUInt16(headerV1->root); 1935 ds->writeUInt16(&outputHeaderV1->root, root); 1936 offsetPos = offsetof(CompactTrieHeaderV1,offsets); 1937 } else { 1938 nodeCount = ds->readUInt32(header->nodeCount); 1939 ds->writeUInt32(&outputHeader->nodeCount, nodeCount); 1940 uint32_t root = ds->readUInt32(header->root); 1941 ds->writeUInt32(&outputHeader->root, root); 1942 offsetPos = offsetof(CompactTrieHeader,offsets); 1943 } 1944 1945 // All the data in all the nodes consist of 16 bit items. Swap them all at once. 1946 uint32_t nodesOff = offsetPos+((uint32_t)nodeCount*sizeof(uint32_t)); 1947 ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status); 1948 1949 //swap offsets 1950 ds->swapArray32(ds, inBytes+offsetPos, 1951 sizeof(uint32_t)*(uint32_t)nodeCount, 1952 outBytes+offsetPos, status); 1953 1954 return sizeWithUData; 1955 } 1956 1957 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1958