Home | History | Annotate | Download | only in common
      1 /**
      2  *******************************************************************************
      3  * Copyright (C) 2006-2008, International Business Machines Corporation        *
      4  * and others. All Rights Reserved.                                            *
      5  *******************************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_BREAK_ITERATION
     11 
     12 #include "triedict.h"
     13 #include "unicode/chariter.h"
     14 #include "unicode/uchriter.h"
     15 #include "unicode/strenum.h"
     16 #include "unicode/uenum.h"
     17 #include "unicode/udata.h"
     18 #include "cmemory.h"
     19 #include "udataswp.h"
     20 #include "uvector.h"
     21 #include "uvectr32.h"
     22 #include "uarrsort.h"
     23 #include "hash.h"
     24 
     25 //#define DEBUG_TRIE_DICT 1
     26 
     27 #ifdef DEBUG_TRIE_DICT
     28 #include <sys/times.h>
     29 #include <limits.h>
     30 #include <stdio.h>
     31 #include <time.h>
     32 #ifndef CLK_TCK
     33 #define CLK_TCK      CLOCKS_PER_SEC
     34 #endif
     35 
     36 #endif
     37 
     38 U_NAMESPACE_BEGIN
     39 
     40 /*******************************************************************
     41  * TrieWordDictionary
     42  */
     43 
     44 TrieWordDictionary::TrieWordDictionary() {
     45 }
     46 
     47 TrieWordDictionary::~TrieWordDictionary() {
     48 }
     49 
     50 /*******************************************************************
     51  * MutableTrieDictionary
     52  */
     53 
     54 //#define MAX_VALUE 65535
     55 
     56 // forward declaration
     57 inline uint16_t scaleLogProbabilities(double logprob);
     58 
     59 // Node structure for the ternary, uncompressed trie
     60 struct TernaryNode : public UMemory {
     61     UChar       ch;         // UTF-16 code unit
     62     uint16_t    flags;      // Flag word
     63     TernaryNode *low;       // Less-than link
     64     TernaryNode *equal;     // Equal link
     65     TernaryNode *high;      // Greater-than link
     66 
     67     TernaryNode(UChar uc);
     68     ~TernaryNode();
     69 };
     70 
     71 enum MutableTrieNodeFlags {
     72     kEndsWord = 0x0001      // This node marks the end of a valid word
     73 };
     74 
     75 inline
     76 TernaryNode::TernaryNode(UChar uc) {
     77     ch = uc;
     78     flags = 0;
     79     low = NULL;
     80     equal = NULL;
     81     high = NULL;
     82 }
     83 
     84 // Not inline since it's recursive
     85 TernaryNode::~TernaryNode() {
     86     delete low;
     87     delete equal;
     88     delete high;
     89 }
     90 
     91 MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status,
     92                                               UBool containsValue /* = FALSE */  ) {
     93     // Start the trie off with something. Having the root node already present
     94     // cuts a special case out of the search/insertion functions.
     95     // Making it a median character cuts the worse case for searches from
     96     // 4x a balanced trie to 2x a balanced trie. It's best to choose something
     97     // that starts a word that is midway in the list.
     98     fTrie = new TernaryNode(median);
     99     if (fTrie == NULL) {
    100         status = U_MEMORY_ALLOCATION_ERROR;
    101     }
    102     fIter = utext_openUChars(NULL, NULL, 0, &status);
    103     if (U_SUCCESS(status) && fIter == NULL) {
    104         status = U_MEMORY_ALLOCATION_ERROR;
    105     }
    106 
    107     fValued = containsValue;
    108 }
    109 
    110 MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status,
    111                                               UBool containsValue /* = false */ ) {
    112     fTrie = NULL;
    113     fIter = utext_openUChars(NULL, NULL, 0, &status);
    114     if (U_SUCCESS(status) && fIter == NULL) {
    115         status = U_MEMORY_ALLOCATION_ERROR;
    116     }
    117 
    118     fValued = containsValue;
    119 }
    120 
    121 MutableTrieDictionary::~MutableTrieDictionary() {
    122     delete fTrie;
    123     utext_close(fIter);
    124 }
    125 
    126 int32_t
    127 MutableTrieDictionary::search( UText *text,
    128                                int32_t maxLength,
    129                                int32_t *lengths,
    130                                int &count,
    131                                int limit,
    132                                TernaryNode *&parent,
    133                                UBool &pMatched,
    134                                uint16_t *values /*=NULL*/) const {
    135     // TODO: current implementation works in UTF-16 space
    136     const TernaryNode *up = NULL;
    137     const TernaryNode *p = fTrie;
    138     int mycount = 0;
    139     pMatched = TRUE;
    140     int i;
    141 
    142     if (!fValued) {
    143         values = NULL;
    144     }
    145 
    146     UChar uc = utext_current32(text);
    147     for (i = 0; i < maxLength && p != NULL; ++i) {
    148         while (p != NULL) {
    149             if (uc < p->ch) {
    150                 up = p;
    151                 p = p->low;
    152             }
    153             else if (uc == p->ch) {
    154                 break;
    155             }
    156             else {
    157                 up = p;
    158                 p = p->high;
    159             }
    160         }
    161         if (p == NULL) {
    162             pMatched = FALSE;
    163             break;
    164         }
    165         // Must be equal to get here
    166         if (limit > 0 && (p->flags > 0)) {
    167             //is there a more efficient way to add values? ie. remove if stmt
    168             if(values != NULL) {
    169                 values[mycount] = p->flags;
    170             }
    171             lengths[mycount++] = i+1;
    172             --limit;
    173         }
    174         up = p;
    175         p = p->equal;
    176         uc = utext_next32(text);
    177         uc = utext_current32(text);
    178     }
    179 
    180     // Note that there is no way to reach here with up == 0 unless
    181     // maxLength is 0 coming in.
    182     parent = (TernaryNode *)up;
    183     count = mycount;
    184     return i;
    185 }
    186 
    187 void
    188 MutableTrieDictionary::addWord( const UChar *word,
    189                                 int32_t length,
    190                                 UErrorCode &status,
    191                                 uint16_t value /* = 0 */ ) {
    192     // dictionary cannot store zero values, would interfere with flags
    193     if (length <= 0 || (!fValued && value > 0) || (fValued && value == 0)) {
    194         status = U_ILLEGAL_ARGUMENT_ERROR;
    195         return;
    196     }
    197 
    198     TernaryNode *parent;
    199     UBool pMatched;
    200     int count;
    201     fIter = utext_openUChars(fIter, word, length, &status);
    202 
    203     int matched;
    204     matched = search(fIter, length, NULL, count, 0, parent, pMatched);
    205 
    206     while (matched++ < length) {
    207         UChar32 uc = utext_next32(fIter);  // TODO:  supplementary support?
    208         U_ASSERT(uc != U_SENTINEL);
    209         TernaryNode *newNode = new TernaryNode(uc);
    210         if (newNode == NULL) {
    211             status = U_MEMORY_ALLOCATION_ERROR;
    212             return;
    213         }
    214         if (pMatched) {
    215             parent->equal = newNode;
    216         }
    217         else {
    218             pMatched = TRUE;
    219             if (uc < parent->ch) {
    220                 parent->low = newNode;
    221             }
    222             else {
    223                 parent->high = newNode;
    224             }
    225         }
    226         parent = newNode;
    227     }
    228 
    229     if(fValued && value > 0){
    230         parent->flags = value;
    231     } else {
    232         parent->flags |= kEndsWord;
    233     }
    234 }
    235 
    236 int32_t
    237 MutableTrieDictionary::matches( UText *text,
    238                                 int32_t maxLength,
    239                                 int32_t *lengths,
    240                                 int &count,
    241                                 int limit,
    242                                 uint16_t *values /*=NULL*/) const {
    243     TernaryNode *parent;
    244     UBool pMatched;
    245     return search(text, maxLength, lengths, count, limit, parent, pMatched, values);
    246 }
    247 
    248 // Implementation of iteration for MutableTrieDictionary
    249 class MutableTrieEnumeration : public StringEnumeration {
    250 private:
    251     UStack      fNodeStack;     // Stack of nodes to process
    252     UVector32   fBranchStack;   // Stack of which branch we are working on
    253     TernaryNode *fRoot;         // Root node
    254     enum StackBranch {
    255         kLessThan,
    256         kEqual,
    257         kGreaterThan,
    258         kDone
    259     };
    260 
    261 public:
    262     static UClassID U_EXPORT2 getStaticClassID(void);
    263     virtual UClassID getDynamicClassID(void) const;
    264 public:
    265     MutableTrieEnumeration(TernaryNode *root, UErrorCode &status)
    266         : fNodeStack(status), fBranchStack(status) {
    267         fRoot = root;
    268         fNodeStack.push(root, status);
    269         fBranchStack.push(kLessThan, status);
    270         unistr.remove();
    271     }
    272 
    273     virtual ~MutableTrieEnumeration() {
    274     }
    275 
    276     virtual StringEnumeration *clone() const {
    277         UErrorCode status = U_ZERO_ERROR;
    278         return new MutableTrieEnumeration(fRoot, status);
    279     }
    280 
    281     virtual const UnicodeString *snext(UErrorCode &status) {
    282         if (fNodeStack.empty() || U_FAILURE(status)) {
    283             return NULL;
    284         }
    285         TernaryNode *node = (TernaryNode *) fNodeStack.peek();
    286         StackBranch where = (StackBranch) fBranchStack.peeki();
    287         while (!fNodeStack.empty() && U_SUCCESS(status)) {
    288             UBool emit;
    289             UBool equal;
    290 
    291             switch (where) {
    292             case kLessThan:
    293                 if (node->low != NULL) {
    294                     fBranchStack.setElementAt(kEqual, fBranchStack.size()-1);
    295                     node = (TernaryNode *) fNodeStack.push(node->low, status);
    296                     where = (StackBranch) fBranchStack.push(kLessThan, status);
    297                     break;
    298                 }
    299             case kEqual:
    300                 emit = node->flags > 0;
    301                 equal = (node->equal != NULL);
    302                 // If this node should be part of the next emitted string, append
    303                 // the UChar to the string, and make sure we pop it when we come
    304                 // back to this node. The character should only be in the string
    305                 // for as long as we're traversing the equal subtree of this node
    306                 if (equal || emit) {
    307                     unistr.append(node->ch);
    308                     fBranchStack.setElementAt(kGreaterThan, fBranchStack.size()-1);
    309                 }
    310                 if (equal) {
    311                     node = (TernaryNode *) fNodeStack.push(node->equal, status);
    312                     where = (StackBranch) fBranchStack.push(kLessThan, status);
    313                 }
    314                 if (emit) {
    315                     return &unistr;
    316                 }
    317                 if (equal) {
    318                     break;
    319                 }
    320             case kGreaterThan:
    321                 // If this node's character is in the string, remove it.
    322                 if (node->equal != NULL || node->flags > 0) {
    323                     unistr.truncate(unistr.length()-1);
    324                 }
    325                 if (node->high != NULL) {
    326                     fBranchStack.setElementAt(kDone, fBranchStack.size()-1);
    327                     node = (TernaryNode *) fNodeStack.push(node->high, status);
    328                     where = (StackBranch) fBranchStack.push(kLessThan, status);
    329                     break;
    330                 }
    331             case kDone:
    332                 fNodeStack.pop();
    333                 fBranchStack.popi();
    334                 node = (TernaryNode *) fNodeStack.peek();
    335                 where = (StackBranch) fBranchStack.peeki();
    336                 break;
    337             default:
    338                 return NULL;
    339             }
    340         }
    341         return NULL;
    342     }
    343 
    344     // Very expensive, but this should never be used.
    345     virtual int32_t count(UErrorCode &status) const {
    346         MutableTrieEnumeration counter(fRoot, status);
    347         int32_t result = 0;
    348         while (counter.snext(status) != NULL && U_SUCCESS(status)) {
    349             ++result;
    350         }
    351         return result;
    352     }
    353 
    354     virtual void reset(UErrorCode &status) {
    355         fNodeStack.removeAllElements();
    356         fBranchStack.removeAllElements();
    357         fNodeStack.push(fRoot, status);
    358         fBranchStack.push(kLessThan, status);
    359         unistr.remove();
    360     }
    361 };
    362 
    363 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(MutableTrieEnumeration)
    364 
    365 StringEnumeration *
    366 MutableTrieDictionary::openWords( UErrorCode &status ) const {
    367     if (U_FAILURE(status)) {
    368         return NULL;
    369     }
    370     return new MutableTrieEnumeration(fTrie, status);
    371 }
    372 
    373 /*******************************************************************
    374  * CompactTrieDictionary
    375  */
    376 
    377 //TODO further optimization:
    378 // minimise size of trie with logprobs by storing values
    379 // for terminal nodes directly in offsets[]
    380 // --> calculating from next offset *might* be simpler, but would have to add
    381 // one last offset for logprob of last node
    382 // --> if calculate from current offset, need to factor in possible overflow
    383 // as well.
    384 // idea: store in offset, set first bit to indicate logprob storage-->won't
    385 // have to access additional node
    386 
    387 // {'Dic', 1}, version 1: uses old header, no values
    388 #define COMPACT_TRIE_MAGIC_1 0x44696301
    389 // version 2: uses new header (more than 2^16 nodes), no values
    390 #define COMPACT_TRIE_MAGIC_2 0x44696302
    391 // version 3: uses new header, includes values
    392 #define COMPACT_TRIE_MAGIC_3 0x44696303
    393 
    394 struct CompactTrieHeader {
    395     uint32_t        size;           // Size of the data in bytes
    396     uint32_t        magic;          // Magic number (including version)
    397     uint32_t        nodeCount;      // Number of entries in offsets[]
    398     uint32_t        root;           // Node number of the root node
    399     uint32_t        offsets[1];     // Offsets to nodes from start of data
    400 };
    401 
    402 // old version of CompactTrieHeader kept for backwards compatibility
    403 struct CompactTrieHeaderV1 {
    404     uint32_t        size;           // Size of the data in bytes
    405     uint32_t        magic;          // Magic number (including version)
    406     uint16_t        nodeCount;      // Number of entries in offsets[]
    407     uint16_t        root;           // Node number of the root node
    408     uint32_t        offsets[1];     // Offsets to nodes from start of data
    409 };
    410 
    411 // Helper class for managing CompactTrieHeader and CompactTrieHeaderV1
    412 struct CompactTrieInfo {
    413     uint32_t        size;           // Size of the data in bytes
    414     uint32_t        magic;          // Magic number (including version)
    415     uint32_t        nodeCount;      // Number of entries in offsets[]
    416     uint32_t        root;           // Node number of the root node
    417     uint32_t        *offsets;       // Offsets to nodes from start of data
    418     uint8_t         *address;       // pointer to header bytes in memory
    419 
    420     CompactTrieInfo(const void *data, UErrorCode &status){
    421         CompactTrieHeader *header = (CompactTrieHeader *) data;
    422         if (header->magic != COMPACT_TRIE_MAGIC_1 &&
    423                 header->magic != COMPACT_TRIE_MAGIC_2 &&
    424                 header->magic != COMPACT_TRIE_MAGIC_3) {
    425             status = U_ILLEGAL_ARGUMENT_ERROR;
    426         } else {
    427             size = header->size;
    428             magic = header->magic;
    429 
    430             if (header->magic == COMPACT_TRIE_MAGIC_1) {
    431                 CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *) header;
    432                 nodeCount = headerV1->nodeCount;
    433                 root = headerV1->root;
    434                 offsets = &(headerV1->offsets[0]);
    435                 address = (uint8_t *)headerV1;
    436             } else {
    437                 nodeCount = header->nodeCount;
    438                 root = header->root;
    439                 offsets = &(header->offsets[0]);
    440                 address = (uint8_t *)header;
    441             }
    442         }
    443     }
    444 
    445     ~CompactTrieInfo(){}
    446 };
    447 
    448 // Note that to avoid platform-specific alignment issues, all members of the node
    449 // structures should be the same size, or should contain explicit padding to
    450 // natural alignment boundaries.
    451 
    452 // We can't use a bitfield for the flags+count field, because the layout of those
    453 // is not portable. 12 bits of count allows for up to 4096 entries in a node.
    454 struct CompactTrieNode {
    455     uint16_t        flagscount;     // Count of sub-entries, plus flags
    456 };
    457 
    458 enum CompactTrieNodeFlags {
    459     kVerticalNode   = 0x1000,       // This is a vertical node
    460     kParentEndsWord = 0x2000,       // The node whose equal link points to this ends a word
    461     kExceedsCount   = 0x4000,       // new MSB for count >= 4096, originally kReservedFlag1
    462     kEqualOverflows = 0x8000,       // Links to nodeIDs > 2^16, orig. kReservedFlag2
    463     kCountMask      = 0x0FFF,       // The count portion of flagscount
    464     kFlagMask       = 0xF000,       // The flags portion of flagscount
    465     kRootCountMask  = 0x7FFF        // The count portion of flagscount in the root node
    466 
    467     //offset flags:
    468     //kOffsetContainsValue = 0x80000000       // Offset contains value for parent node
    469 };
    470 
    471 // The two node types are distinguished by the kVerticalNode flag.
    472 
    473 struct CompactTrieHorizontalEntry {
    474     uint16_t        ch;             // UChar
    475     uint16_t        equal;          // Equal link node index
    476 };
    477 
    478 // We don't use inheritance here because C++ does not guarantee that the
    479 // base class comes first in memory!!
    480 
    481 struct CompactTrieHorizontalNode {
    482     uint16_t        flagscount;     // Count of sub-entries, plus flags
    483     CompactTrieHorizontalEntry      entries[1];
    484 };
    485 
    486 struct CompactTrieVerticalNode {
    487     uint16_t        flagscount;     // Count of sub-entries, plus flags
    488     uint16_t        equal;          // Equal link node index
    489     uint16_t        chars[1];       // Code units
    490 };
    491 
    492 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj,
    493                                                 UErrorCode &status )
    494 : fUData(dataObj)
    495 {
    496     fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));
    497     *fInfo = CompactTrieInfo(udata_getMemory(dataObj), status);
    498     fOwnData = FALSE;
    499 }
    500 
    501 CompactTrieDictionary::CompactTrieDictionary( const void *data,
    502                                                 UErrorCode &status )
    503 : fUData(NULL)
    504 {
    505     fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));
    506     *fInfo = CompactTrieInfo(data, status);
    507     fOwnData = FALSE;
    508 }
    509 
    510 CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict,
    511                                                 UErrorCode &status )
    512 : fUData(NULL)
    513 {
    514     const CompactTrieHeader* header = compactMutableTrieDictionary(dict, status);
    515     if (U_SUCCESS(status)) {
    516         fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));
    517         *fInfo = CompactTrieInfo(header, status);
    518     }
    519 
    520     fOwnData = !U_FAILURE(status);
    521 }
    522 
    523 CompactTrieDictionary::~CompactTrieDictionary() {
    524     if (fOwnData) {
    525         uprv_free((void *)(fInfo->address));
    526     }
    527     uprv_free((void *)fInfo);
    528 
    529     if (fUData) {
    530         udata_close(fUData);
    531     }
    532 }
    533 
    534 UBool CompactTrieDictionary::getValued() const{
    535     return fInfo->magic == COMPACT_TRIE_MAGIC_3;
    536 }
    537 
    538 uint32_t
    539 CompactTrieDictionary::dataSize() const {
    540     return fInfo->size;
    541 }
    542 
    543 const void *
    544 CompactTrieDictionary::data() const {
    545     return fInfo->address;
    546 }
    547 
    548 //This function finds the address of a node for us, given its node ID
    549 static inline const CompactTrieNode *
    550 getCompactNode(const CompactTrieInfo *info, uint32_t node) {
    551     if(node < info->root-1) {
    552         return (const CompactTrieNode *)(&info->offsets[node]);
    553     } else {
    554         return (const CompactTrieNode *)(info->address + info->offsets[node]);
    555     }
    556 }
    557 
    558 //this version of getCompactNode is currently only used in compactMutableTrieDictionary()
    559 static inline const CompactTrieNode *
    560 getCompactNode(const CompactTrieHeader *header, uint32_t node) {
    561     if(node < header->root-1) {
    562         return (const CompactTrieNode *)(&header->offsets[node]);
    563     } else {
    564         return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]);
    565     }
    566 }
    567 
    568 
    569 /**
    570  * Calculates the number of links in a node
    571  * @node The specified node
    572  */
    573 static inline const uint16_t
    574 getCount(const CompactTrieNode *node){
    575     return (node->flagscount & kCountMask);
    576     //use the code below if number of links ever exceed 4096
    577     //return (node->flagscount & kCountMask) + ((node->flagscount & kExceedsCount) >> 2);
    578 }
    579 
    580 /**
    581  * calculates an equal link node ID of a horizontal node
    582  * @hnode The horizontal node containing the equal link
    583  * @param index The index into hnode->entries[]
    584  * @param nodeCount The length of hnode->entries[]
    585  */
    586 static inline uint32_t calcEqualLink(const CompactTrieVerticalNode *vnode){
    587     if(vnode->flagscount & kEqualOverflows){
    588         // treat overflow bits as an extension of chars[]
    589         uint16_t *overflow = (uint16_t *) &vnode->chars[getCount((CompactTrieNode*)vnode)];
    590         return vnode->equal + (((uint32_t)*overflow) << 16);
    591     }else{
    592         return vnode->equal;
    593     }
    594 }
    595 
    596 /**
    597  * calculates an equal link node ID of a horizontal node
    598  * @hnode The horizontal node containing the equal link
    599  * @param index The index into hnode->entries[]
    600  * @param nodeCount The length of hnode->entries[]
    601  */
    602 static inline uint32_t calcEqualLink(const CompactTrieHorizontalNode *hnode, uint16_t index, uint16_t nodeCount){
    603     if(hnode->flagscount & kEqualOverflows){
    604         //set overflow to point to the uint16_t containing the overflow bits
    605         uint16_t *overflow = (uint16_t *) &hnode->entries[nodeCount];
    606         overflow += index/4;
    607         uint16_t extraBits = (*overflow >> (3 - (index % 4)) * 4) % 0x10;
    608         return hnode->entries[index].equal + (((uint32_t)extraBits) << 16);
    609     } else {
    610         return hnode->entries[index].equal;
    611     }
    612 }
    613 
    614 /**
    615  * Returns the value stored in the specified node which is associated with its
    616  * parent node.
    617  * TODO: how to tell that value is stored in node or in offset? check whether
    618  * node ID < fInfo->root!
    619  */
    620 static inline uint16_t getValue(const CompactTrieHorizontalNode *hnode){
    621     uint16_t count = getCount((CompactTrieNode *)hnode);
    622     uint16_t overflowSize = 0; //size of node ID overflow storage in bytes
    623 
    624     if(hnode->flagscount & kEqualOverflows)
    625         overflowSize = (count + 3) / 4 * sizeof(uint16_t);
    626     return *((uint16_t *)((uint8_t *)&hnode->entries[count] + overflowSize));
    627 }
    628 
    629 static inline uint16_t getValue(const CompactTrieVerticalNode *vnode){
    630     // calculate size of total node ID overflow storage in bytes
    631     uint16_t overflowSize = (vnode->flagscount & kEqualOverflows)? sizeof(uint16_t) : 0;
    632     return *((uint16_t *)((uint8_t *)&vnode->chars[getCount((CompactTrieNode *)vnode)] + overflowSize));
    633 }
    634 
    635 static inline uint16_t getValue(const CompactTrieNode *node){
    636     if(node->flagscount & kVerticalNode)
    637         return getValue((const CompactTrieVerticalNode *)node);
    638     else
    639         return getValue((const CompactTrieHorizontalNode *)node);
    640 }
    641 
    642 //returns index of match in CompactTrieHorizontalNode.entries[] using binary search
    643 inline int16_t
    644 searchHorizontalEntries(const CompactTrieHorizontalEntry *entries,
    645         UChar uc, uint16_t nodeCount){
    646     int low = 0;
    647     int high = nodeCount-1;
    648     int middle;
    649     while (high >= low) {
    650         middle = (high+low)/2;
    651         if (uc == entries[middle].ch) {
    652             return middle;
    653         }
    654         else if (uc < entries[middle].ch) {
    655             high = middle-1;
    656         }
    657         else {
    658             low = middle+1;
    659         }
    660     }
    661 
    662     return -1;
    663 }
    664 
    665 int32_t
    666 CompactTrieDictionary::matches( UText *text,
    667                                 int32_t maxLength,
    668                                 int32_t *lengths,
    669                                 int &count,
    670                                 int limit,
    671                                 uint16_t *values /*= NULL*/) const {
    672     if (fInfo->magic == COMPACT_TRIE_MAGIC_2)
    673         values = NULL;
    674 
    675     // TODO: current implementation works in UTF-16 space
    676     const CompactTrieNode *node = getCompactNode(fInfo, fInfo->root);
    677     int mycount = 0;
    678 
    679     UChar uc = utext_current32(text);
    680     int i = 0;
    681 
    682     // handle root node with only kEqualOverflows flag: assume horizontal node without parent
    683     if(node != NULL){
    684         const CompactTrieHorizontalNode *root = (const CompactTrieHorizontalNode *) node;
    685         int index = searchHorizontalEntries(root->entries, uc, root->flagscount & kRootCountMask);
    686         if(index > -1){
    687             node = getCompactNode(fInfo, calcEqualLink(root, index, root->flagscount & kRootCountMask));
    688             utext_next32(text);
    689             uc = utext_current32(text);
    690             ++i;
    691         }else{
    692             node = NULL;
    693         }
    694     }
    695 
    696     while (node != NULL) {
    697         // Check if the node we just exited ends a word
    698         if (limit > 0 && (node->flagscount & kParentEndsWord)) {
    699             if(values != NULL){
    700                 values[mycount] = getValue(node);
    701             }
    702             lengths[mycount++] = i;
    703             --limit;
    704         }
    705         // Check that we haven't exceeded the maximum number of input characters.
    706         // We have to do that here rather than in the while condition so that
    707         // we can check for ending a word, above.
    708         if (i >= maxLength) {
    709             break;
    710         }
    711 
    712         int nodeCount = getCount(node);
    713         if (nodeCount == 0) {
    714             // Special terminal node; return now
    715             break;
    716         }
    717         if (node->flagscount & kVerticalNode) {
    718             // Vertical node; check all the characters in it
    719             const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;
    720             for (int j = 0; j < nodeCount && i < maxLength; ++j) {
    721                 if (uc != vnode->chars[j]) {
    722                     // We hit a non-equal character; return
    723                     goto exit;
    724                 }
    725                 utext_next32(text);
    726                 uc = utext_current32(text);
    727                 ++i;
    728             }
    729             // To get here we must have come through the whole list successfully;
    730             // go on to the next node. Note that a word cannot end in the middle
    731             // of a vertical node.
    732             node = getCompactNode(fInfo, calcEqualLink(vnode));
    733         }
    734         else {
    735             // Horizontal node; do binary search
    736             const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
    737             const CompactTrieHorizontalEntry *entries;
    738             entries = hnode->entries;
    739 
    740             int index = searchHorizontalEntries(entries, uc, nodeCount);
    741             if(index > -1){  //
    742                 // We hit a match; get the next node and next character
    743                 node = getCompactNode(fInfo, calcEqualLink(hnode, index, nodeCount));
    744                 utext_next32(text);
    745                 uc = utext_current32(text);
    746                 ++i;
    747             }else{
    748                 node = NULL;    // If we don't find a match, we'll fall out of the loop
    749             }
    750         }
    751     }
    752     exit:
    753     count = mycount;
    754     return i;
    755 }
    756 
    757 // Implementation of iteration for CompactTrieDictionary
    758 class CompactTrieEnumeration : public StringEnumeration {
    759 private:
    760     UVector32               fNodeStack;     // Stack of nodes to process
    761     UVector32               fIndexStack;    // Stack of where in node we are
    762     const CompactTrieInfo   *fInfo;         // Trie data
    763 
    764 public:
    765     static UClassID U_EXPORT2 getStaticClassID(void);
    766     virtual UClassID getDynamicClassID(void) const;
    767 public:
    768     CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status)
    769         : fNodeStack(status), fIndexStack(status) {
    770         fInfo = info;
    771         fNodeStack.push(info->root, status);
    772         fIndexStack.push(0, status);
    773         unistr.remove();
    774     }
    775 
    776     virtual ~CompactTrieEnumeration() {
    777     }
    778 
    779     virtual StringEnumeration *clone() const {
    780         UErrorCode status = U_ZERO_ERROR;
    781         return new CompactTrieEnumeration(fInfo, status);
    782     }
    783 
    784     virtual const UnicodeString * snext(UErrorCode &status);
    785 
    786     // Very expensive, but this should never be used.
    787     virtual int32_t count(UErrorCode &status) const {
    788         CompactTrieEnumeration counter(fInfo, status);
    789         int32_t result = 0;
    790         while (counter.snext(status) != NULL && U_SUCCESS(status)) {
    791             ++result;
    792         }
    793         return result;
    794     }
    795 
    796     virtual void reset(UErrorCode &status) {
    797         fNodeStack.removeAllElements();
    798         fIndexStack.removeAllElements();
    799         fNodeStack.push(fInfo->root, status);
    800         fIndexStack.push(0, status);
    801         unistr.remove();
    802     }
    803 };
    804 
    805 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompactTrieEnumeration)
    806 
    807 const UnicodeString *
    808 CompactTrieEnumeration::snext(UErrorCode &status) {
    809     if (fNodeStack.empty() || U_FAILURE(status)) {
    810         return NULL;
    811     }
    812     const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki());
    813     int where = fIndexStack.peeki();
    814     while (!fNodeStack.empty() && U_SUCCESS(status)) {
    815         int nodeCount;
    816 
    817         bool isRoot = fNodeStack.peeki() == static_cast<int32_t>(fInfo->root);
    818         if(isRoot){
    819             nodeCount = node->flagscount & kRootCountMask;
    820         } else {
    821             nodeCount = getCount(node);
    822         }
    823 
    824         UBool goingDown = FALSE;
    825         if (nodeCount == 0) {
    826             // Terminal node; go up immediately
    827             fNodeStack.popi();
    828             fIndexStack.popi();
    829             node = getCompactNode(fInfo, fNodeStack.peeki());
    830             where = fIndexStack.peeki();
    831         }
    832         else if ((node->flagscount & kVerticalNode) && !isRoot) {
    833             // Vertical node
    834             const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;
    835             if (where == 0) {
    836                 // Going down
    837                 unistr.append((const UChar *)vnode->chars, nodeCount);
    838                 fIndexStack.setElementAt(1, fIndexStack.size()-1);
    839                 node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnode), status));
    840                 where = fIndexStack.push(0, status);
    841                 goingDown = TRUE;
    842             }
    843             else {
    844                 // Going up
    845                 unistr.truncate(unistr.length()-nodeCount);
    846                 fNodeStack.popi();
    847                 fIndexStack.popi();
    848                 node = getCompactNode(fInfo, fNodeStack.peeki());
    849                 where = fIndexStack.peeki();
    850             }
    851         }
    852         else {
    853             // Horizontal node
    854             const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
    855             if (where > 0) {
    856                 // Pop previous char
    857                 unistr.truncate(unistr.length()-1);
    858             }
    859             if (where < nodeCount) {
    860                 // Push on next node
    861                 unistr.append((UChar)hnode->entries[where].ch);
    862                 fIndexStack.setElementAt(where+1, fIndexStack.size()-1);
    863                 node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnode, where, nodeCount), status));
    864                 where = fIndexStack.push(0, status);
    865                 goingDown = TRUE;
    866             }
    867             else {
    868                 // Going up
    869                 fNodeStack.popi();
    870                 fIndexStack.popi();
    871                 node = getCompactNode(fInfo, fNodeStack.peeki());
    872                 where = fIndexStack.peeki();
    873             }
    874         }
    875 
    876         // Check if the parent of the node we've just gone down to ends a
    877         // word. If so, return it.
    878         // The root node should never end up here.
    879         if (goingDown && (node->flagscount & kParentEndsWord)) {
    880             return &unistr;
    881         }
    882     }
    883     return NULL;
    884 }
    885 
    886 StringEnumeration *
    887 CompactTrieDictionary::openWords( UErrorCode &status ) const {
    888     if (U_FAILURE(status)) {
    889         return NULL;
    890     }
    891     return new CompactTrieEnumeration(fInfo, status);
    892 }
    893 
    894 //
    895 // Below here is all code related to converting a ternary trie to a compact trie
    896 // and back again
    897 //
    898 
    899 enum CompactTrieNodeType {
    900     kHorizontalType = 0,
    901     kVerticalType = 1,
    902     kValueType = 2
    903 };
    904 
    905 /**
    906  * The following classes (i.e. BuildCompactTrie*Node) are helper classes to
    907  * construct the compact trie by storing information for each node and later
    908  * writing the node to memory in a sequential format.
    909  */
    910 class BuildCompactTrieNode: public UMemory {
    911 public:
    912     UBool           fParentEndsWord;
    913     CompactTrieNodeType fNodeType;
    914     UBool           fHasDuplicate;
    915     UBool           fEqualOverflows;
    916     int32_t         fNodeID;
    917     UnicodeString   fChars;
    918     uint16_t        fValue;
    919 
    920 public:
    921     BuildCompactTrieNode(UBool parentEndsWord, CompactTrieNodeType nodeType,
    922             UStack &nodes, UErrorCode &status, uint16_t value = 0) {
    923         fParentEndsWord = parentEndsWord;
    924         fHasDuplicate = FALSE;
    925         fNodeType = nodeType;
    926         fEqualOverflows = FALSE;
    927         fNodeID = nodes.size();
    928         fValue = parentEndsWord? value : 0;
    929         nodes.push(this, status);
    930     }
    931 
    932     virtual ~BuildCompactTrieNode() {
    933     }
    934 
    935     virtual uint32_t size() {
    936         if(fValue > 0)
    937             return sizeof(uint16_t) * 2;
    938         else
    939             return sizeof(uint16_t);
    940     }
    941 
    942     virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*translate*/) {
    943         // Write flag/count
    944 
    945         // if this ever fails, a flag bit (i.e. kExceedsCount) will need to be
    946         // used as a 5th MSB.
    947         U_ASSERT(fChars.length() < 4096 || fNodeID == 2);
    948 
    949         *((uint16_t *)(bytes+offset)) = (fEqualOverflows? kEqualOverflows : 0) |
    950         ((fNodeID == 2)? (fChars.length() & kRootCountMask):
    951             (
    952                     (fChars.length() & kCountMask) |
    953                     //((fChars.length() << 2) & kExceedsCount) |
    954                     (fNodeType == kVerticalType ? kVerticalNode : 0) |
    955                     (fParentEndsWord ? kParentEndsWord : 0 )
    956             )
    957         );
    958         offset += sizeof(uint16_t);
    959     }
    960 
    961     virtual void writeValue(uint8_t *bytes, uint32_t &offset) {
    962         if(fValue > 0){
    963             *((uint16_t *)(bytes+offset)) = fValue;
    964             offset += sizeof(uint16_t);
    965         }
    966     }
    967 
    968 };
    969 
    970 /**
    971  * Stores value of parent terminating nodes that have no more subtries.
    972  */
    973 class BuildCompactTrieValueNode: public BuildCompactTrieNode {
    974 public:
    975     BuildCompactTrieValueNode(UStack &nodes, UErrorCode &status, uint16_t value)
    976         : BuildCompactTrieNode(TRUE, kValueType, nodes, status, value){
    977     }
    978 
    979     virtual ~BuildCompactTrieValueNode(){
    980     }
    981 
    982     virtual uint32_t size() {
    983         return sizeof(uint16_t) * 2;
    984     }
    985 
    986     virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
    987         // don't write value directly to memory but store it in offset to be written later
    988         //offset = fValue & kOffsetContainsValue;
    989         BuildCompactTrieNode::write(bytes, offset, translate);
    990         BuildCompactTrieNode::writeValue(bytes, offset);
    991     }
    992 };
    993 
    994 class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode {
    995  public:
    996     UStack          fLinks;
    997     UBool           fMayOverflow; //intermediate value for fEqualOverflows
    998 
    999  public:
   1000     BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0)
   1001     : BuildCompactTrieNode(parentEndsWord, kHorizontalType, nodes, status, value), fLinks(status) {
   1002         fMayOverflow = FALSE;
   1003     }
   1004 
   1005     virtual ~BuildCompactTrieHorizontalNode() {
   1006     }
   1007 
   1008     // It is impossible to know beforehand exactly how much space the node will
   1009     // need in memory before being written, because the node IDs in the equal
   1010     // links may or may not overflow after node coalescing. Therefore, this method
   1011     // returns the maximum size possible for the node.
   1012     virtual uint32_t size() {
   1013         uint32_t estimatedSize = offsetof(CompactTrieHorizontalNode,entries) +
   1014         (fChars.length()*sizeof(CompactTrieHorizontalEntry));
   1015 
   1016         if(fValue > 0)
   1017             estimatedSize += sizeof(uint16_t);
   1018 
   1019         //estimate extra space needed to store overflow for node ID links
   1020         //may be more than what is actually needed
   1021         for(int i=0; i < fChars.length(); i++){
   1022             if(((BuildCompactTrieNode *)fLinks[i])->fNodeID > 0xFFFF){
   1023                 fMayOverflow = TRUE;
   1024                 break;
   1025             }
   1026         }
   1027         if(fMayOverflow) // added space for overflow should be same as ceil(fChars.length()/4) * sizeof(uint16_t)
   1028             estimatedSize += (sizeof(uint16_t) * fChars.length() + 2)/4;
   1029 
   1030         return estimatedSize;
   1031     }
   1032 
   1033     virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
   1034         int32_t count = fChars.length();
   1035 
   1036         //if largest nodeID > 2^16, set flag
   1037         //large node IDs are more likely to be at the back of the array
   1038         for (int32_t i = count-1; i >= 0; --i) {
   1039             if(translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) > 0xFFFF){
   1040                 fEqualOverflows = TRUE;
   1041                 break;
   1042             }
   1043         }
   1044 
   1045         BuildCompactTrieNode::write(bytes, offset, translate);
   1046 
   1047         // write entries[] to memory
   1048         for (int32_t i = 0; i < count; ++i) {
   1049             CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(bytes+offset);
   1050             entry->ch = fChars[i];
   1051             entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID);
   1052 #ifdef DEBUG_TRIE_DICT
   1053 
   1054             if ((entry->equal == 0) && !fEqualOverflows) {
   1055                 fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps to physical node zero\n",
   1056                         i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);
   1057             }
   1058 #endif
   1059             offset += sizeof(CompactTrieHorizontalEntry);
   1060         }
   1061 
   1062         // append extra bits of equal nodes to end if fEqualOverflows
   1063         if (fEqualOverflows) {
   1064             uint16_t leftmostBits = 0;
   1065             for (int16_t i = 0; i < count; i++) {
   1066                 leftmostBits = (leftmostBits << 4) | getLeftmostBits(translate, i);
   1067 
   1068                 // write filled uint16_t to memory
   1069                 if(i % 4 == 3){
   1070                     *((uint16_t *)(bytes+offset)) = leftmostBits;
   1071                     leftmostBits = 0;
   1072                     offset += sizeof(uint16_t);
   1073                 }
   1074             }
   1075 
   1076             // pad last uint16_t with zeroes if necessary
   1077             int remainder = count % 4;
   1078             if (remainder > 0) {
   1079                 *((uint16_t *)(bytes+offset)) = (leftmostBits << (16 - 4 * remainder));
   1080                 offset += sizeof(uint16_t);
   1081             }
   1082         }
   1083 
   1084         BuildCompactTrieNode::writeValue(bytes, offset);
   1085     }
   1086 
   1087     // returns leftmost bits of physical node link
   1088     uint16_t getLeftmostBits(const UVector32 &translate, uint32_t i){
   1089         uint16_t leftmostBits = (uint16_t) (translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) >> 16);
   1090 #ifdef DEBUG_TRIE_DICT
   1091         if (leftmostBits > 0xF) {
   1092             fprintf(stderr, "ERROR: horizontal link %d, logical node %d exceeds maximum possible node ID value\n",
   1093                     i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);
   1094         }
   1095 #endif
   1096         return leftmostBits;
   1097     }
   1098 
   1099     void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) {
   1100         fChars.append(ch);
   1101         fLinks.push(link, status);
   1102     }
   1103 
   1104 };
   1105 
   1106 class BuildCompactTrieVerticalNode: public BuildCompactTrieNode {
   1107 public:
   1108     BuildCompactTrieNode    *fEqual;
   1109 
   1110 public:
   1111     BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0)
   1112     : BuildCompactTrieNode(parentEndsWord, kVerticalType, nodes, status, value) {
   1113         fEqual = NULL;
   1114     }
   1115 
   1116     virtual ~BuildCompactTrieVerticalNode() {
   1117     }
   1118 
   1119     // Returns the maximum possible size of this node. See comment in
   1120     // BuildCompactTrieHorizontal node for more information.
   1121     virtual uint32_t size() {
   1122         uint32_t estimatedSize = offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t));
   1123         if(fValue > 0){
   1124             estimatedSize += sizeof(uint16_t);
   1125         }
   1126 
   1127         if(fEqual->fNodeID > 0xFFFF){
   1128             estimatedSize += sizeof(uint16_t);
   1129         }
   1130         return estimatedSize;
   1131     }
   1132 
   1133     virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
   1134         CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset);
   1135         fEqualOverflows = (translate.elementAti(fEqual->fNodeID) > 0xFFFF);
   1136         BuildCompactTrieNode::write(bytes, offset, translate);
   1137         node->equal = translate.elementAti(fEqual->fNodeID);
   1138         offset += sizeof(node->equal);
   1139 #ifdef DEBUG_TRIE_DICT
   1140         if ((node->equal == 0) && !fEqualOverflows) {
   1141             fprintf(stderr, "ERROR: vertical link, logical node %d maps to physical node zero\n",
   1142                     fEqual->fNodeID);
   1143         }
   1144 #endif
   1145         fChars.extract(0, fChars.length(), (UChar *)node->chars);
   1146         offset += sizeof(UChar)*fChars.length();
   1147 
   1148         // append 16 bits of to end for equal node if fEqualOverflows
   1149         if (fEqualOverflows) {
   1150             *((uint16_t *)(bytes+offset)) = (translate.elementAti(fEqual->fNodeID) >> 16);
   1151             offset += sizeof(uint16_t);
   1152         }
   1153 
   1154         BuildCompactTrieNode::writeValue(bytes, offset);
   1155     }
   1156 
   1157     void addChar(UChar ch) {
   1158         fChars.append(ch);
   1159     }
   1160 
   1161     void setLink(BuildCompactTrieNode *node) {
   1162         fEqual = node;
   1163     }
   1164 
   1165 };
   1166 
   1167 // Forward declaration
   1168 static void walkHorizontal(const TernaryNode *node,
   1169                             BuildCompactTrieHorizontalNode *building,
   1170                             UStack &nodes,
   1171                             UErrorCode &status,
   1172                             Hashtable *values);
   1173 
   1174 // Convert one TernaryNode into a BuildCompactTrieNode. Uses recursion.
   1175 
   1176 static BuildCompactTrieNode *
   1177 compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes,
   1178         UErrorCode &status, Hashtable *values = NULL, uint16_t parentValue = 0) {
   1179     if (U_FAILURE(status)) {
   1180         return NULL;
   1181     }
   1182     BuildCompactTrieNode *result = NULL;
   1183     UBool horizontal = (node->low != NULL || node->high != NULL);
   1184     if (horizontal) {
   1185         BuildCompactTrieHorizontalNode *hResult;
   1186         if(values != NULL){
   1187             hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status, parentValue);
   1188         } else {
   1189             hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status);
   1190         }
   1191 
   1192         if (hResult == NULL) {
   1193             status = U_MEMORY_ALLOCATION_ERROR;
   1194             return NULL;
   1195         }
   1196         if (U_SUCCESS(status)) {
   1197             walkHorizontal(node, hResult, nodes, status, values);
   1198             result = hResult;
   1199         }
   1200     }
   1201     else {
   1202         BuildCompactTrieVerticalNode *vResult;
   1203         if(values != NULL){
   1204             vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status, parentValue);
   1205         } else {
   1206             vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status);
   1207         }
   1208 
   1209         if (vResult == NULL) {
   1210             status = U_MEMORY_ALLOCATION_ERROR;
   1211             return NULL;
   1212         }
   1213         else if (U_SUCCESS(status)) {
   1214             uint16_t   value = 0;
   1215             UBool endsWord = FALSE;
   1216             // Take up nodes until we end a word, or hit a node with < or > links
   1217             do {
   1218                 vResult->addChar(node->ch);
   1219                 value = node->flags;
   1220                 endsWord = value > 0;
   1221                 node = node->equal;
   1222             }
   1223             while(node != NULL && !endsWord && node->low == NULL && node->high == NULL);
   1224 
   1225             if (node == NULL) {
   1226                 if (!endsWord) {
   1227                     status = U_ILLEGAL_ARGUMENT_ERROR;  // Corrupt input trie
   1228                 }
   1229                 else if(values != NULL){
   1230                     UnicodeString key(value); //store value as a single-char UnicodeString
   1231                     BuildCompactTrieValueNode *link = (BuildCompactTrieValueNode *) values->get(key);
   1232                     if(link == NULL){
   1233                         link = new BuildCompactTrieValueNode(nodes, status, value); //take out nodes?
   1234                         values->put(key, link, status);
   1235                     }
   1236                     vResult->setLink(link);
   1237                 } else {
   1238                     vResult->setLink((BuildCompactTrieNode *)nodes[1]);
   1239                 }
   1240             }
   1241             else {
   1242                 vResult->setLink(compactOneNode(node, endsWord, nodes, status, values, value));
   1243             }
   1244             result = vResult;
   1245         }
   1246     }
   1247     return result;
   1248 }
   1249 
   1250 // Walk the set of peers at the same level, to build a horizontal node.
   1251 // Uses recursion.
   1252 
   1253 static void walkHorizontal(const TernaryNode *node,
   1254                            BuildCompactTrieHorizontalNode *building,
   1255                            UStack &nodes,
   1256                            UErrorCode &status, Hashtable *values = NULL) {
   1257     while (U_SUCCESS(status) && node != NULL) {
   1258         if (node->low != NULL) {
   1259             walkHorizontal(node->low, building, nodes, status, values);
   1260         }
   1261         BuildCompactTrieNode *link = NULL;
   1262         if (node->equal != NULL) {
   1263             link = compactOneNode(node->equal, node->flags > 0, nodes, status, values, node->flags);
   1264         }
   1265         else if (node->flags > 0) {
   1266             if(values != NULL) {
   1267                 UnicodeString key(node->flags); //store value as a single-char UnicodeString
   1268                 link = (BuildCompactTrieValueNode *) values->get(key);
   1269                 if(link == NULL) {
   1270                     link = new BuildCompactTrieValueNode(nodes, status, node->flags); //take out nodes?
   1271                     values->put(key, link, status);
   1272                 }
   1273             } else {
   1274                 link = (BuildCompactTrieNode *)nodes[1];
   1275             }
   1276         }
   1277         if (U_SUCCESS(status) && link != NULL) {
   1278             building->addNode(node->ch, link, status);
   1279         }
   1280         // Tail recurse manually instead of leaving it to the compiler.
   1281         //if (node->high != NULL) {
   1282         //    walkHorizontal(node->high, building, nodes, status);
   1283         //}
   1284         node = node->high;
   1285     }
   1286 }
   1287 
   1288 U_NAMESPACE_END
   1289 U_NAMESPACE_USE
   1290 U_CDECL_BEGIN
   1291 static int32_t U_CALLCONV
   1292 _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) {
   1293     BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl;
   1294     BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr;
   1295 
   1296     // Check for comparing a node to itself, to avoid spurious duplicates
   1297     if (left == right) {
   1298         return 0;
   1299     }
   1300 
   1301     // Most significant is type of node. Can never coalesce.
   1302     if (left->fNodeType != right->fNodeType) {
   1303         return left->fNodeType - right->fNodeType;
   1304     }
   1305     // Next, the "parent ends word" flag. If that differs, we cannot coalesce.
   1306     if (left->fParentEndsWord != right->fParentEndsWord) {
   1307         return left->fParentEndsWord - right->fParentEndsWord;
   1308     }
   1309     // Next, the string. If that differs, we can never coalesce.
   1310     int32_t result = left->fChars.compare(right->fChars);
   1311     if (result != 0) {
   1312         return result;
   1313     }
   1314 
   1315     // If the node value differs, we should not coalesce.
   1316     // If values aren't stored, all fValues should be 0.
   1317     if (left->fValue != right->fValue) {
   1318         return left->fValue - right->fValue;
   1319     }
   1320 
   1321     // We know they're both the same node type, so branch for the two cases.
   1322     if (left->fNodeType == kVerticalType) {
   1323         result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID
   1324         - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID;
   1325     }
   1326     else if(left->fChars.length() > 0 && right->fChars.length() > 0){
   1327         // We need to compare the links vectors. They should be the
   1328         // same size because the strings were equal.
   1329         // We compare the node IDs instead of the pointers, to handle
   1330         // coalesced nodes.
   1331         BuildCompactTrieHorizontalNode *hleft, *hright;
   1332         hleft = (BuildCompactTrieHorizontalNode *)left;
   1333         hright = (BuildCompactTrieHorizontalNode *)right;
   1334         int32_t count = hleft->fLinks.size();
   1335         for (int32_t i = 0; i < count && result == 0; ++i) {
   1336             result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID -
   1337             ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;
   1338         }
   1339     }
   1340 
   1341     // If they are equal to each other, mark them (speeds coalescing)
   1342     if (result == 0) {
   1343         left->fHasDuplicate = TRUE;
   1344         right->fHasDuplicate = TRUE;
   1345     }
   1346     return result;
   1347 }
   1348 U_CDECL_END
   1349 U_NAMESPACE_BEGIN
   1350 
   1351 static void coalesceDuplicates(UStack &nodes, UErrorCode &status) {
   1352     // We sort the array of nodes to place duplicates next to each other
   1353     if (U_FAILURE(status)) {
   1354         return;
   1355     }
   1356     int32_t size = nodes.size();
   1357     void **array = (void **)uprv_malloc(sizeof(void *)*size);
   1358     if (array == NULL) {
   1359         status = U_MEMORY_ALLOCATION_ERROR;
   1360         return;
   1361     }
   1362     (void) nodes.toArray(array);
   1363 
   1364     // Now repeatedly identify duplicates until there are no more
   1365     int32_t dupes = 0;
   1366     long    passCount = 0;
   1367 #ifdef DEBUG_TRIE_DICT
   1368     long    totalDupes = 0;
   1369 #endif
   1370     do {
   1371         BuildCompactTrieNode *node;
   1372         BuildCompactTrieNode *first = NULL;
   1373         BuildCompactTrieNode **p;
   1374         BuildCompactTrieNode **pFirst = NULL;
   1375         int32_t counter = size - 2;
   1376         // Sort the array, skipping nodes 0 and 1. Use quicksort for the first
   1377         // pass for speed. For the second and subsequent passes, we use stable
   1378         // (insertion) sort for two reasons:
   1379         // 1. The array is already mostly ordered, so we get better performance.
   1380         // 2. The way we find one and only one instance of a set of duplicates is to
   1381         //    check that the node ID equals the array index. If we used an unstable
   1382         //    sort for the second or later passes, it's possible that none of the
   1383         //    duplicates would wind up with a node ID equal to its array index.
   1384         //    The sort stability guarantees that, because as we coalesce more and
   1385         //    more groups, the first element of the resultant group will be one of
   1386         //    the first elements of the groups being coalesced.
   1387         // To use quicksort for the second and subsequent passes, we would have to
   1388         // find the minimum of the node numbers in a group, and set all the nodes
   1389         // in the group to that node number.
   1390         uprv_sortArray(array+2, counter, sizeof(void *), _sortBuildNodes, NULL, (passCount > 0), &status);
   1391         dupes = 0;
   1392         for (p = (BuildCompactTrieNode **)array + 2; counter > 0; --counter, ++p) {
   1393             node = *p;
   1394             if (node->fHasDuplicate) {
   1395                 if (first == NULL) {
   1396                     first = node;
   1397                     pFirst = p;
   1398                 }
   1399                 else if (_sortBuildNodes(NULL, pFirst, p) != 0) {
   1400                     // Starting a new run of dupes
   1401                     first = node;
   1402                     pFirst = p;
   1403                 }
   1404                 else if (node->fNodeID != first->fNodeID) {
   1405                     // Slave one to the other, note duplicate
   1406                     node->fNodeID = first->fNodeID;
   1407                     dupes += 1;
   1408                 }
   1409             }
   1410             else {
   1411                 // This node has no dupes
   1412                 first = NULL;
   1413                 pFirst = NULL;
   1414             }
   1415         }
   1416         passCount += 1;
   1417 #ifdef DEBUG_TRIE_DICT
   1418         totalDupes += dupes;
   1419         fprintf(stderr, "Trie node dupe removal, pass %d: %d nodes tagged\n", passCount, dupes);
   1420 #endif
   1421     }
   1422     while (dupes > 0);
   1423 #ifdef DEBUG_TRIE_DICT
   1424     fprintf(stderr, "Trie node dupe removal complete: %d tagged in %d passes\n", totalDupes, passCount);
   1425 #endif
   1426 
   1427     // We no longer need the temporary array, as the nodes have all been marked appropriately.
   1428     uprv_free(array);
   1429 }
   1430 
   1431 U_NAMESPACE_END
   1432 U_CDECL_BEGIN
   1433 static void U_CALLCONV _deleteBuildNode(void *obj) {
   1434     delete (BuildCompactTrieNode *) obj;
   1435 }
   1436 U_CDECL_END
   1437 U_NAMESPACE_BEGIN
   1438 
   1439 CompactTrieHeader *
   1440 CompactTrieDictionary::compactMutableTrieDictionary( const MutableTrieDictionary &dict,
   1441                                 UErrorCode &status ) {
   1442     if (U_FAILURE(status)) {
   1443         return NULL;
   1444     }
   1445 #ifdef DEBUG_TRIE_DICT
   1446     struct tms timing;
   1447     struct tms previous;
   1448     (void) ::times(&previous);
   1449 #endif
   1450     UStack nodes(_deleteBuildNode, NULL, status);      // Index of nodes
   1451 
   1452     // Add node 0, used as the NULL pointer/sentinel.
   1453     nodes.addElement((int32_t)0, status);
   1454 
   1455     Hashtable *values = NULL;                           // Index of (unique) values
   1456     if (dict.fValued) {
   1457         values = new Hashtable(status);
   1458     }
   1459 
   1460     // Start by creating the special empty node we use to indicate that the parent
   1461     // terminates a word. This must be node 1, because the builder assumes
   1462     // that. This node will never be used for tries storing numerical values.
   1463     if (U_FAILURE(status)) {
   1464         return NULL;
   1465     }
   1466     BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontalType, nodes, status);
   1467     if (terminal == NULL) {
   1468         status = U_MEMORY_ALLOCATION_ERROR;
   1469     }
   1470 
   1471     // This call does all the work of building the new trie structure. The root
   1472     // will have node ID 2 before writing to memory.
   1473     BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status, values);
   1474 #ifdef DEBUG_TRIE_DICT
   1475     (void) ::times(&timing);
   1476     fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n",
   1477         nodes.size(), (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
   1478         (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
   1479     previous = timing;
   1480 #endif
   1481 
   1482     // Now coalesce all duplicate nodes.
   1483     coalesceDuplicates(nodes, status);
   1484 #ifdef DEBUG_TRIE_DICT
   1485     (void) ::times(&timing);
   1486     fprintf(stderr, "Duplicates coalesced, time user %f system %f\n",
   1487         (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
   1488         (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
   1489     previous = timing;
   1490 #endif
   1491 
   1492     // Next, build the output trie.
   1493     // First we compute all the sizes and build the node ID translation table.
   1494     uint32_t totalSize = offsetof(CompactTrieHeader,offsets);
   1495     int32_t count = nodes.size();
   1496     int32_t nodeCount = 1;              // The sentinel node we already have
   1497     BuildCompactTrieNode *node;
   1498     int32_t i;
   1499     UVector32 translate(count, status); // Should be no growth needed after this
   1500     translate.push(0, status);          // The sentinel node
   1501 
   1502     if (U_FAILURE(status)) {
   1503         return NULL;
   1504     }
   1505 
   1506     //map terminal value nodes
   1507     int valueCount = 0;
   1508     UVector valueNodes(status);
   1509     if(values != NULL) {
   1510         valueCount = values->count(); //number of unique terminal value nodes
   1511     }
   1512 
   1513     // map non-terminal nodes
   1514     int valuePos = 1;//, nodePos = valueCount + valuePos;
   1515     nodeCount = valueCount + valuePos;
   1516     for (i = 1; i < count; ++i) {
   1517         node = (BuildCompactTrieNode *)nodes[i];
   1518         if (node->fNodeID == i) {
   1519             // Only one node out of each duplicate set is used
   1520             if (node->fNodeID >= translate.size()) {
   1521                 // Logically extend the mapping table
   1522                 translate.setSize(i + 1);
   1523             }
   1524             //translate.setElementAt(object, index)!
   1525             if(node->fNodeType == kValueType) {
   1526                 valueNodes.addElement(node, status);
   1527                translate.setElementAt(valuePos++, i);
   1528              } else {
   1529                 translate.setElementAt(nodeCount++, i);
   1530             }
   1531             totalSize += node->size();
   1532         }
   1533     }
   1534 
   1535     // Check for overflowing 20 bits worth of nodes.
   1536     if (nodeCount > 0x100000) {
   1537         status = U_ILLEGAL_ARGUMENT_ERROR;
   1538         return NULL;
   1539     }
   1540 
   1541     // Add enough room for the offsets.
   1542     totalSize += nodeCount*sizeof(uint32_t);
   1543 #ifdef DEBUG_TRIE_DICT
   1544     (void) ::times(&timing);
   1545     fprintf(stderr, "Sizes/mapping done, time user %f system %f\n",
   1546         (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
   1547         (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
   1548     previous = timing;
   1549     fprintf(stderr, "%d nodes, %d unique, %d bytes\n", nodes.size(), nodeCount, totalSize);
   1550 #endif
   1551     uint8_t *bytes = (uint8_t *)uprv_malloc(totalSize);
   1552     if (bytes == NULL) {
   1553         status = U_MEMORY_ALLOCATION_ERROR;
   1554         return NULL;
   1555     }
   1556 
   1557     CompactTrieHeader *header = (CompactTrieHeader *)bytes;
   1558     //header->size = totalSize;
   1559     if(dict.fValued){
   1560         header->magic = COMPACT_TRIE_MAGIC_3;
   1561     } else {
   1562         header->magic = COMPACT_TRIE_MAGIC_2;
   1563     }
   1564     header->nodeCount = nodeCount;
   1565     header->offsets[0] = 0;                     // Sentinel
   1566     header->root = translate.elementAti(root->fNodeID);
   1567 #ifdef DEBUG_TRIE_DICT
   1568     if (header->root == 0) {
   1569         fprintf(stderr, "ERROR: root node %d translate to physical zero\n", root->fNodeID);
   1570     }
   1571 #endif
   1572     uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint32_t));
   1573     nodeCount = valueCount + 1;
   1574 
   1575     // Write terminal value nodes to memory
   1576     for (i=0; i < valueNodes.size(); i++) {
   1577         //header->offsets[i + 1] = offset;
   1578         uint32_t tmpOffset = 0;
   1579         node = (BuildCompactTrieNode *) valueNodes.elementAt(i);
   1580         //header->offsets[i + 1] = (uint32_t)node->fValue;
   1581         node->write((uint8_t *)&header->offsets[i+1], tmpOffset, translate);
   1582     }
   1583 
   1584     // Now write the data
   1585     for (i = 1; i < count; ++i) {
   1586         node = (BuildCompactTrieNode *)nodes[i];
   1587         if (node->fNodeID == i && node->fNodeType != kValueType) {
   1588             header->offsets[nodeCount++] = offset;
   1589             node->write(bytes, offset, translate);
   1590         }
   1591     }
   1592 
   1593     //free all extra space
   1594     uprv_realloc(bytes, offset);
   1595     header->size = offset;
   1596 
   1597 #ifdef DEBUG_TRIE_DICT
   1598     fprintf(stdout, "Space freed: %d\n", totalSize-offset);
   1599 
   1600     (void) ::times(&timing);
   1601     fprintf(stderr, "Trie built, time user %f system %f\n",
   1602         (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
   1603         (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
   1604     previous = timing;
   1605     fprintf(stderr, "Final offset is %d\n", offset);
   1606 
   1607     // Collect statistics on node types and sizes
   1608     int hCount = 0;
   1609     int vCount = 0;
   1610     size_t hSize = 0;
   1611     size_t vSize = 0;
   1612     size_t hItemCount = 0;
   1613     size_t vItemCount = 0;
   1614     uint32_t previousOff = offset;
   1615     uint32_t numOverflow = 0;
   1616     uint32_t valueSpace = 0;
   1617     for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {
   1618         const CompactTrieNode *node = getCompactNode(header, nodeIdx);
   1619         int itemCount;
   1620         if(nodeIdx == header->root)
   1621             itemCount = node->flagscount & kRootCountMask;
   1622         else
   1623             itemCount = getCount(node);
   1624         if(node->flagscount & kEqualOverflows){
   1625             numOverflow++;
   1626         }
   1627         if (node->flagscount & kVerticalNode && nodeIdx != header->root) {
   1628             vCount += 1;
   1629             vItemCount += itemCount;
   1630             vSize += previousOff-header->offsets[nodeIdx];
   1631         }
   1632         else {
   1633             hCount += 1;
   1634             hItemCount += itemCount;
   1635             if(nodeIdx >= header->root) {
   1636                 hSize += previousOff-header->offsets[nodeIdx];
   1637             }
   1638         }
   1639 
   1640         if(header->magic == COMPACT_TRIE_MAGIC_3 && node->flagscount & kParentEndsWord)
   1641             valueSpace += sizeof(uint16_t);
   1642         previousOff = header->offsets[nodeIdx];
   1643     }
   1644     fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\n", hCount,
   1645                 (double)hSize/hCount, (double)hItemCount/hCount);
   1646     fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n", vCount,
   1647                 (double)vSize/vCount, (double)vItemCount/vCount);
   1648     fprintf(stderr, "Number of nodes with overflowing nodeIDs: %d \n", numOverflow);
   1649     fprintf(stderr, "Space taken up by values: %d \n", valueSpace);
   1650 #endif
   1651 
   1652     if (U_FAILURE(status)) {
   1653         uprv_free(bytes);
   1654         header = NULL;
   1655     }
   1656     return header;
   1657 }
   1658 
   1659 // Forward declaration
   1660 static TernaryNode *
   1661 unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status );
   1662 
   1663 // Convert a horizontal node (or subarray thereof) into a ternary subtrie
   1664 static TernaryNode *
   1665 unpackHorizontalArray( const CompactTrieInfo *info, const CompactTrieHorizontalNode *hnode,
   1666         int low, int high, int nodeCount, UErrorCode &status) {
   1667     if (U_FAILURE(status) || low > high) {
   1668         return NULL;
   1669     }
   1670     int middle = (low+high)/2;
   1671     TernaryNode *result = new TernaryNode(hnode->entries[middle].ch);
   1672     if (result == NULL) {
   1673         status = U_MEMORY_ALLOCATION_ERROR;
   1674         return NULL;
   1675     }
   1676     const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(hnode, middle, nodeCount));
   1677     if (equal->flagscount & kParentEndsWord) {
   1678         if(info->magic == COMPACT_TRIE_MAGIC_3){
   1679             result->flags = getValue(equal);
   1680         }else{
   1681             result->flags |= kEndsWord;
   1682         }
   1683     }
   1684     result->low = unpackHorizontalArray(info, hnode, low, middle-1, nodeCount, status);
   1685     result->high = unpackHorizontalArray(info, hnode, middle+1, high, nodeCount, status);
   1686     result->equal = unpackOneNode(info, equal, status);
   1687     return result;
   1688 }
   1689 
   1690 // Convert one compact trie node into a ternary subtrie
   1691 static TernaryNode *
   1692 unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status ) {
   1693     int nodeCount = getCount(node);
   1694     if (nodeCount == 0 || U_FAILURE(status)) {
   1695         // Failure, or terminal node
   1696         return NULL;
   1697     }
   1698     if (node->flagscount & kVerticalNode) {
   1699         const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;
   1700         TernaryNode *head = NULL;
   1701         TernaryNode *previous = NULL;
   1702         TernaryNode *latest = NULL;
   1703         for (int i = 0; i < nodeCount; ++i) {
   1704             latest = new TernaryNode(vnode->chars[i]);
   1705             if (latest == NULL) {
   1706                 status = U_MEMORY_ALLOCATION_ERROR;
   1707                 break;
   1708             }
   1709             if (head == NULL) {
   1710                 head = latest;
   1711             }
   1712             if (previous != NULL) {
   1713                 previous->equal = latest;
   1714             }
   1715             previous = latest;
   1716         }
   1717         if (latest != NULL) {
   1718             const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(vnode));
   1719             if (equal->flagscount & kParentEndsWord) {
   1720                 if(info->magic == COMPACT_TRIE_MAGIC_3){
   1721                     latest->flags = getValue(equal);
   1722                 } else {
   1723                     latest->flags |= kEndsWord;
   1724                 }
   1725             }
   1726             latest->equal = unpackOneNode(info, equal, status);
   1727         }
   1728         return head;
   1729     }
   1730     else {
   1731         // Horizontal node
   1732         const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
   1733         return unpackHorizontalArray(info, hnode, 0, nodeCount-1, nodeCount, status);
   1734     }
   1735 }
   1736 
   1737 // returns a MutableTrieDictionary generated from the CompactTrieDictionary
   1738 MutableTrieDictionary *
   1739 CompactTrieDictionary::cloneMutable( UErrorCode &status ) const {
   1740     MutableTrieDictionary *result = new MutableTrieDictionary( status, fInfo->magic == COMPACT_TRIE_MAGIC_3 );
   1741     if (result == NULL) {
   1742         status = U_MEMORY_ALLOCATION_ERROR;
   1743         return NULL;
   1744     }
   1745     // treat root node as special case: don't call unpackOneNode() or unpackHorizontalArray() directly
   1746     // because only kEqualOverflows flag should be checked in root's flagscount
   1747     const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)
   1748     getCompactNode(fInfo, fInfo->root);
   1749     uint16_t nodeCount = hnode->flagscount & kRootCountMask;
   1750     TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1,
   1751             nodeCount, status);
   1752 
   1753     if (U_FAILURE(status)) {
   1754         delete root;    // Clean up
   1755         delete result;
   1756         return NULL;
   1757     }
   1758     result->fTrie = root;
   1759     return result;
   1760 }
   1761 
   1762 U_NAMESPACE_END
   1763 
   1764 U_CAPI int32_t U_EXPORT2
   1765 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
   1766         UErrorCode *status) {
   1767 
   1768     if (status == NULL || U_FAILURE(*status)) {
   1769         return 0;
   1770     }
   1771     if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
   1772         *status=U_ILLEGAL_ARGUMENT_ERROR;
   1773         return 0;
   1774     }
   1775 
   1776     //
   1777     //  Check that the data header is for for dictionary data.
   1778     //    (Header contents are defined in genxxx.cpp)
   1779     //
   1780     const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4);
   1781     if(!(  pInfo->dataFormat[0]==0x54 &&   /* dataFormat="TrDc" */
   1782             pInfo->dataFormat[1]==0x72 &&
   1783             pInfo->dataFormat[2]==0x44 &&
   1784             pInfo->dataFormat[3]==0x63 &&
   1785             pInfo->formatVersion[0]==1  )) {
   1786         udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
   1787                 pInfo->dataFormat[0], pInfo->dataFormat[1],
   1788                 pInfo->dataFormat[2], pInfo->dataFormat[3],
   1789                 pInfo->formatVersion[0]);
   1790         *status=U_UNSUPPORTED_ERROR;
   1791         return 0;
   1792     }
   1793 
   1794     //
   1795     // Swap the data header.  (This is the generic ICU Data Header, not the
   1796     //                         CompactTrieHeader).  This swap also conveniently gets us
   1797     //                         the size of the ICU d.h., which lets us locate the start
   1798     //                         of the RBBI specific data.
   1799     //
   1800     int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
   1801 
   1802     //
   1803     // Get the CompactTrieHeader, and check that it appears to be OK.
   1804     //
   1805     const uint8_t  *inBytes =(const uint8_t *)inData+headerSize;
   1806     const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes;
   1807     uint32_t magic = ds->readUInt32(header->magic);
   1808     if (magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic != COMPACT_TRIE_MAGIC_3
   1809             || magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeaderV1)
   1810             || magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeader))
   1811     {
   1812         udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n");
   1813         *status=U_UNSUPPORTED_ERROR;
   1814         return 0;
   1815     }
   1816 
   1817     //
   1818     // Prefight operation?  Just return the size
   1819     //
   1820     uint32_t totalSize = ds->readUInt32(header->size);
   1821     int32_t sizeWithUData = (int32_t)totalSize + headerSize;
   1822     if (length < 0) {
   1823         return sizeWithUData;
   1824     }
   1825 
   1826     //
   1827     // Check that length passed in is consistent with length from RBBI data header.
   1828     //
   1829     if (length < sizeWithUData) {
   1830         udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n",
   1831                 totalSize);
   1832         *status=U_INDEX_OUTOFBOUNDS_ERROR;
   1833         return 0;
   1834     }
   1835 
   1836     //
   1837     // Swap the Data.  Do the data itself first, then the CompactTrieHeader, because
   1838     //                 we need to reference the header to locate the data, and an
   1839     //                 inplace swap of the header leaves it unusable.
   1840     //
   1841     uint8_t             *outBytes = (uint8_t *)outData + headerSize;
   1842     CompactTrieHeader   *outputHeader = (CompactTrieHeader *)outBytes;
   1843 
   1844 #if 0
   1845     //
   1846     // If not swapping in place, zero out the output buffer before starting.
   1847     //
   1848     if (inBytes != outBytes) {
   1849         uprv_memset(outBytes, 0, totalSize);
   1850     }
   1851 
   1852     // We need to loop through all the nodes in the offset table, and swap each one.
   1853     uint32_t nodeCount, rootId;
   1854     if(header->magic == COMPACT_TRIE_MAGIC_1) {
   1855         nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount);
   1856         rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root);
   1857     } else {
   1858         nodeCount = ds->readUInt32(header->nodeCount);
   1859         rootId = ds->readUInt32(header->root);
   1860     }
   1861 
   1862     // Skip node 0, which should always be 0.
   1863     for (uint32_t i = 1; i < nodeCount; ++i) {
   1864         uint32_t nodeOff = ds->readUInt32(header->offsets[i]);
   1865         const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + nodeOff);
   1866         CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff);
   1867         uint16_t flagscount = ds->readUInt16(inNode->flagscount);
   1868         uint16_t itemCount = getCount(inNode);
   1869         //uint16_t itemCount = flagscount & kCountMask;
   1870         ds->writeUInt16(&outNode->flagscount, flagscount);
   1871         if (itemCount > 0) {
   1872             uint16_t overflow = 0; //number of extra uint16_ts needed to be swapped
   1873             if (flagscount & kVerticalNode && i != rootId) {
   1874                 if(flagscount & kEqualOverflows){
   1875                     // include overflow bits
   1876                     overflow += 1;
   1877                 }
   1878                 if (header->magic == COMPACT_TRIE_MAGIC_3 && flagscount & kEndsParentWord) {
   1879                     //include values
   1880                     overflow += 1;
   1881                 }
   1882                 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars),
   1883                         (itemCount + overflow)*sizeof(uint16_t),
   1884                         outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status);
   1885                 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal);
   1886                 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal));
   1887             }
   1888             else {
   1889                 const CompactTrieHorizontalNode *inHNode = (const CompactTrieHorizontalNode *)inNode;
   1890                 CompactTrieHorizontalNode *outHNode = (CompactTrieHorizontalNode *)outNode;
   1891                 for (int j = 0; j < itemCount; ++j) {
   1892                     uint16_t word = ds->readUInt16(inHNode->entries[j].ch);
   1893                     ds->writeUInt16(&outHNode->entries[j].ch, word);
   1894                     word = ds->readUInt16(inHNode->entries[j].equal);
   1895                     ds->writeUInt16(&outHNode->entries[j].equal, word);
   1896                 }
   1897 
   1898                 // swap overflow/value information
   1899                 if(flagscount & kEqualOverflows){
   1900                     overflow += (itemCount + 3) / 4;
   1901                 }
   1902 
   1903                 if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && flagscount & kEndsParentWord) {
   1904                     //include values
   1905                     overflow += 1;
   1906                 }
   1907 
   1908                 uint16_t *inOverflow = (uint16_t *) &inHNode->entries[itemCount];
   1909                 uint16_t *outOverflow = (uint16_t *) &outHNode->entries[itemCount];
   1910                 for(int j = 0; j<overflow; j++){
   1911                     uint16_t extraInfo = ds->readUInt16(*inOverflow);
   1912                     ds->writeUInt16(outOverflow, extraInfo);
   1913 
   1914                     inOverflow++;
   1915                     outOverflow++;
   1916                 }
   1917             }
   1918         }
   1919     }
   1920 #endif
   1921 
   1922     // Swap the header
   1923     ds->writeUInt32(&outputHeader->size, totalSize);
   1924     ds->writeUInt32(&outputHeader->magic, magic);
   1925 
   1926     uint32_t nodeCount;
   1927     uint32_t offsetPos;
   1928     if (header->magic == COMPACT_TRIE_MAGIC_1) {
   1929         CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *)header;
   1930         CompactTrieHeaderV1 *outputHeaderV1 = (CompactTrieHeaderV1 *)outputHeader;
   1931 
   1932         nodeCount = ds->readUInt16(headerV1->nodeCount);
   1933         ds->writeUInt16(&outputHeaderV1->nodeCount, nodeCount);
   1934         uint16_t root = ds->readUInt16(headerV1->root);
   1935         ds->writeUInt16(&outputHeaderV1->root, root);
   1936         offsetPos = offsetof(CompactTrieHeaderV1,offsets);
   1937     } else {
   1938         nodeCount = ds->readUInt32(header->nodeCount);
   1939         ds->writeUInt32(&outputHeader->nodeCount, nodeCount);
   1940         uint32_t root = ds->readUInt32(header->root);
   1941         ds->writeUInt32(&outputHeader->root, root);
   1942         offsetPos = offsetof(CompactTrieHeader,offsets);
   1943     }
   1944 
   1945     // All the data in all the nodes consist of 16 bit items. Swap them all at once.
   1946     uint32_t nodesOff = offsetPos+((uint32_t)nodeCount*sizeof(uint32_t));
   1947     ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status);
   1948 
   1949     //swap offsets
   1950     ds->swapArray32(ds, inBytes+offsetPos,
   1951             sizeof(uint32_t)*(uint32_t)nodeCount,
   1952             outBytes+offsetPos, status);
   1953 
   1954     return sizeWithUData;
   1955 }
   1956 
   1957 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   1958