Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 2010-2011, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  ucharstrie.h
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2010nov14
     12 *   created by: Markus W. Scherer
     13 */
     14 
     15 #include "unicode/utypes.h"
     16 #include "unicode/appendable.h"
     17 #include "unicode/ucharstrie.h"
     18 #include "unicode/uobject.h"
     19 #include "cmemory.h"
     20 #include "uassert.h"
     21 
     22 U_NAMESPACE_BEGIN
     23 
     24 UCharsTrie::~UCharsTrie() {
     25     uprv_free(ownedArray_);
     26 }
     27 
     28 UStringTrieResult
     29 UCharsTrie::current() const {
     30     const UChar *pos=pos_;
     31     if(pos==NULL) {
     32         return USTRINGTRIE_NO_MATCH;
     33     } else {
     34         int32_t node;
     35         return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ?
     36                 valueResult(node) : USTRINGTRIE_NO_VALUE;
     37     }
     38 }
     39 
     40 UStringTrieResult
     41 UCharsTrie::branchNext(const UChar *pos, int32_t length, int32_t uchar) {
     42     // Branch according to the current unit.
     43     if(length==0) {
     44         length=*pos++;
     45     }
     46     ++length;
     47     // The length of the branch is the number of units to select from.
     48     // The data structure encodes a binary search.
     49     while(length>kMaxBranchLinearSubNodeLength) {
     50         if(uchar<*pos++) {
     51             length>>=1;
     52             pos=jumpByDelta(pos);
     53         } else {
     54             length=length-(length>>1);
     55             pos=skipDelta(pos);
     56         }
     57     }
     58     // Drop down to linear search for the last few units.
     59     // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
     60     // and divides length by 2.
     61     do {
     62         if(uchar==*pos++) {
     63             UStringTrieResult result;
     64             int32_t node=*pos;
     65             if(node&kValueIsFinal) {
     66                 // Leave the final value for getValue() to read.
     67                 result=USTRINGTRIE_FINAL_VALUE;
     68             } else {
     69                 // Use the non-final value as the jump delta.
     70                 ++pos;
     71                 // int32_t delta=readValue(pos, node);
     72                 int32_t delta;
     73                 if(node<kMinTwoUnitValueLead) {
     74                     delta=node;
     75                 } else if(node<kThreeUnitValueLead) {
     76                     delta=((node-kMinTwoUnitValueLead)<<16)|*pos++;
     77                 } else {
     78                     delta=(pos[0]<<16)|pos[1];
     79                     pos+=2;
     80                 }
     81                 // end readValue()
     82                 pos+=delta;
     83                 node=*pos;
     84                 result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
     85             }
     86             pos_=pos;
     87             return result;
     88         }
     89         --length;
     90         pos=skipValue(pos);
     91     } while(length>1);
     92     if(uchar==*pos++) {
     93         pos_=pos;
     94         int32_t node=*pos;
     95         return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
     96     } else {
     97         stop();
     98         return USTRINGTRIE_NO_MATCH;
     99     }
    100 }
    101 
    102 UStringTrieResult
    103 UCharsTrie::nextImpl(const UChar *pos, int32_t uchar) {
    104     int32_t node=*pos++;
    105     for(;;) {
    106         if(node<kMinLinearMatch) {
    107             return branchNext(pos, node, uchar);
    108         } else if(node<kMinValueLead) {
    109             // Match the first of length+1 units.
    110             int32_t length=node-kMinLinearMatch;  // Actual match length minus 1.
    111             if(uchar==*pos++) {
    112                 remainingMatchLength_=--length;
    113                 pos_=pos;
    114                 return (length<0 && (node=*pos)>=kMinValueLead) ?
    115                         valueResult(node) : USTRINGTRIE_NO_VALUE;
    116             } else {
    117                 // No match.
    118                 break;
    119             }
    120         } else if(node&kValueIsFinal) {
    121             // No further matching units.
    122             break;
    123         } else {
    124             // Skip intermediate value.
    125             pos=skipNodeValue(pos, node);
    126             node&=kNodeTypeMask;
    127         }
    128     }
    129     stop();
    130     return USTRINGTRIE_NO_MATCH;
    131 }
    132 
    133 UStringTrieResult
    134 UCharsTrie::next(int32_t uchar) {
    135     const UChar *pos=pos_;
    136     if(pos==NULL) {
    137         return USTRINGTRIE_NO_MATCH;
    138     }
    139     int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
    140     if(length>=0) {
    141         // Remaining part of a linear-match node.
    142         if(uchar==*pos++) {
    143             remainingMatchLength_=--length;
    144             pos_=pos;
    145             int32_t node;
    146             return (length<0 && (node=*pos)>=kMinValueLead) ?
    147                     valueResult(node) : USTRINGTRIE_NO_VALUE;
    148         } else {
    149             stop();
    150             return USTRINGTRIE_NO_MATCH;
    151         }
    152     }
    153     return nextImpl(pos, uchar);
    154 }
    155 
    156 UStringTrieResult
    157 UCharsTrie::next(const UChar *s, int32_t sLength) {
    158     if(sLength<0 ? *s==0 : sLength==0) {
    159         // Empty input.
    160         return current();
    161     }
    162     const UChar *pos=pos_;
    163     if(pos==NULL) {
    164         return USTRINGTRIE_NO_MATCH;
    165     }
    166     int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
    167     for(;;) {
    168         // Fetch the next input unit, if there is one.
    169         // Continue a linear-match node without rechecking sLength<0.
    170         int32_t uchar;
    171         if(sLength<0) {
    172             for(;;) {
    173                 if((uchar=*s++)==0) {
    174                     remainingMatchLength_=length;
    175                     pos_=pos;
    176                     int32_t node;
    177                     return (length<0 && (node=*pos)>=kMinValueLead) ?
    178                             valueResult(node) : USTRINGTRIE_NO_VALUE;
    179                 }
    180                 if(length<0) {
    181                     remainingMatchLength_=length;
    182                     break;
    183                 }
    184                 if(uchar!=*pos) {
    185                     stop();
    186                     return USTRINGTRIE_NO_MATCH;
    187                 }
    188                 ++pos;
    189                 --length;
    190             }
    191         } else {
    192             for(;;) {
    193                 if(sLength==0) {
    194                     remainingMatchLength_=length;
    195                     pos_=pos;
    196                     int32_t node;
    197                     return (length<0 && (node=*pos)>=kMinValueLead) ?
    198                             valueResult(node) : USTRINGTRIE_NO_VALUE;
    199                 }
    200                 uchar=*s++;
    201                 --sLength;
    202                 if(length<0) {
    203                     remainingMatchLength_=length;
    204                     break;
    205                 }
    206                 if(uchar!=*pos) {
    207                     stop();
    208                     return USTRINGTRIE_NO_MATCH;
    209                 }
    210                 ++pos;
    211                 --length;
    212             }
    213         }
    214         int32_t node=*pos++;
    215         for(;;) {
    216             if(node<kMinLinearMatch) {
    217                 UStringTrieResult result=branchNext(pos, node, uchar);
    218                 if(result==USTRINGTRIE_NO_MATCH) {
    219                     return USTRINGTRIE_NO_MATCH;
    220                 }
    221                 // Fetch the next input unit, if there is one.
    222                 if(sLength<0) {
    223                     if((uchar=*s++)==0) {
    224                         return result;
    225                     }
    226                 } else {
    227                     if(sLength==0) {
    228                         return result;
    229                     }
    230                     uchar=*s++;
    231                     --sLength;
    232                 }
    233                 if(result==USTRINGTRIE_FINAL_VALUE) {
    234                     // No further matching units.
    235                     stop();
    236                     return USTRINGTRIE_NO_MATCH;
    237                 }
    238                 pos=pos_;  // branchNext() advanced pos and wrote it to pos_ .
    239                 node=*pos++;
    240             } else if(node<kMinValueLead) {
    241                 // Match length+1 units.
    242                 length=node-kMinLinearMatch;  // Actual match length minus 1.
    243                 if(uchar!=*pos) {
    244                     stop();
    245                     return USTRINGTRIE_NO_MATCH;
    246                 }
    247                 ++pos;
    248                 --length;
    249                 break;
    250             } else if(node&kValueIsFinal) {
    251                 // No further matching units.
    252                 stop();
    253                 return USTRINGTRIE_NO_MATCH;
    254             } else {
    255                 // Skip intermediate value.
    256                 pos=skipNodeValue(pos, node);
    257                 node&=kNodeTypeMask;
    258             }
    259         }
    260     }
    261 }
    262 
    263 const UChar *
    264 UCharsTrie::findUniqueValueFromBranch(const UChar *pos, int32_t length,
    265                                       UBool haveUniqueValue, int32_t &uniqueValue) {
    266     while(length>kMaxBranchLinearSubNodeLength) {
    267         ++pos;  // ignore the comparison unit
    268         if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
    269             return NULL;
    270         }
    271         length=length-(length>>1);
    272         pos=skipDelta(pos);
    273     }
    274     do {
    275         ++pos;  // ignore a comparison unit
    276         // handle its value
    277         int32_t node=*pos++;
    278         UBool isFinal=(UBool)(node>>15);
    279         node&=0x7fff;
    280         int32_t value=readValue(pos, node);
    281         pos=skipValue(pos, node);
    282         if(isFinal) {
    283             if(haveUniqueValue) {
    284                 if(value!=uniqueValue) {
    285                     return NULL;
    286                 }
    287             } else {
    288                 uniqueValue=value;
    289                 haveUniqueValue=TRUE;
    290             }
    291         } else {
    292             if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
    293                 return NULL;
    294             }
    295             haveUniqueValue=TRUE;
    296         }
    297     } while(--length>1);
    298     return pos+1;  // ignore the last comparison unit
    299 }
    300 
    301 UBool
    302 UCharsTrie::findUniqueValue(const UChar *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
    303     int32_t node=*pos++;
    304     for(;;) {
    305         if(node<kMinLinearMatch) {
    306             if(node==0) {
    307                 node=*pos++;
    308             }
    309             pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);
    310             if(pos==NULL) {
    311                 return FALSE;
    312             }
    313             haveUniqueValue=TRUE;
    314             node=*pos++;
    315         } else if(node<kMinValueLead) {
    316             // linear-match node
    317             pos+=node-kMinLinearMatch+1;  // Ignore the match units.
    318             node=*pos++;
    319         } else {
    320             UBool isFinal=(UBool)(node>>15);
    321             int32_t value;
    322             if(isFinal) {
    323                 value=readValue(pos, node&0x7fff);
    324             } else {
    325                 value=readNodeValue(pos, node);
    326             }
    327             if(haveUniqueValue) {
    328                 if(value!=uniqueValue) {
    329                     return FALSE;
    330                 }
    331             } else {
    332                 uniqueValue=value;
    333                 haveUniqueValue=TRUE;
    334             }
    335             if(isFinal) {
    336                 return TRUE;
    337             }
    338             pos=skipNodeValue(pos, node);
    339             node&=kNodeTypeMask;
    340         }
    341     }
    342 }
    343 
    344 int32_t
    345 UCharsTrie::getNextUChars(Appendable &out) const {
    346     const UChar *pos=pos_;
    347     if(pos==NULL) {
    348         return 0;
    349     }
    350     if(remainingMatchLength_>=0) {
    351         out.appendCodeUnit(*pos);  // Next unit of a pending linear-match node.
    352         return 1;
    353     }
    354     int32_t node=*pos++;
    355     if(node>=kMinValueLead) {
    356         if(node&kValueIsFinal) {
    357             return 0;
    358         } else {
    359             pos=skipNodeValue(pos, node);
    360             node&=kNodeTypeMask;
    361         }
    362     }
    363     if(node<kMinLinearMatch) {
    364         if(node==0) {
    365             node=*pos++;
    366         }
    367         out.reserveAppendCapacity(++node);
    368         getNextBranchUChars(pos, node, out);
    369         return node;
    370     } else {
    371         // First unit of the linear-match node.
    372         out.appendCodeUnit(*pos);
    373         return 1;
    374     }
    375 }
    376 
    377 void
    378 UCharsTrie::getNextBranchUChars(const UChar *pos, int32_t length, Appendable &out) {
    379     while(length>kMaxBranchLinearSubNodeLength) {
    380         ++pos;  // ignore the comparison unit
    381         getNextBranchUChars(jumpByDelta(pos), length>>1, out);
    382         length=length-(length>>1);
    383         pos=skipDelta(pos);
    384     }
    385     do {
    386         out.appendCodeUnit(*pos++);
    387         pos=skipValue(pos);
    388     } while(--length>1);
    389     out.appendCodeUnit(*pos);
    390 }
    391 
    392 U_NAMESPACE_END
    393