Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 2010-2011, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  ucharstrie.h
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2010nov14
     12 *   created by: Markus W. Scherer
     13 */
     14 
     15 #include "unicode/utypes.h"
     16 #include "unicode/appendable.h"
     17 #include "unicode/ucharstrie.h"
     18 #include "unicode/uobject.h"
     19 #include "unicode/utf16.h"
     20 #include "cmemory.h"
     21 #include "uassert.h"
     22 
     23 U_NAMESPACE_BEGIN
     24 
     25 UCharsTrie::~UCharsTrie() {
     26     uprv_free(ownedArray_);
     27 }
     28 
     29 UStringTrieResult
     30 UCharsTrie::current() const {
     31     const UChar *pos=pos_;
     32     if(pos==NULL) {
     33         return USTRINGTRIE_NO_MATCH;
     34     } else {
     35         int32_t node;
     36         return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ?
     37                 valueResult(node) : USTRINGTRIE_NO_VALUE;
     38     }
     39 }
     40 
     41 UStringTrieResult
     42 UCharsTrie::firstForCodePoint(UChar32 cp) {
     43     return cp<=0xffff ?
     44         first(cp) :
     45         (USTRINGTRIE_HAS_NEXT(first(U16_LEAD(cp))) ?
     46             next(U16_TRAIL(cp)) :
     47             USTRINGTRIE_NO_MATCH);
     48 }
     49 
     50 UStringTrieResult
     51 UCharsTrie::nextForCodePoint(UChar32 cp) {
     52     return cp<=0xffff ?
     53         next(cp) :
     54         (USTRINGTRIE_HAS_NEXT(next(U16_LEAD(cp))) ?
     55             next(U16_TRAIL(cp)) :
     56             USTRINGTRIE_NO_MATCH);
     57 }
     58 
     59 UStringTrieResult
     60 UCharsTrie::branchNext(const UChar *pos, int32_t length, int32_t uchar) {
     61     // Branch according to the current unit.
     62     if(length==0) {
     63         length=*pos++;
     64     }
     65     ++length;
     66     // The length of the branch is the number of units to select from.
     67     // The data structure encodes a binary search.
     68     while(length>kMaxBranchLinearSubNodeLength) {
     69         if(uchar<*pos++) {
     70             length>>=1;
     71             pos=jumpByDelta(pos);
     72         } else {
     73             length=length-(length>>1);
     74             pos=skipDelta(pos);
     75         }
     76     }
     77     // Drop down to linear search for the last few units.
     78     // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
     79     // and divides length by 2.
     80     do {
     81         if(uchar==*pos++) {
     82             UStringTrieResult result;
     83             int32_t node=*pos;
     84             if(node&kValueIsFinal) {
     85                 // Leave the final value for getValue() to read.
     86                 result=USTRINGTRIE_FINAL_VALUE;
     87             } else {
     88                 // Use the non-final value as the jump delta.
     89                 ++pos;
     90                 // int32_t delta=readValue(pos, node);
     91                 int32_t delta;
     92                 if(node<kMinTwoUnitValueLead) {
     93                     delta=node;
     94                 } else if(node<kThreeUnitValueLead) {
     95                     delta=((node-kMinTwoUnitValueLead)<<16)|*pos++;
     96                 } else {
     97                     delta=(pos[0]<<16)|pos[1];
     98                     pos+=2;
     99                 }
    100                 // end readValue()
    101                 pos+=delta;
    102                 node=*pos;
    103                 result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
    104             }
    105             pos_=pos;
    106             return result;
    107         }
    108         --length;
    109         pos=skipValue(pos);
    110     } while(length>1);
    111     if(uchar==*pos++) {
    112         pos_=pos;
    113         int32_t node=*pos;
    114         return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
    115     } else {
    116         stop();
    117         return USTRINGTRIE_NO_MATCH;
    118     }
    119 }
    120 
    121 UStringTrieResult
    122 UCharsTrie::nextImpl(const UChar *pos, int32_t uchar) {
    123     int32_t node=*pos++;
    124     for(;;) {
    125         if(node<kMinLinearMatch) {
    126             return branchNext(pos, node, uchar);
    127         } else if(node<kMinValueLead) {
    128             // Match the first of length+1 units.
    129             int32_t length=node-kMinLinearMatch;  // Actual match length minus 1.
    130             if(uchar==*pos++) {
    131                 remainingMatchLength_=--length;
    132                 pos_=pos;
    133                 return (length<0 && (node=*pos)>=kMinValueLead) ?
    134                         valueResult(node) : USTRINGTRIE_NO_VALUE;
    135             } else {
    136                 // No match.
    137                 break;
    138             }
    139         } else if(node&kValueIsFinal) {
    140             // No further matching units.
    141             break;
    142         } else {
    143             // Skip intermediate value.
    144             pos=skipNodeValue(pos, node);
    145             node&=kNodeTypeMask;
    146         }
    147     }
    148     stop();
    149     return USTRINGTRIE_NO_MATCH;
    150 }
    151 
    152 UStringTrieResult
    153 UCharsTrie::next(int32_t uchar) {
    154     const UChar *pos=pos_;
    155     if(pos==NULL) {
    156         return USTRINGTRIE_NO_MATCH;
    157     }
    158     int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
    159     if(length>=0) {
    160         // Remaining part of a linear-match node.
    161         if(uchar==*pos++) {
    162             remainingMatchLength_=--length;
    163             pos_=pos;
    164             int32_t node;
    165             return (length<0 && (node=*pos)>=kMinValueLead) ?
    166                     valueResult(node) : USTRINGTRIE_NO_VALUE;
    167         } else {
    168             stop();
    169             return USTRINGTRIE_NO_MATCH;
    170         }
    171     }
    172     return nextImpl(pos, uchar);
    173 }
    174 
    175 UStringTrieResult
    176 UCharsTrie::next(const UChar *s, int32_t sLength) {
    177     if(sLength<0 ? *s==0 : sLength==0) {
    178         // Empty input.
    179         return current();
    180     }
    181     const UChar *pos=pos_;
    182     if(pos==NULL) {
    183         return USTRINGTRIE_NO_MATCH;
    184     }
    185     int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
    186     for(;;) {
    187         // Fetch the next input unit, if there is one.
    188         // Continue a linear-match node without rechecking sLength<0.
    189         int32_t uchar;
    190         if(sLength<0) {
    191             for(;;) {
    192                 if((uchar=*s++)==0) {
    193                     remainingMatchLength_=length;
    194                     pos_=pos;
    195                     int32_t node;
    196                     return (length<0 && (node=*pos)>=kMinValueLead) ?
    197                             valueResult(node) : USTRINGTRIE_NO_VALUE;
    198                 }
    199                 if(length<0) {
    200                     remainingMatchLength_=length;
    201                     break;
    202                 }
    203                 if(uchar!=*pos) {
    204                     stop();
    205                     return USTRINGTRIE_NO_MATCH;
    206                 }
    207                 ++pos;
    208                 --length;
    209             }
    210         } else {
    211             for(;;) {
    212                 if(sLength==0) {
    213                     remainingMatchLength_=length;
    214                     pos_=pos;
    215                     int32_t node;
    216                     return (length<0 && (node=*pos)>=kMinValueLead) ?
    217                             valueResult(node) : USTRINGTRIE_NO_VALUE;
    218                 }
    219                 uchar=*s++;
    220                 --sLength;
    221                 if(length<0) {
    222                     remainingMatchLength_=length;
    223                     break;
    224                 }
    225                 if(uchar!=*pos) {
    226                     stop();
    227                     return USTRINGTRIE_NO_MATCH;
    228                 }
    229                 ++pos;
    230                 --length;
    231             }
    232         }
    233         int32_t node=*pos++;
    234         for(;;) {
    235             if(node<kMinLinearMatch) {
    236                 UStringTrieResult result=branchNext(pos, node, uchar);
    237                 if(result==USTRINGTRIE_NO_MATCH) {
    238                     return USTRINGTRIE_NO_MATCH;
    239                 }
    240                 // Fetch the next input unit, if there is one.
    241                 if(sLength<0) {
    242                     if((uchar=*s++)==0) {
    243                         return result;
    244                     }
    245                 } else {
    246                     if(sLength==0) {
    247                         return result;
    248                     }
    249                     uchar=*s++;
    250                     --sLength;
    251                 }
    252                 if(result==USTRINGTRIE_FINAL_VALUE) {
    253                     // No further matching units.
    254                     stop();
    255                     return USTRINGTRIE_NO_MATCH;
    256                 }
    257                 pos=pos_;  // branchNext() advanced pos and wrote it to pos_ .
    258                 node=*pos++;
    259             } else if(node<kMinValueLead) {
    260                 // Match length+1 units.
    261                 length=node-kMinLinearMatch;  // Actual match length minus 1.
    262                 if(uchar!=*pos) {
    263                     stop();
    264                     return USTRINGTRIE_NO_MATCH;
    265                 }
    266                 ++pos;
    267                 --length;
    268                 break;
    269             } else if(node&kValueIsFinal) {
    270                 // No further matching units.
    271                 stop();
    272                 return USTRINGTRIE_NO_MATCH;
    273             } else {
    274                 // Skip intermediate value.
    275                 pos=skipNodeValue(pos, node);
    276                 node&=kNodeTypeMask;
    277             }
    278         }
    279     }
    280 }
    281 
    282 const UChar *
    283 UCharsTrie::findUniqueValueFromBranch(const UChar *pos, int32_t length,
    284                                       UBool haveUniqueValue, int32_t &uniqueValue) {
    285     while(length>kMaxBranchLinearSubNodeLength) {
    286         ++pos;  // ignore the comparison unit
    287         if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
    288             return NULL;
    289         }
    290         length=length-(length>>1);
    291         pos=skipDelta(pos);
    292     }
    293     do {
    294         ++pos;  // ignore a comparison unit
    295         // handle its value
    296         int32_t node=*pos++;
    297         UBool isFinal=(UBool)(node>>15);
    298         node&=0x7fff;
    299         int32_t value=readValue(pos, node);
    300         pos=skipValue(pos, node);
    301         if(isFinal) {
    302             if(haveUniqueValue) {
    303                 if(value!=uniqueValue) {
    304                     return NULL;
    305                 }
    306             } else {
    307                 uniqueValue=value;
    308                 haveUniqueValue=TRUE;
    309             }
    310         } else {
    311             if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
    312                 return NULL;
    313             }
    314             haveUniqueValue=TRUE;
    315         }
    316     } while(--length>1);
    317     return pos+1;  // ignore the last comparison unit
    318 }
    319 
    320 UBool
    321 UCharsTrie::findUniqueValue(const UChar *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
    322     int32_t node=*pos++;
    323     for(;;) {
    324         if(node<kMinLinearMatch) {
    325             if(node==0) {
    326                 node=*pos++;
    327             }
    328             pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);
    329             if(pos==NULL) {
    330                 return FALSE;
    331             }
    332             haveUniqueValue=TRUE;
    333             node=*pos++;
    334         } else if(node<kMinValueLead) {
    335             // linear-match node
    336             pos+=node-kMinLinearMatch+1;  // Ignore the match units.
    337             node=*pos++;
    338         } else {
    339             UBool isFinal=(UBool)(node>>15);
    340             int32_t value;
    341             if(isFinal) {
    342                 value=readValue(pos, node&0x7fff);
    343             } else {
    344                 value=readNodeValue(pos, node);
    345             }
    346             if(haveUniqueValue) {
    347                 if(value!=uniqueValue) {
    348                     return FALSE;
    349                 }
    350             } else {
    351                 uniqueValue=value;
    352                 haveUniqueValue=TRUE;
    353             }
    354             if(isFinal) {
    355                 return TRUE;
    356             }
    357             pos=skipNodeValue(pos, node);
    358             node&=kNodeTypeMask;
    359         }
    360     }
    361 }
    362 
    363 int32_t
    364 UCharsTrie::getNextUChars(Appendable &out) const {
    365     const UChar *pos=pos_;
    366     if(pos==NULL) {
    367         return 0;
    368     }
    369     if(remainingMatchLength_>=0) {
    370         out.appendCodeUnit(*pos);  // Next unit of a pending linear-match node.
    371         return 1;
    372     }
    373     int32_t node=*pos++;
    374     if(node>=kMinValueLead) {
    375         if(node&kValueIsFinal) {
    376             return 0;
    377         } else {
    378             pos=skipNodeValue(pos, node);
    379             node&=kNodeTypeMask;
    380         }
    381     }
    382     if(node<kMinLinearMatch) {
    383         if(node==0) {
    384             node=*pos++;
    385         }
    386         out.reserveAppendCapacity(++node);
    387         getNextBranchUChars(pos, node, out);
    388         return node;
    389     } else {
    390         // First unit of the linear-match node.
    391         out.appendCodeUnit(*pos);
    392         return 1;
    393     }
    394 }
    395 
    396 void
    397 UCharsTrie::getNextBranchUChars(const UChar *pos, int32_t length, Appendable &out) {
    398     while(length>kMaxBranchLinearSubNodeLength) {
    399         ++pos;  // ignore the comparison unit
    400         getNextBranchUChars(jumpByDelta(pos), length>>1, out);
    401         length=length-(length>>1);
    402         pos=skipDelta(pos);
    403     }
    404     do {
    405         out.appendCodeUnit(*pos++);
    406         pos=skipValue(pos);
    407     } while(--length>1);
    408     out.appendCodeUnit(*pos);
    409 }
    410 
    411 U_NAMESPACE_END
    412