Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 2010-2011, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  bytestrie.cpp
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2010sep25
     12 *   created by: Markus W. Scherer
     13 */
     14 
     15 #include "unicode/utypes.h"
     16 #include "unicode/bytestream.h"
     17 #include "unicode/bytestrie.h"
     18 #include "unicode/uobject.h"
     19 #include "cmemory.h"
     20 #include "uassert.h"
     21 
     22 U_NAMESPACE_BEGIN
     23 
     24 BytesTrie::~BytesTrie() {
     25     uprv_free(ownedArray_);
     26 }
     27 
     28 // lead byte already shifted right by 1.
     29 int32_t
     30 BytesTrie::readValue(const uint8_t *pos, int32_t leadByte) {
     31     int32_t value;
     32     if(leadByte<kMinTwoByteValueLead) {
     33         value=leadByte-kMinOneByteValueLead;
     34     } else if(leadByte<kMinThreeByteValueLead) {
     35         value=((leadByte-kMinTwoByteValueLead)<<8)|*pos;
     36     } else if(leadByte<kFourByteValueLead) {
     37         value=((leadByte-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
     38     } else if(leadByte==kFourByteValueLead) {
     39         value=(pos[0]<<16)|(pos[1]<<8)|pos[2];
     40     } else {
     41         value=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
     42     }
     43     return value;
     44 }
     45 
     46 const uint8_t *
     47 BytesTrie::jumpByDelta(const uint8_t *pos) {
     48     int32_t delta=*pos++;
     49     if(delta<kMinTwoByteDeltaLead) {
     50         // nothing to do
     51     } else if(delta<kMinThreeByteDeltaLead) {
     52         delta=((delta-kMinTwoByteDeltaLead)<<8)|*pos++;
     53     } else if(delta<kFourByteDeltaLead) {
     54         delta=((delta-kMinThreeByteDeltaLead)<<16)|(pos[0]<<8)|pos[1];
     55         pos+=2;
     56     } else if(delta==kFourByteDeltaLead) {
     57         delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
     58         pos+=3;
     59     } else {
     60         delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
     61         pos+=4;
     62     }
     63     return pos+delta;
     64 }
     65 
     66 UStringTrieResult
     67 BytesTrie::current() const {
     68     const uint8_t *pos=pos_;
     69     if(pos==NULL) {
     70         return USTRINGTRIE_NO_MATCH;
     71     } else {
     72         int32_t node;
     73         return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ?
     74                 valueResult(node) : USTRINGTRIE_NO_VALUE;
     75     }
     76 }
     77 
     78 UStringTrieResult
     79 BytesTrie::branchNext(const uint8_t *pos, int32_t length, int32_t inByte) {
     80     // Branch according to the current byte.
     81     if(length==0) {
     82         length=*pos++;
     83     }
     84     ++length;
     85     // The length of the branch is the number of bytes to select from.
     86     // The data structure encodes a binary search.
     87     while(length>kMaxBranchLinearSubNodeLength) {
     88         if(inByte<*pos++) {
     89             length>>=1;
     90             pos=jumpByDelta(pos);
     91         } else {
     92             length=length-(length>>1);
     93             pos=skipDelta(pos);
     94         }
     95     }
     96     // Drop down to linear search for the last few bytes.
     97     // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
     98     // and divides length by 2.
     99     do {
    100         if(inByte==*pos++) {
    101             UStringTrieResult result;
    102             int32_t node=*pos;
    103             U_ASSERT(node>=kMinValueLead);
    104             if(node&kValueIsFinal) {
    105                 // Leave the final value for getValue() to read.
    106                 result=USTRINGTRIE_FINAL_VALUE;
    107             } else {
    108                 // Use the non-final value as the jump delta.
    109                 ++pos;
    110                 // int32_t delta=readValue(pos, node>>1);
    111                 node>>=1;
    112                 int32_t delta;
    113                 if(node<kMinTwoByteValueLead) {
    114                     delta=node-kMinOneByteValueLead;
    115                 } else if(node<kMinThreeByteValueLead) {
    116                     delta=((node-kMinTwoByteValueLead)<<8)|*pos++;
    117                 } else if(node<kFourByteValueLead) {
    118                     delta=((node-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
    119                     pos+=2;
    120                 } else if(node==kFourByteValueLead) {
    121                     delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
    122                     pos+=3;
    123                 } else {
    124                     delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
    125                     pos+=4;
    126                 }
    127                 // end readValue()
    128                 pos+=delta;
    129                 node=*pos;
    130                 result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
    131             }
    132             pos_=pos;
    133             return result;
    134         }
    135         --length;
    136         pos=skipValue(pos);
    137     } while(length>1);
    138     if(inByte==*pos++) {
    139         pos_=pos;
    140         int32_t node=*pos;
    141         return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
    142     } else {
    143         stop();
    144         return USTRINGTRIE_NO_MATCH;
    145     }
    146 }
    147 
    148 UStringTrieResult
    149 BytesTrie::nextImpl(const uint8_t *pos, int32_t inByte) {
    150     for(;;) {
    151         int32_t node=*pos++;
    152         if(node<kMinLinearMatch) {
    153             return branchNext(pos, node, inByte);
    154         } else if(node<kMinValueLead) {
    155             // Match the first of length+1 bytes.
    156             int32_t length=node-kMinLinearMatch;  // Actual match length minus 1.
    157             if(inByte==*pos++) {
    158                 remainingMatchLength_=--length;
    159                 pos_=pos;
    160                 return (length<0 && (node=*pos)>=kMinValueLead) ?
    161                         valueResult(node) : USTRINGTRIE_NO_VALUE;
    162             } else {
    163                 // No match.
    164                 break;
    165             }
    166         } else if(node&kValueIsFinal) {
    167             // No further matching bytes.
    168             break;
    169         } else {
    170             // Skip intermediate value.
    171             pos=skipValue(pos, node);
    172             // The next node must not also be a value node.
    173             U_ASSERT(*pos<kMinValueLead);
    174         }
    175     }
    176     stop();
    177     return USTRINGTRIE_NO_MATCH;
    178 }
    179 
    180 UStringTrieResult
    181 BytesTrie::next(int32_t inByte) {
    182     const uint8_t *pos=pos_;
    183     if(pos==NULL) {
    184         return USTRINGTRIE_NO_MATCH;
    185     }
    186     if(inByte<0) {
    187         inByte+=0x100;
    188     }
    189     int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
    190     if(length>=0) {
    191         // Remaining part of a linear-match node.
    192         if(inByte==*pos++) {
    193             remainingMatchLength_=--length;
    194             pos_=pos;
    195             int32_t node;
    196             return (length<0 && (node=*pos)>=kMinValueLead) ?
    197                     valueResult(node) : USTRINGTRIE_NO_VALUE;
    198         } else {
    199             stop();
    200             return USTRINGTRIE_NO_MATCH;
    201         }
    202     }
    203     return nextImpl(pos, inByte);
    204 }
    205 
    206 UStringTrieResult
    207 BytesTrie::next(const char *s, int32_t sLength) {
    208     if(sLength<0 ? *s==0 : sLength==0) {
    209         // Empty input.
    210         return current();
    211     }
    212     const uint8_t *pos=pos_;
    213     if(pos==NULL) {
    214         return USTRINGTRIE_NO_MATCH;
    215     }
    216     int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
    217     for(;;) {
    218         // Fetch the next input byte, if there is one.
    219         // Continue a linear-match node without rechecking sLength<0.
    220         int32_t inByte;
    221         if(sLength<0) {
    222             for(;;) {
    223                 if((inByte=*s++)==0) {
    224                     remainingMatchLength_=length;
    225                     pos_=pos;
    226                     int32_t node;
    227                     return (length<0 && (node=*pos)>=kMinValueLead) ?
    228                             valueResult(node) : USTRINGTRIE_NO_VALUE;
    229                 }
    230                 if(length<0) {
    231                     remainingMatchLength_=length;
    232                     break;
    233                 }
    234                 if(inByte!=*pos) {
    235                     stop();
    236                     return USTRINGTRIE_NO_MATCH;
    237                 }
    238                 ++pos;
    239                 --length;
    240             }
    241         } else {
    242             for(;;) {
    243                 if(sLength==0) {
    244                     remainingMatchLength_=length;
    245                     pos_=pos;
    246                     int32_t node;
    247                     return (length<0 && (node=*pos)>=kMinValueLead) ?
    248                             valueResult(node) : USTRINGTRIE_NO_VALUE;
    249                 }
    250                 inByte=*s++;
    251                 --sLength;
    252                 if(length<0) {
    253                     remainingMatchLength_=length;
    254                     break;
    255                 }
    256                 if(inByte!=*pos) {
    257                     stop();
    258                     return USTRINGTRIE_NO_MATCH;
    259                 }
    260                 ++pos;
    261                 --length;
    262             }
    263         }
    264         for(;;) {
    265             int32_t node=*pos++;
    266             if(node<kMinLinearMatch) {
    267                 UStringTrieResult result=branchNext(pos, node, inByte);
    268                 if(result==USTRINGTRIE_NO_MATCH) {
    269                     return USTRINGTRIE_NO_MATCH;
    270                 }
    271                 // Fetch the next input byte, if there is one.
    272                 if(sLength<0) {
    273                     if((inByte=*s++)==0) {
    274                         return result;
    275                     }
    276                 } else {
    277                     if(sLength==0) {
    278                         return result;
    279                     }
    280                     inByte=*s++;
    281                     --sLength;
    282                 }
    283                 if(result==USTRINGTRIE_FINAL_VALUE) {
    284                     // No further matching bytes.
    285                     stop();
    286                     return USTRINGTRIE_NO_MATCH;
    287                 }
    288                 pos=pos_;  // branchNext() advanced pos and wrote it to pos_ .
    289             } else if(node<kMinValueLead) {
    290                 // Match length+1 bytes.
    291                 length=node-kMinLinearMatch;  // Actual match length minus 1.
    292                 if(inByte!=*pos) {
    293                     stop();
    294                     return USTRINGTRIE_NO_MATCH;
    295                 }
    296                 ++pos;
    297                 --length;
    298                 break;
    299             } else if(node&kValueIsFinal) {
    300                 // No further matching bytes.
    301                 stop();
    302                 return USTRINGTRIE_NO_MATCH;
    303             } else {
    304                 // Skip intermediate value.
    305                 pos=skipValue(pos, node);
    306                 // The next node must not also be a value node.
    307                 U_ASSERT(*pos<kMinValueLead);
    308             }
    309         }
    310     }
    311 }
    312 
    313 const uint8_t *
    314 BytesTrie::findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
    315                                      UBool haveUniqueValue, int32_t &uniqueValue) {
    316     while(length>kMaxBranchLinearSubNodeLength) {
    317         ++pos;  // ignore the comparison byte
    318         if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
    319             return NULL;
    320         }
    321         length=length-(length>>1);
    322         pos=skipDelta(pos);
    323     }
    324     do {
    325         ++pos;  // ignore a comparison byte
    326         // handle its value
    327         int32_t node=*pos++;
    328         UBool isFinal=(UBool)(node&kValueIsFinal);
    329         int32_t value=readValue(pos, node>>1);
    330         pos=skipValue(pos, node);
    331         if(isFinal) {
    332             if(haveUniqueValue) {
    333                 if(value!=uniqueValue) {
    334                     return NULL;
    335                 }
    336             } else {
    337                 uniqueValue=value;
    338                 haveUniqueValue=TRUE;
    339             }
    340         } else {
    341             if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
    342                 return NULL;
    343             }
    344             haveUniqueValue=TRUE;
    345         }
    346     } while(--length>1);
    347     return pos+1;  // ignore the last comparison byte
    348 }
    349 
    350 UBool
    351 BytesTrie::findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
    352     for(;;) {
    353         int32_t node=*pos++;
    354         if(node<kMinLinearMatch) {
    355             if(node==0) {
    356                 node=*pos++;
    357             }
    358             pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);
    359             if(pos==NULL) {
    360                 return FALSE;
    361             }
    362             haveUniqueValue=TRUE;
    363         } else if(node<kMinValueLead) {
    364             // linear-match node
    365             pos+=node-kMinLinearMatch+1;  // Ignore the match bytes.
    366         } else {
    367             UBool isFinal=(UBool)(node&kValueIsFinal);
    368             int32_t value=readValue(pos, node>>1);
    369             if(haveUniqueValue) {
    370                 if(value!=uniqueValue) {
    371                     return FALSE;
    372                 }
    373             } else {
    374                 uniqueValue=value;
    375                 haveUniqueValue=TRUE;
    376             }
    377             if(isFinal) {
    378                 return TRUE;
    379             }
    380             pos=skipValue(pos, node);
    381         }
    382     }
    383 }
    384 
    385 int32_t
    386 BytesTrie::getNextBytes(ByteSink &out) const {
    387     const uint8_t *pos=pos_;
    388     if(pos==NULL) {
    389         return 0;
    390     }
    391     if(remainingMatchLength_>=0) {
    392         append(out, *pos);  // Next byte of a pending linear-match node.
    393         return 1;
    394     }
    395     int32_t node=*pos++;
    396     if(node>=kMinValueLead) {
    397         if(node&kValueIsFinal) {
    398             return 0;
    399         } else {
    400             pos=skipValue(pos, node);
    401             node=*pos++;
    402             U_ASSERT(node<kMinValueLead);
    403         }
    404     }
    405     if(node<kMinLinearMatch) {
    406         if(node==0) {
    407             node=*pos++;
    408         }
    409         getNextBranchBytes(pos, ++node, out);
    410         return node;
    411     } else {
    412         // First byte of the linear-match node.
    413         append(out, *pos);
    414         return 1;
    415     }
    416 }
    417 
    418 void
    419 BytesTrie::getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out) {
    420     while(length>kMaxBranchLinearSubNodeLength) {
    421         ++pos;  // ignore the comparison byte
    422         getNextBranchBytes(jumpByDelta(pos), length>>1, out);
    423         length=length-(length>>1);
    424         pos=skipDelta(pos);
    425     }
    426     do {
    427         append(out, *pos++);
    428         pos=skipValue(pos);
    429     } while(--length>1);
    430     append(out, *pos);
    431 }
    432 
    433 void
    434 BytesTrie::append(ByteSink &out, int c) {
    435     char ch=(char)c;
    436     out.Append(&ch, 1);
    437 }
    438 
    439 U_NAMESPACE_END
    440