1 /* 2 ******************************************************************************* 3 * Copyright (C) 2010-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: ucharstrie.h 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2010nov14 12 * created by: Markus W. Scherer 13 */ 14 15 #include "unicode/utypes.h" 16 #include "unicode/appendable.h" 17 #include "unicode/ucharstrie.h" 18 #include "unicode/uobject.h" 19 #include "cmemory.h" 20 #include "uassert.h" 21 22 U_NAMESPACE_BEGIN 23 24 UCharsTrie::~UCharsTrie() { 25 uprv_free(ownedArray_); 26 } 27 28 UStringTrieResult 29 UCharsTrie::current() const { 30 const UChar *pos=pos_; 31 if(pos==NULL) { 32 return USTRINGTRIE_NO_MATCH; 33 } else { 34 int32_t node; 35 return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ? 36 valueResult(node) : USTRINGTRIE_NO_VALUE; 37 } 38 } 39 40 UStringTrieResult 41 UCharsTrie::branchNext(const UChar *pos, int32_t length, int32_t uchar) { 42 // Branch according to the current unit. 43 if(length==0) { 44 length=*pos++; 45 } 46 ++length; 47 // The length of the branch is the number of units to select from. 48 // The data structure encodes a binary search. 49 while(length>kMaxBranchLinearSubNodeLength) { 50 if(uchar<*pos++) { 51 length>>=1; 52 pos=jumpByDelta(pos); 53 } else { 54 length=length-(length>>1); 55 pos=skipDelta(pos); 56 } 57 } 58 // Drop down to linear search for the last few units. 59 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 60 // and divides length by 2. 61 do { 62 if(uchar==*pos++) { 63 UStringTrieResult result; 64 int32_t node=*pos; 65 if(node&kValueIsFinal) { 66 // Leave the final value for getValue() to read. 67 result=USTRINGTRIE_FINAL_VALUE; 68 } else { 69 // Use the non-final value as the jump delta. 70 ++pos; 71 // int32_t delta=readValue(pos, node); 72 int32_t delta; 73 if(node<kMinTwoUnitValueLead) { 74 delta=node; 75 } else if(node<kThreeUnitValueLead) { 76 delta=((node-kMinTwoUnitValueLead)<<16)|*pos++; 77 } else { 78 delta=(pos[0]<<16)|pos[1]; 79 pos+=2; 80 } 81 // end readValue() 82 pos+=delta; 83 node=*pos; 84 result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; 85 } 86 pos_=pos; 87 return result; 88 } 89 --length; 90 pos=skipValue(pos); 91 } while(length>1); 92 if(uchar==*pos++) { 93 pos_=pos; 94 int32_t node=*pos; 95 return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; 96 } else { 97 stop(); 98 return USTRINGTRIE_NO_MATCH; 99 } 100 } 101 102 UStringTrieResult 103 UCharsTrie::nextImpl(const UChar *pos, int32_t uchar) { 104 int32_t node=*pos++; 105 for(;;) { 106 if(node<kMinLinearMatch) { 107 return branchNext(pos, node, uchar); 108 } else if(node<kMinValueLead) { 109 // Match the first of length+1 units. 110 int32_t length=node-kMinLinearMatch; // Actual match length minus 1. 111 if(uchar==*pos++) { 112 remainingMatchLength_=--length; 113 pos_=pos; 114 return (length<0 && (node=*pos)>=kMinValueLead) ? 115 valueResult(node) : USTRINGTRIE_NO_VALUE; 116 } else { 117 // No match. 118 break; 119 } 120 } else if(node&kValueIsFinal) { 121 // No further matching units. 122 break; 123 } else { 124 // Skip intermediate value. 125 pos=skipNodeValue(pos, node); 126 node&=kNodeTypeMask; 127 } 128 } 129 stop(); 130 return USTRINGTRIE_NO_MATCH; 131 } 132 133 UStringTrieResult 134 UCharsTrie::next(int32_t uchar) { 135 const UChar *pos=pos_; 136 if(pos==NULL) { 137 return USTRINGTRIE_NO_MATCH; 138 } 139 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. 140 if(length>=0) { 141 // Remaining part of a linear-match node. 142 if(uchar==*pos++) { 143 remainingMatchLength_=--length; 144 pos_=pos; 145 int32_t node; 146 return (length<0 && (node=*pos)>=kMinValueLead) ? 147 valueResult(node) : USTRINGTRIE_NO_VALUE; 148 } else { 149 stop(); 150 return USTRINGTRIE_NO_MATCH; 151 } 152 } 153 return nextImpl(pos, uchar); 154 } 155 156 UStringTrieResult 157 UCharsTrie::next(const UChar *s, int32_t sLength) { 158 if(sLength<0 ? *s==0 : sLength==0) { 159 // Empty input. 160 return current(); 161 } 162 const UChar *pos=pos_; 163 if(pos==NULL) { 164 return USTRINGTRIE_NO_MATCH; 165 } 166 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. 167 for(;;) { 168 // Fetch the next input unit, if there is one. 169 // Continue a linear-match node without rechecking sLength<0. 170 int32_t uchar; 171 if(sLength<0) { 172 for(;;) { 173 if((uchar=*s++)==0) { 174 remainingMatchLength_=length; 175 pos_=pos; 176 int32_t node; 177 return (length<0 && (node=*pos)>=kMinValueLead) ? 178 valueResult(node) : USTRINGTRIE_NO_VALUE; 179 } 180 if(length<0) { 181 remainingMatchLength_=length; 182 break; 183 } 184 if(uchar!=*pos) { 185 stop(); 186 return USTRINGTRIE_NO_MATCH; 187 } 188 ++pos; 189 --length; 190 } 191 } else { 192 for(;;) { 193 if(sLength==0) { 194 remainingMatchLength_=length; 195 pos_=pos; 196 int32_t node; 197 return (length<0 && (node=*pos)>=kMinValueLead) ? 198 valueResult(node) : USTRINGTRIE_NO_VALUE; 199 } 200 uchar=*s++; 201 --sLength; 202 if(length<0) { 203 remainingMatchLength_=length; 204 break; 205 } 206 if(uchar!=*pos) { 207 stop(); 208 return USTRINGTRIE_NO_MATCH; 209 } 210 ++pos; 211 --length; 212 } 213 } 214 int32_t node=*pos++; 215 for(;;) { 216 if(node<kMinLinearMatch) { 217 UStringTrieResult result=branchNext(pos, node, uchar); 218 if(result==USTRINGTRIE_NO_MATCH) { 219 return USTRINGTRIE_NO_MATCH; 220 } 221 // Fetch the next input unit, if there is one. 222 if(sLength<0) { 223 if((uchar=*s++)==0) { 224 return result; 225 } 226 } else { 227 if(sLength==0) { 228 return result; 229 } 230 uchar=*s++; 231 --sLength; 232 } 233 if(result==USTRINGTRIE_FINAL_VALUE) { 234 // No further matching units. 235 stop(); 236 return USTRINGTRIE_NO_MATCH; 237 } 238 pos=pos_; // branchNext() advanced pos and wrote it to pos_ . 239 node=*pos++; 240 } else if(node<kMinValueLead) { 241 // Match length+1 units. 242 length=node-kMinLinearMatch; // Actual match length minus 1. 243 if(uchar!=*pos) { 244 stop(); 245 return USTRINGTRIE_NO_MATCH; 246 } 247 ++pos; 248 --length; 249 break; 250 } else if(node&kValueIsFinal) { 251 // No further matching units. 252 stop(); 253 return USTRINGTRIE_NO_MATCH; 254 } else { 255 // Skip intermediate value. 256 pos=skipNodeValue(pos, node); 257 node&=kNodeTypeMask; 258 } 259 } 260 } 261 } 262 263 const UChar * 264 UCharsTrie::findUniqueValueFromBranch(const UChar *pos, int32_t length, 265 UBool haveUniqueValue, int32_t &uniqueValue) { 266 while(length>kMaxBranchLinearSubNodeLength) { 267 ++pos; // ignore the comparison unit 268 if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) { 269 return NULL; 270 } 271 length=length-(length>>1); 272 pos=skipDelta(pos); 273 } 274 do { 275 ++pos; // ignore a comparison unit 276 // handle its value 277 int32_t node=*pos++; 278 UBool isFinal=(UBool)(node>>15); 279 node&=0x7fff; 280 int32_t value=readValue(pos, node); 281 pos=skipValue(pos, node); 282 if(isFinal) { 283 if(haveUniqueValue) { 284 if(value!=uniqueValue) { 285 return NULL; 286 } 287 } else { 288 uniqueValue=value; 289 haveUniqueValue=TRUE; 290 } 291 } else { 292 if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) { 293 return NULL; 294 } 295 haveUniqueValue=TRUE; 296 } 297 } while(--length>1); 298 return pos+1; // ignore the last comparison unit 299 } 300 301 UBool 302 UCharsTrie::findUniqueValue(const UChar *pos, UBool haveUniqueValue, int32_t &uniqueValue) { 303 int32_t node=*pos++; 304 for(;;) { 305 if(node<kMinLinearMatch) { 306 if(node==0) { 307 node=*pos++; 308 } 309 pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue); 310 if(pos==NULL) { 311 return FALSE; 312 } 313 haveUniqueValue=TRUE; 314 node=*pos++; 315 } else if(node<kMinValueLead) { 316 // linear-match node 317 pos+=node-kMinLinearMatch+1; // Ignore the match units. 318 node=*pos++; 319 } else { 320 UBool isFinal=(UBool)(node>>15); 321 int32_t value; 322 if(isFinal) { 323 value=readValue(pos, node&0x7fff); 324 } else { 325 value=readNodeValue(pos, node); 326 } 327 if(haveUniqueValue) { 328 if(value!=uniqueValue) { 329 return FALSE; 330 } 331 } else { 332 uniqueValue=value; 333 haveUniqueValue=TRUE; 334 } 335 if(isFinal) { 336 return TRUE; 337 } 338 pos=skipNodeValue(pos, node); 339 node&=kNodeTypeMask; 340 } 341 } 342 } 343 344 int32_t 345 UCharsTrie::getNextUChars(Appendable &out) const { 346 const UChar *pos=pos_; 347 if(pos==NULL) { 348 return 0; 349 } 350 if(remainingMatchLength_>=0) { 351 out.appendCodeUnit(*pos); // Next unit of a pending linear-match node. 352 return 1; 353 } 354 int32_t node=*pos++; 355 if(node>=kMinValueLead) { 356 if(node&kValueIsFinal) { 357 return 0; 358 } else { 359 pos=skipNodeValue(pos, node); 360 node&=kNodeTypeMask; 361 } 362 } 363 if(node<kMinLinearMatch) { 364 if(node==0) { 365 node=*pos++; 366 } 367 out.reserveAppendCapacity(++node); 368 getNextBranchUChars(pos, node, out); 369 return node; 370 } else { 371 // First unit of the linear-match node. 372 out.appendCodeUnit(*pos); 373 return 1; 374 } 375 } 376 377 void 378 UCharsTrie::getNextBranchUChars(const UChar *pos, int32_t length, Appendable &out) { 379 while(length>kMaxBranchLinearSubNodeLength) { 380 ++pos; // ignore the comparison unit 381 getNextBranchUChars(jumpByDelta(pos), length>>1, out); 382 length=length-(length>>1); 383 pos=skipDelta(pos); 384 } 385 do { 386 out.appendCodeUnit(*pos++); 387 pos=skipValue(pos); 388 } while(--length>1); 389 out.appendCodeUnit(*pos); 390 } 391 392 U_NAMESPACE_END 393