1 /* 2 ******************************************************************************* 3 * Copyright (C) 2010-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: ucharstrie.h 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2010nov14 12 * created by: Markus W. Scherer 13 */ 14 15 #include "unicode/utypes.h" 16 #include "unicode/appendable.h" 17 #include "unicode/ucharstrie.h" 18 #include "unicode/uobject.h" 19 #include "unicode/utf16.h" 20 #include "cmemory.h" 21 #include "uassert.h" 22 23 U_NAMESPACE_BEGIN 24 25 UCharsTrie::~UCharsTrie() { 26 uprv_free(ownedArray_); 27 } 28 29 UStringTrieResult 30 UCharsTrie::current() const { 31 const UChar *pos=pos_; 32 if(pos==NULL) { 33 return USTRINGTRIE_NO_MATCH; 34 } else { 35 int32_t node; 36 return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ? 37 valueResult(node) : USTRINGTRIE_NO_VALUE; 38 } 39 } 40 41 UStringTrieResult 42 UCharsTrie::firstForCodePoint(UChar32 cp) { 43 return cp<=0xffff ? 44 first(cp) : 45 (USTRINGTRIE_HAS_NEXT(first(U16_LEAD(cp))) ? 46 next(U16_TRAIL(cp)) : 47 USTRINGTRIE_NO_MATCH); 48 } 49 50 UStringTrieResult 51 UCharsTrie::nextForCodePoint(UChar32 cp) { 52 return cp<=0xffff ? 53 next(cp) : 54 (USTRINGTRIE_HAS_NEXT(next(U16_LEAD(cp))) ? 55 next(U16_TRAIL(cp)) : 56 USTRINGTRIE_NO_MATCH); 57 } 58 59 UStringTrieResult 60 UCharsTrie::branchNext(const UChar *pos, int32_t length, int32_t uchar) { 61 // Branch according to the current unit. 62 if(length==0) { 63 length=*pos++; 64 } 65 ++length; 66 // The length of the branch is the number of units to select from. 67 // The data structure encodes a binary search. 68 while(length>kMaxBranchLinearSubNodeLength) { 69 if(uchar<*pos++) { 70 length>>=1; 71 pos=jumpByDelta(pos); 72 } else { 73 length=length-(length>>1); 74 pos=skipDelta(pos); 75 } 76 } 77 // Drop down to linear search for the last few units. 78 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 79 // and divides length by 2. 80 do { 81 if(uchar==*pos++) { 82 UStringTrieResult result; 83 int32_t node=*pos; 84 if(node&kValueIsFinal) { 85 // Leave the final value for getValue() to read. 86 result=USTRINGTRIE_FINAL_VALUE; 87 } else { 88 // Use the non-final value as the jump delta. 89 ++pos; 90 // int32_t delta=readValue(pos, node); 91 int32_t delta; 92 if(node<kMinTwoUnitValueLead) { 93 delta=node; 94 } else if(node<kThreeUnitValueLead) { 95 delta=((node-kMinTwoUnitValueLead)<<16)|*pos++; 96 } else { 97 delta=(pos[0]<<16)|pos[1]; 98 pos+=2; 99 } 100 // end readValue() 101 pos+=delta; 102 node=*pos; 103 result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; 104 } 105 pos_=pos; 106 return result; 107 } 108 --length; 109 pos=skipValue(pos); 110 } while(length>1); 111 if(uchar==*pos++) { 112 pos_=pos; 113 int32_t node=*pos; 114 return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; 115 } else { 116 stop(); 117 return USTRINGTRIE_NO_MATCH; 118 } 119 } 120 121 UStringTrieResult 122 UCharsTrie::nextImpl(const UChar *pos, int32_t uchar) { 123 int32_t node=*pos++; 124 for(;;) { 125 if(node<kMinLinearMatch) { 126 return branchNext(pos, node, uchar); 127 } else if(node<kMinValueLead) { 128 // Match the first of length+1 units. 129 int32_t length=node-kMinLinearMatch; // Actual match length minus 1. 130 if(uchar==*pos++) { 131 remainingMatchLength_=--length; 132 pos_=pos; 133 return (length<0 && (node=*pos)>=kMinValueLead) ? 134 valueResult(node) : USTRINGTRIE_NO_VALUE; 135 } else { 136 // No match. 137 break; 138 } 139 } else if(node&kValueIsFinal) { 140 // No further matching units. 141 break; 142 } else { 143 // Skip intermediate value. 144 pos=skipNodeValue(pos, node); 145 node&=kNodeTypeMask; 146 } 147 } 148 stop(); 149 return USTRINGTRIE_NO_MATCH; 150 } 151 152 UStringTrieResult 153 UCharsTrie::next(int32_t uchar) { 154 const UChar *pos=pos_; 155 if(pos==NULL) { 156 return USTRINGTRIE_NO_MATCH; 157 } 158 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. 159 if(length>=0) { 160 // Remaining part of a linear-match node. 161 if(uchar==*pos++) { 162 remainingMatchLength_=--length; 163 pos_=pos; 164 int32_t node; 165 return (length<0 && (node=*pos)>=kMinValueLead) ? 166 valueResult(node) : USTRINGTRIE_NO_VALUE; 167 } else { 168 stop(); 169 return USTRINGTRIE_NO_MATCH; 170 } 171 } 172 return nextImpl(pos, uchar); 173 } 174 175 UStringTrieResult 176 UCharsTrie::next(const UChar *s, int32_t sLength) { 177 if(sLength<0 ? *s==0 : sLength==0) { 178 // Empty input. 179 return current(); 180 } 181 const UChar *pos=pos_; 182 if(pos==NULL) { 183 return USTRINGTRIE_NO_MATCH; 184 } 185 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. 186 for(;;) { 187 // Fetch the next input unit, if there is one. 188 // Continue a linear-match node without rechecking sLength<0. 189 int32_t uchar; 190 if(sLength<0) { 191 for(;;) { 192 if((uchar=*s++)==0) { 193 remainingMatchLength_=length; 194 pos_=pos; 195 int32_t node; 196 return (length<0 && (node=*pos)>=kMinValueLead) ? 197 valueResult(node) : USTRINGTRIE_NO_VALUE; 198 } 199 if(length<0) { 200 remainingMatchLength_=length; 201 break; 202 } 203 if(uchar!=*pos) { 204 stop(); 205 return USTRINGTRIE_NO_MATCH; 206 } 207 ++pos; 208 --length; 209 } 210 } else { 211 for(;;) { 212 if(sLength==0) { 213 remainingMatchLength_=length; 214 pos_=pos; 215 int32_t node; 216 return (length<0 && (node=*pos)>=kMinValueLead) ? 217 valueResult(node) : USTRINGTRIE_NO_VALUE; 218 } 219 uchar=*s++; 220 --sLength; 221 if(length<0) { 222 remainingMatchLength_=length; 223 break; 224 } 225 if(uchar!=*pos) { 226 stop(); 227 return USTRINGTRIE_NO_MATCH; 228 } 229 ++pos; 230 --length; 231 } 232 } 233 int32_t node=*pos++; 234 for(;;) { 235 if(node<kMinLinearMatch) { 236 UStringTrieResult result=branchNext(pos, node, uchar); 237 if(result==USTRINGTRIE_NO_MATCH) { 238 return USTRINGTRIE_NO_MATCH; 239 } 240 // Fetch the next input unit, if there is one. 241 if(sLength<0) { 242 if((uchar=*s++)==0) { 243 return result; 244 } 245 } else { 246 if(sLength==0) { 247 return result; 248 } 249 uchar=*s++; 250 --sLength; 251 } 252 if(result==USTRINGTRIE_FINAL_VALUE) { 253 // No further matching units. 254 stop(); 255 return USTRINGTRIE_NO_MATCH; 256 } 257 pos=pos_; // branchNext() advanced pos and wrote it to pos_ . 258 node=*pos++; 259 } else if(node<kMinValueLead) { 260 // Match length+1 units. 261 length=node-kMinLinearMatch; // Actual match length minus 1. 262 if(uchar!=*pos) { 263 stop(); 264 return USTRINGTRIE_NO_MATCH; 265 } 266 ++pos; 267 --length; 268 break; 269 } else if(node&kValueIsFinal) { 270 // No further matching units. 271 stop(); 272 return USTRINGTRIE_NO_MATCH; 273 } else { 274 // Skip intermediate value. 275 pos=skipNodeValue(pos, node); 276 node&=kNodeTypeMask; 277 } 278 } 279 } 280 } 281 282 const UChar * 283 UCharsTrie::findUniqueValueFromBranch(const UChar *pos, int32_t length, 284 UBool haveUniqueValue, int32_t &uniqueValue) { 285 while(length>kMaxBranchLinearSubNodeLength) { 286 ++pos; // ignore the comparison unit 287 if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) { 288 return NULL; 289 } 290 length=length-(length>>1); 291 pos=skipDelta(pos); 292 } 293 do { 294 ++pos; // ignore a comparison unit 295 // handle its value 296 int32_t node=*pos++; 297 UBool isFinal=(UBool)(node>>15); 298 node&=0x7fff; 299 int32_t value=readValue(pos, node); 300 pos=skipValue(pos, node); 301 if(isFinal) { 302 if(haveUniqueValue) { 303 if(value!=uniqueValue) { 304 return NULL; 305 } 306 } else { 307 uniqueValue=value; 308 haveUniqueValue=TRUE; 309 } 310 } else { 311 if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) { 312 return NULL; 313 } 314 haveUniqueValue=TRUE; 315 } 316 } while(--length>1); 317 return pos+1; // ignore the last comparison unit 318 } 319 320 UBool 321 UCharsTrie::findUniqueValue(const UChar *pos, UBool haveUniqueValue, int32_t &uniqueValue) { 322 int32_t node=*pos++; 323 for(;;) { 324 if(node<kMinLinearMatch) { 325 if(node==0) { 326 node=*pos++; 327 } 328 pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue); 329 if(pos==NULL) { 330 return FALSE; 331 } 332 haveUniqueValue=TRUE; 333 node=*pos++; 334 } else if(node<kMinValueLead) { 335 // linear-match node 336 pos+=node-kMinLinearMatch+1; // Ignore the match units. 337 node=*pos++; 338 } else { 339 UBool isFinal=(UBool)(node>>15); 340 int32_t value; 341 if(isFinal) { 342 value=readValue(pos, node&0x7fff); 343 } else { 344 value=readNodeValue(pos, node); 345 } 346 if(haveUniqueValue) { 347 if(value!=uniqueValue) { 348 return FALSE; 349 } 350 } else { 351 uniqueValue=value; 352 haveUniqueValue=TRUE; 353 } 354 if(isFinal) { 355 return TRUE; 356 } 357 pos=skipNodeValue(pos, node); 358 node&=kNodeTypeMask; 359 } 360 } 361 } 362 363 int32_t 364 UCharsTrie::getNextUChars(Appendable &out) const { 365 const UChar *pos=pos_; 366 if(pos==NULL) { 367 return 0; 368 } 369 if(remainingMatchLength_>=0) { 370 out.appendCodeUnit(*pos); // Next unit of a pending linear-match node. 371 return 1; 372 } 373 int32_t node=*pos++; 374 if(node>=kMinValueLead) { 375 if(node&kValueIsFinal) { 376 return 0; 377 } else { 378 pos=skipNodeValue(pos, node); 379 node&=kNodeTypeMask; 380 } 381 } 382 if(node<kMinLinearMatch) { 383 if(node==0) { 384 node=*pos++; 385 } 386 out.reserveAppendCapacity(++node); 387 getNextBranchUChars(pos, node, out); 388 return node; 389 } else { 390 // First unit of the linear-match node. 391 out.appendCodeUnit(*pos); 392 return 1; 393 } 394 } 395 396 void 397 UCharsTrie::getNextBranchUChars(const UChar *pos, int32_t length, Appendable &out) { 398 while(length>kMaxBranchLinearSubNodeLength) { 399 ++pos; // ignore the comparison unit 400 getNextBranchUChars(jumpByDelta(pos), length>>1, out); 401 length=length-(length>>1); 402 pos=skipDelta(pos); 403 } 404 do { 405 out.appendCodeUnit(*pos++); 406 pos=skipValue(pos); 407 } while(--length>1); 408 out.appendCodeUnit(*pos); 409 } 410 411 U_NAMESPACE_END 412