1 /* 2 ******************************************************************************* 3 * Copyright (C) 2010-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: bytestrie.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2010sep25 12 * created by: Markus W. Scherer 13 */ 14 15 #include "unicode/utypes.h" 16 #include "unicode/bytestream.h" 17 #include "unicode/bytestrie.h" 18 #include "unicode/uobject.h" 19 #include "cmemory.h" 20 #include "uassert.h" 21 22 U_NAMESPACE_BEGIN 23 24 BytesTrie::~BytesTrie() { 25 uprv_free(ownedArray_); 26 } 27 28 // lead byte already shifted right by 1. 29 int32_t 30 BytesTrie::readValue(const uint8_t *pos, int32_t leadByte) { 31 int32_t value; 32 if(leadByte<kMinTwoByteValueLead) { 33 value=leadByte-kMinOneByteValueLead; 34 } else if(leadByte<kMinThreeByteValueLead) { 35 value=((leadByte-kMinTwoByteValueLead)<<8)|*pos; 36 } else if(leadByte<kFourByteValueLead) { 37 value=((leadByte-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1]; 38 } else if(leadByte==kFourByteValueLead) { 39 value=(pos[0]<<16)|(pos[1]<<8)|pos[2]; 40 } else { 41 value=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3]; 42 } 43 return value; 44 } 45 46 const uint8_t * 47 BytesTrie::jumpByDelta(const uint8_t *pos) { 48 int32_t delta=*pos++; 49 if(delta<kMinTwoByteDeltaLead) { 50 // nothing to do 51 } else if(delta<kMinThreeByteDeltaLead) { 52 delta=((delta-kMinTwoByteDeltaLead)<<8)|*pos++; 53 } else if(delta<kFourByteDeltaLead) { 54 delta=((delta-kMinThreeByteDeltaLead)<<16)|(pos[0]<<8)|pos[1]; 55 pos+=2; 56 } else if(delta==kFourByteDeltaLead) { 57 delta=(pos[0]<<16)|(pos[1]<<8)|pos[2]; 58 pos+=3; 59 } else { 60 delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3]; 61 pos+=4; 62 } 63 return pos+delta; 64 } 65 66 UStringTrieResult 67 BytesTrie::current() const { 68 const uint8_t *pos=pos_; 69 if(pos==NULL) { 70 return USTRINGTRIE_NO_MATCH; 71 } else { 72 int32_t node; 73 return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ? 74 valueResult(node) : USTRINGTRIE_NO_VALUE; 75 } 76 } 77 78 UStringTrieResult 79 BytesTrie::branchNext(const uint8_t *pos, int32_t length, int32_t inByte) { 80 // Branch according to the current byte. 81 if(length==0) { 82 length=*pos++; 83 } 84 ++length; 85 // The length of the branch is the number of bytes to select from. 86 // The data structure encodes a binary search. 87 while(length>kMaxBranchLinearSubNodeLength) { 88 if(inByte<*pos++) { 89 length>>=1; 90 pos=jumpByDelta(pos); 91 } else { 92 length=length-(length>>1); 93 pos=skipDelta(pos); 94 } 95 } 96 // Drop down to linear search for the last few bytes. 97 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 98 // and divides length by 2. 99 do { 100 if(inByte==*pos++) { 101 UStringTrieResult result; 102 int32_t node=*pos; 103 U_ASSERT(node>=kMinValueLead); 104 if(node&kValueIsFinal) { 105 // Leave the final value for getValue() to read. 106 result=USTRINGTRIE_FINAL_VALUE; 107 } else { 108 // Use the non-final value as the jump delta. 109 ++pos; 110 // int32_t delta=readValue(pos, node>>1); 111 node>>=1; 112 int32_t delta; 113 if(node<kMinTwoByteValueLead) { 114 delta=node-kMinOneByteValueLead; 115 } else if(node<kMinThreeByteValueLead) { 116 delta=((node-kMinTwoByteValueLead)<<8)|*pos++; 117 } else if(node<kFourByteValueLead) { 118 delta=((node-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1]; 119 pos+=2; 120 } else if(node==kFourByteValueLead) { 121 delta=(pos[0]<<16)|(pos[1]<<8)|pos[2]; 122 pos+=3; 123 } else { 124 delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3]; 125 pos+=4; 126 } 127 // end readValue() 128 pos+=delta; 129 node=*pos; 130 result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; 131 } 132 pos_=pos; 133 return result; 134 } 135 --length; 136 pos=skipValue(pos); 137 } while(length>1); 138 if(inByte==*pos++) { 139 pos_=pos; 140 int32_t node=*pos; 141 return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; 142 } else { 143 stop(); 144 return USTRINGTRIE_NO_MATCH; 145 } 146 } 147 148 UStringTrieResult 149 BytesTrie::nextImpl(const uint8_t *pos, int32_t inByte) { 150 for(;;) { 151 int32_t node=*pos++; 152 if(node<kMinLinearMatch) { 153 return branchNext(pos, node, inByte); 154 } else if(node<kMinValueLead) { 155 // Match the first of length+1 bytes. 156 int32_t length=node-kMinLinearMatch; // Actual match length minus 1. 157 if(inByte==*pos++) { 158 remainingMatchLength_=--length; 159 pos_=pos; 160 return (length<0 && (node=*pos)>=kMinValueLead) ? 161 valueResult(node) : USTRINGTRIE_NO_VALUE; 162 } else { 163 // No match. 164 break; 165 } 166 } else if(node&kValueIsFinal) { 167 // No further matching bytes. 168 break; 169 } else { 170 // Skip intermediate value. 171 pos=skipValue(pos, node); 172 // The next node must not also be a value node. 173 U_ASSERT(*pos<kMinValueLead); 174 } 175 } 176 stop(); 177 return USTRINGTRIE_NO_MATCH; 178 } 179 180 UStringTrieResult 181 BytesTrie::next(int32_t inByte) { 182 const uint8_t *pos=pos_; 183 if(pos==NULL) { 184 return USTRINGTRIE_NO_MATCH; 185 } 186 if(inByte<0) { 187 inByte+=0x100; 188 } 189 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. 190 if(length>=0) { 191 // Remaining part of a linear-match node. 192 if(inByte==*pos++) { 193 remainingMatchLength_=--length; 194 pos_=pos; 195 int32_t node; 196 return (length<0 && (node=*pos)>=kMinValueLead) ? 197 valueResult(node) : USTRINGTRIE_NO_VALUE; 198 } else { 199 stop(); 200 return USTRINGTRIE_NO_MATCH; 201 } 202 } 203 return nextImpl(pos, inByte); 204 } 205 206 UStringTrieResult 207 BytesTrie::next(const char *s, int32_t sLength) { 208 if(sLength<0 ? *s==0 : sLength==0) { 209 // Empty input. 210 return current(); 211 } 212 const uint8_t *pos=pos_; 213 if(pos==NULL) { 214 return USTRINGTRIE_NO_MATCH; 215 } 216 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. 217 for(;;) { 218 // Fetch the next input byte, if there is one. 219 // Continue a linear-match node without rechecking sLength<0. 220 int32_t inByte; 221 if(sLength<0) { 222 for(;;) { 223 if((inByte=*s++)==0) { 224 remainingMatchLength_=length; 225 pos_=pos; 226 int32_t node; 227 return (length<0 && (node=*pos)>=kMinValueLead) ? 228 valueResult(node) : USTRINGTRIE_NO_VALUE; 229 } 230 if(length<0) { 231 remainingMatchLength_=length; 232 break; 233 } 234 if(inByte!=*pos) { 235 stop(); 236 return USTRINGTRIE_NO_MATCH; 237 } 238 ++pos; 239 --length; 240 } 241 } else { 242 for(;;) { 243 if(sLength==0) { 244 remainingMatchLength_=length; 245 pos_=pos; 246 int32_t node; 247 return (length<0 && (node=*pos)>=kMinValueLead) ? 248 valueResult(node) : USTRINGTRIE_NO_VALUE; 249 } 250 inByte=*s++; 251 --sLength; 252 if(length<0) { 253 remainingMatchLength_=length; 254 break; 255 } 256 if(inByte!=*pos) { 257 stop(); 258 return USTRINGTRIE_NO_MATCH; 259 } 260 ++pos; 261 --length; 262 } 263 } 264 for(;;) { 265 int32_t node=*pos++; 266 if(node<kMinLinearMatch) { 267 UStringTrieResult result=branchNext(pos, node, inByte); 268 if(result==USTRINGTRIE_NO_MATCH) { 269 return USTRINGTRIE_NO_MATCH; 270 } 271 // Fetch the next input byte, if there is one. 272 if(sLength<0) { 273 if((inByte=*s++)==0) { 274 return result; 275 } 276 } else { 277 if(sLength==0) { 278 return result; 279 } 280 inByte=*s++; 281 --sLength; 282 } 283 if(result==USTRINGTRIE_FINAL_VALUE) { 284 // No further matching bytes. 285 stop(); 286 return USTRINGTRIE_NO_MATCH; 287 } 288 pos=pos_; // branchNext() advanced pos and wrote it to pos_ . 289 } else if(node<kMinValueLead) { 290 // Match length+1 bytes. 291 length=node-kMinLinearMatch; // Actual match length minus 1. 292 if(inByte!=*pos) { 293 stop(); 294 return USTRINGTRIE_NO_MATCH; 295 } 296 ++pos; 297 --length; 298 break; 299 } else if(node&kValueIsFinal) { 300 // No further matching bytes. 301 stop(); 302 return USTRINGTRIE_NO_MATCH; 303 } else { 304 // Skip intermediate value. 305 pos=skipValue(pos, node); 306 // The next node must not also be a value node. 307 U_ASSERT(*pos<kMinValueLead); 308 } 309 } 310 } 311 } 312 313 const uint8_t * 314 BytesTrie::findUniqueValueFromBranch(const uint8_t *pos, int32_t length, 315 UBool haveUniqueValue, int32_t &uniqueValue) { 316 while(length>kMaxBranchLinearSubNodeLength) { 317 ++pos; // ignore the comparison byte 318 if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) { 319 return NULL; 320 } 321 length=length-(length>>1); 322 pos=skipDelta(pos); 323 } 324 do { 325 ++pos; // ignore a comparison byte 326 // handle its value 327 int32_t node=*pos++; 328 UBool isFinal=(UBool)(node&kValueIsFinal); 329 int32_t value=readValue(pos, node>>1); 330 pos=skipValue(pos, node); 331 if(isFinal) { 332 if(haveUniqueValue) { 333 if(value!=uniqueValue) { 334 return NULL; 335 } 336 } else { 337 uniqueValue=value; 338 haveUniqueValue=TRUE; 339 } 340 } else { 341 if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) { 342 return NULL; 343 } 344 haveUniqueValue=TRUE; 345 } 346 } while(--length>1); 347 return pos+1; // ignore the last comparison byte 348 } 349 350 UBool 351 BytesTrie::findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue) { 352 for(;;) { 353 int32_t node=*pos++; 354 if(node<kMinLinearMatch) { 355 if(node==0) { 356 node=*pos++; 357 } 358 pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue); 359 if(pos==NULL) { 360 return FALSE; 361 } 362 haveUniqueValue=TRUE; 363 } else if(node<kMinValueLead) { 364 // linear-match node 365 pos+=node-kMinLinearMatch+1; // Ignore the match bytes. 366 } else { 367 UBool isFinal=(UBool)(node&kValueIsFinal); 368 int32_t value=readValue(pos, node>>1); 369 if(haveUniqueValue) { 370 if(value!=uniqueValue) { 371 return FALSE; 372 } 373 } else { 374 uniqueValue=value; 375 haveUniqueValue=TRUE; 376 } 377 if(isFinal) { 378 return TRUE; 379 } 380 pos=skipValue(pos, node); 381 } 382 } 383 } 384 385 int32_t 386 BytesTrie::getNextBytes(ByteSink &out) const { 387 const uint8_t *pos=pos_; 388 if(pos==NULL) { 389 return 0; 390 } 391 if(remainingMatchLength_>=0) { 392 append(out, *pos); // Next byte of a pending linear-match node. 393 return 1; 394 } 395 int32_t node=*pos++; 396 if(node>=kMinValueLead) { 397 if(node&kValueIsFinal) { 398 return 0; 399 } else { 400 pos=skipValue(pos, node); 401 node=*pos++; 402 U_ASSERT(node<kMinValueLead); 403 } 404 } 405 if(node<kMinLinearMatch) { 406 if(node==0) { 407 node=*pos++; 408 } 409 getNextBranchBytes(pos, ++node, out); 410 return node; 411 } else { 412 // First byte of the linear-match node. 413 append(out, *pos); 414 return 1; 415 } 416 } 417 418 void 419 BytesTrie::getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out) { 420 while(length>kMaxBranchLinearSubNodeLength) { 421 ++pos; // ignore the comparison byte 422 getNextBranchBytes(jumpByDelta(pos), length>>1, out); 423 length=length-(length>>1); 424 pos=skipDelta(pos); 425 } 426 do { 427 append(out, *pos++); 428 pos=skipValue(pos); 429 } while(--length>1); 430 append(out, *pos); 431 } 432 433 void 434 BytesTrie::append(ByteSink &out, int c) { 435 char ch=(char)c; 436 out.Append(&ch, 1); 437 } 438 439 U_NAMESPACE_END 440