1 /* 2 ******************************************************************************* 3 * Copyright (C) 2010-2012, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: stringtriebuilder.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2010dec24 12 * created by: Markus W. Scherer 13 */ 14 15 #include "utypeinfo.h" // for 'typeid' to work 16 #include "unicode/utypes.h" 17 #include "unicode/stringtriebuilder.h" 18 #include "uassert.h" 19 #include "uhash.h" 20 21 U_CDECL_BEGIN 22 23 static int32_t U_CALLCONV 24 hashStringTrieNode(const UHashTok key) { 25 return icu::StringTrieBuilder::hashNode(key.pointer); 26 } 27 28 static UBool U_CALLCONV 29 equalStringTrieNodes(const UHashTok key1, const UHashTok key2) { 30 return icu::StringTrieBuilder::equalNodes(key1.pointer, key2.pointer); 31 } 32 33 U_CDECL_END 34 35 U_NAMESPACE_BEGIN 36 37 StringTrieBuilder::StringTrieBuilder() : nodes(NULL) {} 38 39 StringTrieBuilder::~StringTrieBuilder() { 40 deleteCompactBuilder(); 41 } 42 43 void 44 StringTrieBuilder::createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode) { 45 if(U_FAILURE(errorCode)) { 46 return; 47 } 48 nodes=uhash_openSize(hashStringTrieNode, equalStringTrieNodes, NULL, 49 sizeGuess, &errorCode); 50 if(U_SUCCESS(errorCode)) { 51 if(nodes==NULL) { 52 errorCode=U_MEMORY_ALLOCATION_ERROR; 53 } else { 54 uhash_setKeyDeleter(nodes, uprv_deleteUObject); 55 } 56 } 57 } 58 59 void 60 StringTrieBuilder::deleteCompactBuilder() { 61 uhash_close(nodes); 62 nodes=NULL; 63 } 64 65 void 66 StringTrieBuilder::build(UStringTrieBuildOption buildOption, int32_t elementsLength, 67 UErrorCode &errorCode) { 68 if(buildOption==USTRINGTRIE_BUILD_FAST) { 69 writeNode(0, elementsLength, 0); 70 } else /* USTRINGTRIE_BUILD_SMALL */ { 71 createCompactBuilder(2*elementsLength, errorCode); 72 Node *root=makeNode(0, elementsLength, 0, errorCode); 73 if(U_SUCCESS(errorCode)) { 74 root->markRightEdgesFirst(-1); 75 root->write(*this); 76 } 77 deleteCompactBuilder(); 78 } 79 } 80 81 // Requires start<limit, 82 // and all strings of the [start..limit[ elements must be sorted and 83 // have a common prefix of length unitIndex. 84 int32_t 85 StringTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t unitIndex) { 86 UBool hasValue=FALSE; 87 int32_t value=0; 88 int32_t type; 89 if(unitIndex==getElementStringLength(start)) { 90 // An intermediate or final value. 91 value=getElementValue(start++); 92 if(start==limit) { 93 return writeValueAndFinal(value, TRUE); // final-value node 94 } 95 hasValue=TRUE; 96 } 97 // Now all [start..limit[ strings are longer than unitIndex. 98 int32_t minUnit=getElementUnit(start, unitIndex); 99 int32_t maxUnit=getElementUnit(limit-1, unitIndex); 100 if(minUnit==maxUnit) { 101 // Linear-match node: All strings have the same character at unitIndex. 102 int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex); 103 writeNode(start, limit, lastUnitIndex); 104 // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength. 105 int32_t length=lastUnitIndex-unitIndex; 106 int32_t maxLinearMatchLength=getMaxLinearMatchLength(); 107 while(length>maxLinearMatchLength) { 108 lastUnitIndex-=maxLinearMatchLength; 109 length-=maxLinearMatchLength; 110 writeElementUnits(start, lastUnitIndex, maxLinearMatchLength); 111 write(getMinLinearMatch()+maxLinearMatchLength-1); 112 } 113 writeElementUnits(start, unitIndex, length); 114 type=getMinLinearMatch()+length-1; 115 } else { 116 // Branch node. 117 int32_t length=countElementUnits(start, limit, unitIndex); 118 // length>=2 because minUnit!=maxUnit. 119 writeBranchSubNode(start, limit, unitIndex, length); 120 if(--length<getMinLinearMatch()) { 121 type=length; 122 } else { 123 write(length); 124 type=0; 125 } 126 } 127 return writeValueAndType(hasValue, value, type); 128 } 129 130 // start<limit && all strings longer than unitIndex && 131 // length different units at unitIndex 132 int32_t 133 StringTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length) { 134 UChar middleUnits[kMaxSplitBranchLevels]; 135 int32_t lessThan[kMaxSplitBranchLevels]; 136 int32_t ltLength=0; 137 while(length>getMaxBranchLinearSubNodeLength()) { 138 // Branch on the middle unit. 139 // First, find the middle unit. 140 int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2); 141 // Encode the less-than branch first. 142 middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit 143 lessThan[ltLength]=writeBranchSubNode(start, i, unitIndex, length/2); 144 ++ltLength; 145 // Continue for the greater-or-equal branch. 146 start=i; 147 length=length-length/2; 148 } 149 // For each unit, find its elements array start and whether it has a final value. 150 int32_t starts[kMaxBranchLinearSubNodeLength]; 151 UBool isFinal[kMaxBranchLinearSubNodeLength-1]; 152 int32_t unitNumber=0; 153 do { 154 int32_t i=starts[unitNumber]=start; 155 UChar unit=getElementUnit(i++, unitIndex); 156 i=indexOfElementWithNextUnit(i, unitIndex, unit); 157 isFinal[unitNumber]= start==i-1 && unitIndex+1==getElementStringLength(start); 158 start=i; 159 } while(++unitNumber<length-1); 160 // unitNumber==length-1, and the maxUnit elements range is [start..limit[ 161 starts[unitNumber]=start; 162 163 // Write the sub-nodes in reverse order: The jump lengths are deltas from 164 // after their own positions, so if we wrote the minUnit sub-node first, 165 // then its jump delta would be larger. 166 // Instead we write the minUnit sub-node last, for a shorter delta. 167 int32_t jumpTargets[kMaxBranchLinearSubNodeLength-1]; 168 do { 169 --unitNumber; 170 if(!isFinal[unitNumber]) { 171 jumpTargets[unitNumber]=writeNode(starts[unitNumber], starts[unitNumber+1], unitIndex+1); 172 } 173 } while(unitNumber>0); 174 // The maxUnit sub-node is written as the very last one because we do 175 // not jump for it at all. 176 unitNumber=length-1; 177 writeNode(start, limit, unitIndex+1); 178 int32_t offset=write(getElementUnit(start, unitIndex)); 179 // Write the rest of this node's unit-value pairs. 180 while(--unitNumber>=0) { 181 start=starts[unitNumber]; 182 int32_t value; 183 if(isFinal[unitNumber]) { 184 // Write the final value for the one string ending with this unit. 185 value=getElementValue(start); 186 } else { 187 // Write the delta to the start position of the sub-node. 188 value=offset-jumpTargets[unitNumber]; 189 } 190 writeValueAndFinal(value, isFinal[unitNumber]); 191 offset=write(getElementUnit(start, unitIndex)); 192 } 193 // Write the split-branch nodes. 194 while(ltLength>0) { 195 --ltLength; 196 writeDeltaTo(lessThan[ltLength]); 197 offset=write(middleUnits[ltLength]); 198 } 199 return offset; 200 } 201 202 // Requires start<limit, 203 // and all strings of the [start..limit[ elements must be sorted and 204 // have a common prefix of length unitIndex. 205 StringTrieBuilder::Node * 206 StringTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode) { 207 if(U_FAILURE(errorCode)) { 208 return NULL; 209 } 210 UBool hasValue=FALSE; 211 int32_t value=0; 212 if(unitIndex==getElementStringLength(start)) { 213 // An intermediate or final value. 214 value=getElementValue(start++); 215 if(start==limit) { 216 return registerFinalValue(value, errorCode); 217 } 218 hasValue=TRUE; 219 } 220 Node *node; 221 // Now all [start..limit[ strings are longer than unitIndex. 222 int32_t minUnit=getElementUnit(start, unitIndex); 223 int32_t maxUnit=getElementUnit(limit-1, unitIndex); 224 if(minUnit==maxUnit) { 225 // Linear-match node: All strings have the same character at unitIndex. 226 int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex); 227 Node *nextNode=makeNode(start, limit, lastUnitIndex, errorCode); 228 // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength. 229 int32_t length=lastUnitIndex-unitIndex; 230 int32_t maxLinearMatchLength=getMaxLinearMatchLength(); 231 while(length>maxLinearMatchLength) { 232 lastUnitIndex-=maxLinearMatchLength; 233 length-=maxLinearMatchLength; 234 node=createLinearMatchNode(start, lastUnitIndex, maxLinearMatchLength, nextNode); 235 nextNode=registerNode(node, errorCode); 236 } 237 node=createLinearMatchNode(start, unitIndex, length, nextNode); 238 } else { 239 // Branch node. 240 int32_t length=countElementUnits(start, limit, unitIndex); 241 // length>=2 because minUnit!=maxUnit. 242 Node *subNode=makeBranchSubNode(start, limit, unitIndex, length, errorCode); 243 node=new BranchHeadNode(length, subNode); 244 } 245 if(hasValue && node!=NULL) { 246 if(matchNodesCanHaveValues()) { 247 ((ValueNode *)node)->setValue(value); 248 } else { 249 node=new IntermediateValueNode(value, registerNode(node, errorCode)); 250 } 251 } 252 return registerNode(node, errorCode); 253 } 254 255 // start<limit && all strings longer than unitIndex && 256 // length different units at unitIndex 257 StringTrieBuilder::Node * 258 StringTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, 259 int32_t length, UErrorCode &errorCode) { 260 if(U_FAILURE(errorCode)) { 261 return NULL; 262 } 263 UChar middleUnits[kMaxSplitBranchLevels]; 264 Node *lessThan[kMaxSplitBranchLevels]; 265 int32_t ltLength=0; 266 while(length>getMaxBranchLinearSubNodeLength()) { 267 // Branch on the middle unit. 268 // First, find the middle unit. 269 int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2); 270 // Create the less-than branch. 271 middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit 272 lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode); 273 ++ltLength; 274 // Continue for the greater-or-equal branch. 275 start=i; 276 length=length-length/2; 277 } 278 if(U_FAILURE(errorCode)) { 279 return NULL; 280 } 281 ListBranchNode *listNode=new ListBranchNode(); 282 if(listNode==NULL) { 283 errorCode=U_MEMORY_ALLOCATION_ERROR; 284 return NULL; 285 } 286 // For each unit, find its elements array start and whether it has a final value. 287 int32_t unitNumber=0; 288 do { 289 int32_t i=start; 290 UChar unit=getElementUnit(i++, unitIndex); 291 i=indexOfElementWithNextUnit(i, unitIndex, unit); 292 if(start==i-1 && unitIndex+1==getElementStringLength(start)) { 293 listNode->add(unit, getElementValue(start)); 294 } else { 295 listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode)); 296 } 297 start=i; 298 } while(++unitNumber<length-1); 299 // unitNumber==length-1, and the maxUnit elements range is [start..limit[ 300 UChar unit=getElementUnit(start, unitIndex); 301 if(start==limit-1 && unitIndex+1==getElementStringLength(start)) { 302 listNode->add(unit, getElementValue(start)); 303 } else { 304 listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode)); 305 } 306 Node *node=registerNode(listNode, errorCode); 307 // Create the split-branch nodes. 308 while(ltLength>0) { 309 --ltLength; 310 node=registerNode( 311 new SplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode); 312 } 313 return node; 314 } 315 316 StringTrieBuilder::Node * 317 StringTrieBuilder::registerNode(Node *newNode, UErrorCode &errorCode) { 318 if(U_FAILURE(errorCode)) { 319 delete newNode; 320 return NULL; 321 } 322 if(newNode==NULL) { 323 errorCode=U_MEMORY_ALLOCATION_ERROR; 324 return NULL; 325 } 326 const UHashElement *old=uhash_find(nodes, newNode); 327 if(old!=NULL) { 328 delete newNode; 329 return (Node *)old->key.pointer; 330 } 331 // If uhash_puti() returns a non-zero value from an equivalent, previously 332 // registered node, then uhash_find() failed to find that and we will leak newNode. 333 #if U_DEBUG 334 int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue. 335 #endif 336 uhash_puti(nodes, newNode, 1, &errorCode); 337 U_ASSERT(oldValue==0); 338 if(U_FAILURE(errorCode)) { 339 delete newNode; 340 return NULL; 341 } 342 return newNode; 343 } 344 345 StringTrieBuilder::Node * 346 StringTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) { 347 if(U_FAILURE(errorCode)) { 348 return NULL; 349 } 350 FinalValueNode key(value); 351 const UHashElement *old=uhash_find(nodes, &key); 352 if(old!=NULL) { 353 return (Node *)old->key.pointer; 354 } 355 Node *newNode=new FinalValueNode(value); 356 if(newNode==NULL) { 357 errorCode=U_MEMORY_ALLOCATION_ERROR; 358 return NULL; 359 } 360 // If uhash_puti() returns a non-zero value from an equivalent, previously 361 // registered node, then uhash_find() failed to find that and we will leak newNode. 362 #if U_DEBUG 363 int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue. 364 #endif 365 uhash_puti(nodes, newNode, 1, &errorCode); 366 U_ASSERT(oldValue==0); 367 if(U_FAILURE(errorCode)) { 368 delete newNode; 369 return NULL; 370 } 371 return newNode; 372 } 373 374 UBool 375 StringTrieBuilder::hashNode(const void *node) { 376 return ((const Node *)node)->hashCode(); 377 } 378 379 UBool 380 StringTrieBuilder::equalNodes(const void *left, const void *right) { 381 return *(const Node *)left==*(const Node *)right; 382 } 383 384 UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(StringTrieBuilder) 385 386 UBool 387 StringTrieBuilder::Node::operator==(const Node &other) const { 388 return this==&other || (typeid(*this)==typeid(other) && hash==other.hash); 389 } 390 391 int32_t 392 StringTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) { 393 if(offset==0) { 394 offset=edgeNumber; 395 } 396 return edgeNumber; 397 } 398 399 UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(StringTrieBuilder::Node) 400 401 UBool 402 StringTrieBuilder::FinalValueNode::operator==(const Node &other) const { 403 if(this==&other) { 404 return TRUE; 405 } 406 if(!Node::operator==(other)) { 407 return FALSE; 408 } 409 const FinalValueNode &o=(const FinalValueNode &)other; 410 return value==o.value; 411 } 412 413 void 414 StringTrieBuilder::FinalValueNode::write(StringTrieBuilder &builder) { 415 offset=builder.writeValueAndFinal(value, TRUE); 416 } 417 418 UBool 419 StringTrieBuilder::ValueNode::operator==(const Node &other) const { 420 if(this==&other) { 421 return TRUE; 422 } 423 if(!Node::operator==(other)) { 424 return FALSE; 425 } 426 const ValueNode &o=(const ValueNode &)other; 427 return hasValue==o.hasValue && (!hasValue || value==o.value); 428 } 429 430 UBool 431 StringTrieBuilder::IntermediateValueNode::operator==(const Node &other) const { 432 if(this==&other) { 433 return TRUE; 434 } 435 if(!ValueNode::operator==(other)) { 436 return FALSE; 437 } 438 const IntermediateValueNode &o=(const IntermediateValueNode &)other; 439 return next==o.next; 440 } 441 442 int32_t 443 StringTrieBuilder::IntermediateValueNode::markRightEdgesFirst(int32_t edgeNumber) { 444 if(offset==0) { 445 offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); 446 } 447 return edgeNumber; 448 } 449 450 void 451 StringTrieBuilder::IntermediateValueNode::write(StringTrieBuilder &builder) { 452 next->write(builder); 453 offset=builder.writeValueAndFinal(value, FALSE); 454 } 455 456 UBool 457 StringTrieBuilder::LinearMatchNode::operator==(const Node &other) const { 458 if(this==&other) { 459 return TRUE; 460 } 461 if(!ValueNode::operator==(other)) { 462 return FALSE; 463 } 464 const LinearMatchNode &o=(const LinearMatchNode &)other; 465 return length==o.length && next==o.next; 466 } 467 468 int32_t 469 StringTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) { 470 if(offset==0) { 471 offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); 472 } 473 return edgeNumber; 474 } 475 476 UBool 477 StringTrieBuilder::ListBranchNode::operator==(const Node &other) const { 478 if(this==&other) { 479 return TRUE; 480 } 481 if(!Node::operator==(other)) { 482 return FALSE; 483 } 484 const ListBranchNode &o=(const ListBranchNode &)other; 485 for(int32_t i=0; i<length; ++i) { 486 if(units[i]!=o.units[i] || values[i]!=o.values[i] || equal[i]!=o.equal[i]) { 487 return FALSE; 488 } 489 } 490 return TRUE; 491 } 492 493 int32_t 494 StringTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber) { 495 if(offset==0) { 496 firstEdgeNumber=edgeNumber; 497 int32_t step=0; 498 int32_t i=length; 499 do { 500 Node *edge=equal[--i]; 501 if(edge!=NULL) { 502 edgeNumber=edge->markRightEdgesFirst(edgeNumber-step); 503 } 504 // For all but the rightmost edge, decrement the edge number. 505 step=1; 506 } while(i>0); 507 offset=edgeNumber; 508 } 509 return edgeNumber; 510 } 511 512 void 513 StringTrieBuilder::ListBranchNode::write(StringTrieBuilder &builder) { 514 // Write the sub-nodes in reverse order: The jump lengths are deltas from 515 // after their own positions, so if we wrote the minUnit sub-node first, 516 // then its jump delta would be larger. 517 // Instead we write the minUnit sub-node last, for a shorter delta. 518 int32_t unitNumber=length-1; 519 Node *rightEdge=equal[unitNumber]; 520 int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset(); 521 do { 522 --unitNumber; 523 if(equal[unitNumber]!=NULL) { 524 equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder); 525 } 526 } while(unitNumber>0); 527 // The maxUnit sub-node is written as the very last one because we do 528 // not jump for it at all. 529 unitNumber=length-1; 530 if(rightEdge==NULL) { 531 builder.writeValueAndFinal(values[unitNumber], TRUE); 532 } else { 533 rightEdge->write(builder); 534 } 535 offset=builder.write(units[unitNumber]); 536 // Write the rest of this node's unit-value pairs. 537 while(--unitNumber>=0) { 538 int32_t value; 539 UBool isFinal; 540 if(equal[unitNumber]==NULL) { 541 // Write the final value for the one string ending with this unit. 542 value=values[unitNumber]; 543 isFinal=TRUE; 544 } else { 545 // Write the delta to the start position of the sub-node. 546 U_ASSERT(equal[unitNumber]->getOffset()>0); 547 value=offset-equal[unitNumber]->getOffset(); 548 isFinal=FALSE; 549 } 550 builder.writeValueAndFinal(value, isFinal); 551 offset=builder.write(units[unitNumber]); 552 } 553 } 554 555 UBool 556 StringTrieBuilder::SplitBranchNode::operator==(const Node &other) const { 557 if(this==&other) { 558 return TRUE; 559 } 560 if(!Node::operator==(other)) { 561 return FALSE; 562 } 563 const SplitBranchNode &o=(const SplitBranchNode &)other; 564 return unit==o.unit && lessThan==o.lessThan && greaterOrEqual==o.greaterOrEqual; 565 } 566 567 int32_t 568 StringTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) { 569 if(offset==0) { 570 firstEdgeNumber=edgeNumber; 571 edgeNumber=greaterOrEqual->markRightEdgesFirst(edgeNumber); 572 offset=edgeNumber=lessThan->markRightEdgesFirst(edgeNumber-1); 573 } 574 return edgeNumber; 575 } 576 577 void 578 StringTrieBuilder::SplitBranchNode::write(StringTrieBuilder &builder) { 579 // Encode the less-than branch first. 580 lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder); 581 // Encode the greater-or-equal branch last because we do not jump for it at all. 582 greaterOrEqual->write(builder); 583 // Write this node. 584 U_ASSERT(lessThan->getOffset()>0); 585 builder.writeDeltaTo(lessThan->getOffset()); // less-than 586 offset=builder.write(unit); 587 } 588 589 UBool 590 StringTrieBuilder::BranchHeadNode::operator==(const Node &other) const { 591 if(this==&other) { 592 return TRUE; 593 } 594 if(!ValueNode::operator==(other)) { 595 return FALSE; 596 } 597 const BranchHeadNode &o=(const BranchHeadNode &)other; 598 return length==o.length && next==o.next; 599 } 600 601 int32_t 602 StringTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) { 603 if(offset==0) { 604 offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); 605 } 606 return edgeNumber; 607 } 608 609 void 610 StringTrieBuilder::BranchHeadNode::write(StringTrieBuilder &builder) { 611 next->write(builder); 612 if(length<=builder.getMinLinearMatch()) { 613 offset=builder.writeValueAndType(hasValue, value, length-1); 614 } else { 615 builder.write(length-1); 616 offset=builder.writeValueAndType(hasValue, value, 0); 617 } 618 } 619 620 U_NAMESPACE_END 621