1 /* 2 ******************************************************************************* 3 * Copyright (C) 2010-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: stringtriebuilder.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2010dec24 12 * created by: Markus W. Scherer 13 */ 14 15 #include <typeinfo> // for 'typeid' to work 16 #include "unicode/utypes.h" 17 #include "unicode/stringtriebuilder.h" 18 #include "uassert.h" 19 #include "uhash.h" 20 21 U_CDECL_BEGIN 22 23 static int32_t U_CALLCONV 24 hashStringTrieNode(const UHashTok key) { 25 return U_NAMESPACE_QUALIFIER StringTrieBuilder::hashNode(key.pointer); 26 } 27 28 static UBool U_CALLCONV 29 equalStringTrieNodes(const UHashTok key1, const UHashTok key2) { 30 return U_NAMESPACE_QUALIFIER StringTrieBuilder::equalNodes(key1.pointer, key2.pointer); 31 } 32 33 U_CDECL_END 34 35 U_NAMESPACE_BEGIN 36 37 StringTrieBuilder::StringTrieBuilder() : nodes(NULL) {} 38 39 StringTrieBuilder::~StringTrieBuilder() { 40 deleteCompactBuilder(); 41 } 42 43 void 44 StringTrieBuilder::createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode) { 45 if(U_FAILURE(errorCode)) { 46 return; 47 } 48 nodes=uhash_openSize(hashStringTrieNode, equalStringTrieNodes, NULL, 49 sizeGuess, &errorCode); 50 if(U_SUCCESS(errorCode) && nodes==NULL) { 51 errorCode=U_MEMORY_ALLOCATION_ERROR; 52 } 53 if(U_SUCCESS(errorCode)) { 54 uhash_setKeyDeleter(nodes, uhash_deleteUObject); 55 } 56 } 57 58 void 59 StringTrieBuilder::deleteCompactBuilder() { 60 uhash_close(nodes); 61 nodes=NULL; 62 } 63 64 void 65 StringTrieBuilder::build(UStringTrieBuildOption buildOption, int32_t elementsLength, 66 UErrorCode &errorCode) { 67 if(buildOption==USTRINGTRIE_BUILD_FAST) { 68 writeNode(0, elementsLength, 0); 69 } else /* USTRINGTRIE_BUILD_SMALL */ { 70 createCompactBuilder(2*elementsLength, errorCode); 71 Node *root=makeNode(0, elementsLength, 0, errorCode); 72 if(U_SUCCESS(errorCode)) { 73 root->markRightEdgesFirst(-1); 74 root->write(*this); 75 } 76 deleteCompactBuilder(); 77 } 78 } 79 80 // Requires start<limit, 81 // and all strings of the [start..limit[ elements must be sorted and 82 // have a common prefix of length unitIndex. 83 int32_t 84 StringTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t unitIndex) { 85 UBool hasValue=FALSE; 86 int32_t value=0; 87 int32_t type; 88 if(unitIndex==getElementStringLength(start)) { 89 // An intermediate or final value. 90 value=getElementValue(start++); 91 if(start==limit) { 92 return writeValueAndFinal(value, TRUE); // final-value node 93 } 94 hasValue=TRUE; 95 } 96 // Now all [start..limit[ strings are longer than unitIndex. 97 int32_t minUnit=getElementUnit(start, unitIndex); 98 int32_t maxUnit=getElementUnit(limit-1, unitIndex); 99 if(minUnit==maxUnit) { 100 // Linear-match node: All strings have the same character at unitIndex. 101 int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex); 102 writeNode(start, limit, lastUnitIndex); 103 // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength. 104 int32_t length=lastUnitIndex-unitIndex; 105 int32_t maxLinearMatchLength=getMaxLinearMatchLength(); 106 while(length>maxLinearMatchLength) { 107 lastUnitIndex-=maxLinearMatchLength; 108 length-=maxLinearMatchLength; 109 writeElementUnits(start, lastUnitIndex, maxLinearMatchLength); 110 write(getMinLinearMatch()+maxLinearMatchLength-1); 111 } 112 writeElementUnits(start, unitIndex, length); 113 type=getMinLinearMatch()+length-1; 114 } else { 115 // Branch node. 116 int32_t length=countElementUnits(start, limit, unitIndex); 117 // length>=2 because minUnit!=maxUnit. 118 writeBranchSubNode(start, limit, unitIndex, length); 119 if(--length<getMinLinearMatch()) { 120 type=length; 121 } else { 122 write(length); 123 type=0; 124 } 125 } 126 return writeValueAndType(hasValue, value, type); 127 } 128 129 // start<limit && all strings longer than unitIndex && 130 // length different units at unitIndex 131 int32_t 132 StringTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length) { 133 UChar middleUnits[kMaxSplitBranchLevels]; 134 int32_t lessThan[kMaxSplitBranchLevels]; 135 int32_t ltLength=0; 136 while(length>getMaxBranchLinearSubNodeLength()) { 137 // Branch on the middle unit. 138 // First, find the middle unit. 139 int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2); 140 // Encode the less-than branch first. 141 middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit 142 lessThan[ltLength]=writeBranchSubNode(start, i, unitIndex, length/2); 143 ++ltLength; 144 // Continue for the greater-or-equal branch. 145 start=i; 146 length=length-length/2; 147 } 148 // For each unit, find its elements array start and whether it has a final value. 149 int32_t starts[kMaxBranchLinearSubNodeLength]; 150 UBool isFinal[kMaxBranchLinearSubNodeLength-1]; 151 int32_t unitNumber=0; 152 do { 153 int32_t i=starts[unitNumber]=start; 154 UChar unit=getElementUnit(i++, unitIndex); 155 i=indexOfElementWithNextUnit(i, unitIndex, unit); 156 isFinal[unitNumber]= start==i-1 && unitIndex+1==getElementStringLength(start); 157 start=i; 158 } while(++unitNumber<length-1); 159 // unitNumber==length-1, and the maxUnit elements range is [start..limit[ 160 starts[unitNumber]=start; 161 162 // Write the sub-nodes in reverse order: The jump lengths are deltas from 163 // after their own positions, so if we wrote the minUnit sub-node first, 164 // then its jump delta would be larger. 165 // Instead we write the minUnit sub-node last, for a shorter delta. 166 int32_t jumpTargets[kMaxBranchLinearSubNodeLength-1]; 167 do { 168 --unitNumber; 169 if(!isFinal[unitNumber]) { 170 jumpTargets[unitNumber]=writeNode(starts[unitNumber], starts[unitNumber+1], unitIndex+1); 171 } 172 } while(unitNumber>0); 173 // The maxUnit sub-node is written as the very last one because we do 174 // not jump for it at all. 175 unitNumber=length-1; 176 writeNode(start, limit, unitIndex+1); 177 int32_t offset=write(getElementUnit(start, unitIndex)); 178 // Write the rest of this node's unit-value pairs. 179 while(--unitNumber>=0) { 180 start=starts[unitNumber]; 181 int32_t value; 182 if(isFinal[unitNumber]) { 183 // Write the final value for the one string ending with this unit. 184 value=getElementValue(start); 185 } else { 186 // Write the delta to the start position of the sub-node. 187 value=offset-jumpTargets[unitNumber]; 188 } 189 writeValueAndFinal(value, isFinal[unitNumber]); 190 offset=write(getElementUnit(start, unitIndex)); 191 } 192 // Write the split-branch nodes. 193 while(ltLength>0) { 194 --ltLength; 195 writeDeltaTo(lessThan[ltLength]); 196 offset=write(middleUnits[ltLength]); 197 } 198 return offset; 199 } 200 201 // Requires start<limit, 202 // and all strings of the [start..limit[ elements must be sorted and 203 // have a common prefix of length unitIndex. 204 StringTrieBuilder::Node * 205 StringTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode) { 206 if(U_FAILURE(errorCode)) { 207 return NULL; 208 } 209 UBool hasValue=FALSE; 210 int32_t value=0; 211 if(unitIndex==getElementStringLength(start)) { 212 // An intermediate or final value. 213 value=getElementValue(start++); 214 if(start==limit) { 215 return registerFinalValue(value, errorCode); 216 } 217 hasValue=TRUE; 218 } 219 Node *node; 220 // Now all [start..limit[ strings are longer than unitIndex. 221 int32_t minUnit=getElementUnit(start, unitIndex); 222 int32_t maxUnit=getElementUnit(limit-1, unitIndex); 223 if(minUnit==maxUnit) { 224 // Linear-match node: All strings have the same character at unitIndex. 225 int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex); 226 Node *nextNode=makeNode(start, limit, lastUnitIndex, errorCode); 227 // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength. 228 int32_t length=lastUnitIndex-unitIndex; 229 int32_t maxLinearMatchLength=getMaxLinearMatchLength(); 230 while(length>maxLinearMatchLength) { 231 lastUnitIndex-=maxLinearMatchLength; 232 length-=maxLinearMatchLength; 233 node=createLinearMatchNode(start, lastUnitIndex, maxLinearMatchLength, nextNode); 234 nextNode=registerNode(node, errorCode); 235 } 236 node=createLinearMatchNode(start, unitIndex, length, nextNode); 237 } else { 238 // Branch node. 239 int32_t length=countElementUnits(start, limit, unitIndex); 240 // length>=2 because minUnit!=maxUnit. 241 Node *subNode=makeBranchSubNode(start, limit, unitIndex, length, errorCode); 242 node=new BranchHeadNode(length, subNode); 243 } 244 if(hasValue && node!=NULL) { 245 if(matchNodesCanHaveValues()) { 246 ((ValueNode *)node)->setValue(value); 247 } else { 248 node=new IntermediateValueNode(value, registerNode(node, errorCode)); 249 } 250 } 251 return registerNode(node, errorCode); 252 } 253 254 // start<limit && all strings longer than unitIndex && 255 // length different units at unitIndex 256 StringTrieBuilder::Node * 257 StringTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, 258 int32_t length, UErrorCode &errorCode) { 259 if(U_FAILURE(errorCode)) { 260 return NULL; 261 } 262 UChar middleUnits[kMaxSplitBranchLevels]; 263 Node *lessThan[kMaxSplitBranchLevels]; 264 int32_t ltLength=0; 265 while(length>getMaxBranchLinearSubNodeLength()) { 266 // Branch on the middle unit. 267 // First, find the middle unit. 268 int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2); 269 // Create the less-than branch. 270 middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit 271 lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode); 272 ++ltLength; 273 // Continue for the greater-or-equal branch. 274 start=i; 275 length=length-length/2; 276 } 277 if(U_FAILURE(errorCode)) { 278 return NULL; 279 } 280 ListBranchNode *listNode=new ListBranchNode(); 281 if(listNode==NULL) { 282 errorCode=U_MEMORY_ALLOCATION_ERROR; 283 return NULL; 284 } 285 // For each unit, find its elements array start and whether it has a final value. 286 int32_t unitNumber=0; 287 do { 288 int32_t i=start; 289 UChar unit=getElementUnit(i++, unitIndex); 290 i=indexOfElementWithNextUnit(i, unitIndex, unit); 291 if(start==i-1 && unitIndex+1==getElementStringLength(start)) { 292 listNode->add(unit, getElementValue(start)); 293 } else { 294 listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode)); 295 } 296 start=i; 297 } while(++unitNumber<length-1); 298 // unitNumber==length-1, and the maxUnit elements range is [start..limit[ 299 UChar unit=getElementUnit(start, unitIndex); 300 if(start==limit-1 && unitIndex+1==getElementStringLength(start)) { 301 listNode->add(unit, getElementValue(start)); 302 } else { 303 listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode)); 304 } 305 Node *node=registerNode(listNode, errorCode); 306 // Create the split-branch nodes. 307 while(ltLength>0) { 308 --ltLength; 309 node=registerNode( 310 new SplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode); 311 } 312 return node; 313 } 314 315 StringTrieBuilder::Node * 316 StringTrieBuilder::registerNode(Node *newNode, UErrorCode &errorCode) { 317 if(U_FAILURE(errorCode)) { 318 delete newNode; 319 return NULL; 320 } 321 if(newNode==NULL) { 322 errorCode=U_MEMORY_ALLOCATION_ERROR; 323 return NULL; 324 } 325 const UHashElement *old=uhash_find(nodes, newNode); 326 if(old!=NULL) { 327 delete newNode; 328 return (Node *)old->key.pointer; 329 } 330 // If uhash_puti() returns a non-zero value from an equivalent, previously 331 // registered node, then uhash_find() failed to find that and we will leak newNode. 332 #if !U_RELEASE 333 int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue. 334 #endif 335 uhash_puti(nodes, newNode, 1, &errorCode); 336 U_ASSERT(oldValue==0); 337 if(U_FAILURE(errorCode)) { 338 delete newNode; 339 return NULL; 340 } 341 return newNode; 342 } 343 344 StringTrieBuilder::Node * 345 StringTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) { 346 if(U_FAILURE(errorCode)) { 347 return NULL; 348 } 349 FinalValueNode key(value); 350 const UHashElement *old=uhash_find(nodes, &key); 351 if(old!=NULL) { 352 return (Node *)old->key.pointer; 353 } 354 Node *newNode=new FinalValueNode(value); 355 if(newNode==NULL) { 356 errorCode=U_MEMORY_ALLOCATION_ERROR; 357 return NULL; 358 } 359 // If uhash_puti() returns a non-zero value from an equivalent, previously 360 // registered node, then uhash_find() failed to find that and we will leak newNode. 361 #if !U_RELEASE 362 int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue. 363 #endif 364 uhash_puti(nodes, newNode, 1, &errorCode); 365 U_ASSERT(oldValue==0); 366 if(U_FAILURE(errorCode)) { 367 delete newNode; 368 return NULL; 369 } 370 return newNode; 371 } 372 373 UBool 374 StringTrieBuilder::hashNode(const void *node) { 375 return ((const Node *)node)->hashCode(); 376 } 377 378 UBool 379 StringTrieBuilder::equalNodes(const void *left, const void *right) { 380 return *(const Node *)left==*(const Node *)right; 381 } 382 383 UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(StringTrieBuilder) 384 385 UBool 386 StringTrieBuilder::Node::operator==(const Node &other) const { 387 return this==&other || (typeid(*this)==typeid(other) && hash==other.hash); 388 } 389 390 int32_t 391 StringTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) { 392 if(offset==0) { 393 offset=edgeNumber; 394 } 395 return edgeNumber; 396 } 397 398 UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(StringTrieBuilder::Node) 399 400 UBool 401 StringTrieBuilder::FinalValueNode::operator==(const Node &other) const { 402 if(this==&other) { 403 return TRUE; 404 } 405 if(!Node::operator==(other)) { 406 return FALSE; 407 } 408 const FinalValueNode &o=(const FinalValueNode &)other; 409 return value==o.value; 410 } 411 412 void 413 StringTrieBuilder::FinalValueNode::write(StringTrieBuilder &builder) { 414 offset=builder.writeValueAndFinal(value, TRUE); 415 } 416 417 UBool 418 StringTrieBuilder::ValueNode::operator==(const Node &other) const { 419 if(this==&other) { 420 return TRUE; 421 } 422 if(!Node::operator==(other)) { 423 return FALSE; 424 } 425 const ValueNode &o=(const ValueNode &)other; 426 return hasValue==o.hasValue && (!hasValue || value==o.value); 427 } 428 429 UBool 430 StringTrieBuilder::IntermediateValueNode::operator==(const Node &other) const { 431 if(this==&other) { 432 return TRUE; 433 } 434 if(!ValueNode::operator==(other)) { 435 return FALSE; 436 } 437 const IntermediateValueNode &o=(const IntermediateValueNode &)other; 438 return next==o.next; 439 } 440 441 int32_t 442 StringTrieBuilder::IntermediateValueNode::markRightEdgesFirst(int32_t edgeNumber) { 443 if(offset==0) { 444 offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); 445 } 446 return edgeNumber; 447 } 448 449 void 450 StringTrieBuilder::IntermediateValueNode::write(StringTrieBuilder &builder) { 451 next->write(builder); 452 offset=builder.writeValueAndFinal(value, FALSE); 453 } 454 455 UBool 456 StringTrieBuilder::LinearMatchNode::operator==(const Node &other) const { 457 if(this==&other) { 458 return TRUE; 459 } 460 if(!ValueNode::operator==(other)) { 461 return FALSE; 462 } 463 const LinearMatchNode &o=(const LinearMatchNode &)other; 464 return length==o.length && next==o.next; 465 } 466 467 int32_t 468 StringTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) { 469 if(offset==0) { 470 offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); 471 } 472 return edgeNumber; 473 } 474 475 UBool 476 StringTrieBuilder::ListBranchNode::operator==(const Node &other) const { 477 if(this==&other) { 478 return TRUE; 479 } 480 if(!Node::operator==(other)) { 481 return FALSE; 482 } 483 const ListBranchNode &o=(const ListBranchNode &)other; 484 for(int32_t i=0; i<length; ++i) { 485 if(units[i]!=o.units[i] || values[i]!=o.values[i] || equal[i]!=o.equal[i]) { 486 return FALSE; 487 } 488 } 489 return TRUE; 490 } 491 492 int32_t 493 StringTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber) { 494 if(offset==0) { 495 firstEdgeNumber=edgeNumber; 496 int32_t step=0; 497 int32_t i=length; 498 do { 499 Node *edge=equal[--i]; 500 if(edge!=NULL) { 501 edgeNumber=edge->markRightEdgesFirst(edgeNumber-step); 502 } 503 // For all but the rightmost edge, decrement the edge number. 504 step=1; 505 } while(i>0); 506 offset=edgeNumber; 507 } 508 return edgeNumber; 509 } 510 511 void 512 StringTrieBuilder::ListBranchNode::write(StringTrieBuilder &builder) { 513 // Write the sub-nodes in reverse order: The jump lengths are deltas from 514 // after their own positions, so if we wrote the minUnit sub-node first, 515 // then its jump delta would be larger. 516 // Instead we write the minUnit sub-node last, for a shorter delta. 517 int32_t unitNumber=length-1; 518 Node *rightEdge=equal[unitNumber]; 519 int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset(); 520 do { 521 --unitNumber; 522 if(equal[unitNumber]!=NULL) { 523 equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder); 524 } 525 } while(unitNumber>0); 526 // The maxUnit sub-node is written as the very last one because we do 527 // not jump for it at all. 528 unitNumber=length-1; 529 if(rightEdge==NULL) { 530 builder.writeValueAndFinal(values[unitNumber], TRUE); 531 } else { 532 rightEdge->write(builder); 533 } 534 offset=builder.write(units[unitNumber]); 535 // Write the rest of this node's unit-value pairs. 536 while(--unitNumber>=0) { 537 int32_t value; 538 UBool isFinal; 539 if(equal[unitNumber]==NULL) { 540 // Write the final value for the one string ending with this unit. 541 value=values[unitNumber]; 542 isFinal=TRUE; 543 } else { 544 // Write the delta to the start position of the sub-node. 545 U_ASSERT(equal[unitNumber]->getOffset()>0); 546 value=offset-equal[unitNumber]->getOffset(); 547 isFinal=FALSE; 548 } 549 builder.writeValueAndFinal(value, isFinal); 550 offset=builder.write(units[unitNumber]); 551 } 552 } 553 554 UBool 555 StringTrieBuilder::SplitBranchNode::operator==(const Node &other) const { 556 if(this==&other) { 557 return TRUE; 558 } 559 if(!Node::operator==(other)) { 560 return FALSE; 561 } 562 const SplitBranchNode &o=(const SplitBranchNode &)other; 563 return unit==o.unit && lessThan==o.lessThan && greaterOrEqual==o.greaterOrEqual; 564 } 565 566 int32_t 567 StringTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) { 568 if(offset==0) { 569 firstEdgeNumber=edgeNumber; 570 edgeNumber=greaterOrEqual->markRightEdgesFirst(edgeNumber); 571 offset=edgeNumber=lessThan->markRightEdgesFirst(edgeNumber-1); 572 } 573 return edgeNumber; 574 } 575 576 void 577 StringTrieBuilder::SplitBranchNode::write(StringTrieBuilder &builder) { 578 // Encode the less-than branch first. 579 lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder); 580 // Encode the greater-or-equal branch last because we do not jump for it at all. 581 greaterOrEqual->write(builder); 582 // Write this node. 583 U_ASSERT(lessThan->getOffset()>0); 584 builder.writeDeltaTo(lessThan->getOffset()); // less-than 585 offset=builder.write(unit); 586 } 587 588 UBool 589 StringTrieBuilder::BranchHeadNode::operator==(const Node &other) const { 590 if(this==&other) { 591 return TRUE; 592 } 593 if(!ValueNode::operator==(other)) { 594 return FALSE; 595 } 596 const BranchHeadNode &o=(const BranchHeadNode &)other; 597 return length==o.length && next==o.next; 598 } 599 600 int32_t 601 StringTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) { 602 if(offset==0) { 603 offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); 604 } 605 return edgeNumber; 606 } 607 608 void 609 StringTrieBuilder::BranchHeadNode::write(StringTrieBuilder &builder) { 610 next->write(builder); 611 if(length<=builder.getMinLinearMatch()) { 612 offset=builder.writeValueAndType(hasValue, value, length-1); 613 } else { 614 builder.write(length-1); 615 offset=builder.writeValueAndType(hasValue, value, 0); 616 } 617 } 618 619 U_NAMESPACE_END 620