1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 // 4 // rbbisetb.cpp 5 // 6 /* 7 *************************************************************************** 8 * Copyright (C) 2002-2008 International Business Machines Corporation * 9 * and others. All rights reserved. * 10 *************************************************************************** 11 */ 12 // 13 // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules 14 // (part of the rule building process.) 15 // 16 // Starting with the rules parse tree from the scanner, 17 // 18 // - Enumerate the set of UnicodeSets that are referenced 19 // by the RBBI rules. 20 // - compute a set of non-overlapping character ranges 21 // with all characters within a range belonging to the same 22 // set of input uniocde sets. 23 // - Derive a set of non-overlapping UnicodeSet (like things) 24 // that will correspond to columns in the state table for 25 // the RBBI execution engine. All characters within one 26 // of these sets belong to the same set of the original 27 // UnicodeSets from the user's rules. 28 // - construct the trie table that maps input characters 29 // to the index of the matching non-overlapping set of set from 30 // the previous step. 31 // 32 33 #include "unicode/utypes.h" 34 35 #if !UCONFIG_NO_BREAK_ITERATION 36 37 #include "unicode/uniset.h" 38 #include "utrie2.h" 39 #include "uvector.h" 40 #include "uassert.h" 41 #include "cmemory.h" 42 #include "cstring.h" 43 44 #include "rbbisetb.h" 45 #include "rbbinode.h" 46 47 U_NAMESPACE_BEGIN 48 49 //------------------------------------------------------------------------ 50 // 51 // Constructor 52 // 53 //------------------------------------------------------------------------ 54 RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb) 55 { 56 fRB = rb; 57 fStatus = rb->fStatus; 58 fRangeList = 0; 59 fTrie = 0; 60 fTrieSize = 0; 61 fGroupCount = 0; 62 fSawBOF = FALSE; 63 } 64 65 66 //------------------------------------------------------------------------ 67 // 68 // Destructor 69 // 70 //------------------------------------------------------------------------ 71 RBBISetBuilder::~RBBISetBuilder() 72 { 73 RangeDescriptor *nextRangeDesc; 74 75 // Walk through & delete the linked list of RangeDescriptors 76 for (nextRangeDesc = fRangeList; nextRangeDesc!=NULL;) { 77 RangeDescriptor *r = nextRangeDesc; 78 nextRangeDesc = r->fNext; 79 delete r; 80 } 81 82 utrie2_close(fTrie); 83 } 84 85 86 87 88 //------------------------------------------------------------------------ 89 // 90 // build Build the list of non-overlapping character ranges 91 // from the Unicode Sets. 92 // 93 //------------------------------------------------------------------------ 94 void RBBISetBuilder::buildRanges() { 95 RBBINode *usetNode; 96 RangeDescriptor *rlRange; 97 98 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();} 99 100 // 101 // Initialize the process by creating a single range encompassing all characters 102 // that is in no sets. 103 // 104 fRangeList = new RangeDescriptor(*fStatus); // will check for status here 105 if (fRangeList == NULL) { 106 *fStatus = U_MEMORY_ALLOCATION_ERROR; 107 return; 108 } 109 fRangeList->fStartChar = 0; 110 fRangeList->fEndChar = 0x10ffff; 111 112 if (U_FAILURE(*fStatus)) { 113 return; 114 } 115 116 // 117 // Find the set of non-overlapping ranges of characters 118 // 119 int ni; 120 for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules 121 usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); 122 if (usetNode==NULL) { 123 break; 124 } 125 126 UnicodeSet *inputSet = usetNode->fInputSet; 127 int32_t inputSetRangeCount = inputSet->getRangeCount(); 128 int inputSetRangeIndex = 0; 129 rlRange = fRangeList; 130 131 for (;;) { 132 if (inputSetRangeIndex >= inputSetRangeCount) { 133 break; 134 } 135 UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex); 136 UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex); 137 138 // skip over ranges from the range list that are completely 139 // below the current range from the input unicode set. 140 while (rlRange->fEndChar < inputSetRangeBegin) { 141 rlRange = rlRange->fNext; 142 } 143 144 // If the start of the range from the range list is before with 145 // the start of the range from the unicode set, split the range list range 146 // in two, with one part being before (wholly outside of) the unicode set 147 // and the other containing the rest. 148 // Then continue the loop; the post-split current range will then be skipped 149 // over 150 if (rlRange->fStartChar < inputSetRangeBegin) { 151 rlRange->split(inputSetRangeBegin, *fStatus); 152 if (U_FAILURE(*fStatus)) { 153 return; 154 } 155 continue; 156 } 157 158 // Same thing at the end of the ranges... 159 // If the end of the range from the range list doesn't coincide with 160 // the end of the range from the unicode set, split the range list 161 // range in two. The first part of the split range will be 162 // wholly inside the Unicode set. 163 if (rlRange->fEndChar > inputSetRangeEnd) { 164 rlRange->split(inputSetRangeEnd+1, *fStatus); 165 if (U_FAILURE(*fStatus)) { 166 return; 167 } 168 } 169 170 // The current rlRange is now entirely within the UnicodeSet range. 171 // Add this unicode set to the list of sets for this rlRange 172 if (rlRange->fIncludesSets->indexOf(usetNode) == -1) { 173 rlRange->fIncludesSets->addElement(usetNode, *fStatus); 174 if (U_FAILURE(*fStatus)) { 175 return; 176 } 177 } 178 179 // Advance over ranges that we are finished with. 180 if (inputSetRangeEnd == rlRange->fEndChar) { 181 inputSetRangeIndex++; 182 } 183 rlRange = rlRange->fNext; 184 } 185 } 186 187 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();} 188 189 // 190 // Group the above ranges, with each group consisting of one or more 191 // ranges that are in exactly the same set of original UnicodeSets. 192 // The groups are numbered, and these group numbers are the set of 193 // input symbols recognized by the run-time state machine. 194 // 195 // Numbering: # 0 (state table column 0) is unused. 196 // # 1 is reserved - table column 1 is for end-of-input 197 // # 2 is reserved - table column 2 is for beginning-in-input 198 // # 3 is the first range list. 199 // 200 RangeDescriptor *rlSearchRange; 201 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { 202 for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) { 203 if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) { 204 rlRange->fNum = rlSearchRange->fNum; 205 break; 206 } 207 } 208 if (rlRange->fNum == 0) { 209 fGroupCount ++; 210 rlRange->fNum = fGroupCount+2; 211 rlRange->setDictionaryFlag(); 212 addValToSets(rlRange->fIncludesSets, fGroupCount+2); 213 } 214 } 215 216 // Handle input sets that contain the special string {eof}. 217 // Column 1 of the state table is reserved for EOF on input. 218 // Column 2 is reserved for before-the-start-input. 219 // (This column can be optimized away later if there are no rule 220 // references to {bof}.) 221 // Add this column value (1 or 2) to the equivalent expression 222 // subtree for each UnicodeSet that contains the string {eof} 223 // Because {bof} and {eof} are not a characters in the normal sense, 224 // they doesn't affect the computation of ranges or TRIE. 225 static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0}; 226 static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0}; 227 228 UnicodeString eofString(eofUString); 229 UnicodeString bofString(bofUString); 230 for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules 231 usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); 232 if (usetNode==NULL) { 233 break; 234 } 235 UnicodeSet *inputSet = usetNode->fInputSet; 236 if (inputSet->contains(eofString)) { 237 addValToSet(usetNode, 1); 238 } 239 if (inputSet->contains(bofString)) { 240 addValToSet(usetNode, 2); 241 fSawBOF = TRUE; 242 } 243 } 244 245 246 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();} 247 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();} 248 } 249 250 251 // 252 // Build the Trie table for mapping UChar32 values to the corresponding 253 // range group number. 254 // 255 void RBBISetBuilder::buildTrie() { 256 RangeDescriptor *rlRange; 257 258 fTrie = utrie2_open(0, // Initial value for all code points. 259 0, // Error value for out-of-range input. 260 fStatus); 261 262 for (rlRange = fRangeList; rlRange!=0 && U_SUCCESS(*fStatus); rlRange=rlRange->fNext) { 263 utrie2_setRange32(fTrie, 264 rlRange->fStartChar, // Range start 265 rlRange->fEndChar, // Range end (inclusive) 266 rlRange->fNum, // value for range 267 TRUE, // Overwrite previously written values 268 fStatus); 269 } 270 } 271 272 273 void RBBISetBuilder::mergeCategories(IntPair categories) { 274 U_ASSERT(categories.first >= 1); 275 U_ASSERT(categories.second > categories.first); 276 for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) { 277 int32_t rangeNum = rd->fNum & ~DICT_BIT; 278 int32_t rangeDict = rd->fNum & DICT_BIT; 279 if (rangeNum == categories.second) { 280 rd->fNum = categories.first | rangeDict; 281 } else if (rangeNum > categories.second) { 282 rd->fNum--; 283 } 284 } 285 --fGroupCount; 286 } 287 288 289 //----------------------------------------------------------------------------------- 290 // 291 // getTrieSize() Return the size that will be required to serialize the Trie. 292 // 293 //----------------------------------------------------------------------------------- 294 int32_t RBBISetBuilder::getTrieSize() { 295 if (U_FAILURE(*fStatus)) { 296 return 0; 297 } 298 utrie2_freeze(fTrie, UTRIE2_16_VALUE_BITS, fStatus); 299 fTrieSize = utrie2_serialize(fTrie, 300 NULL, // Buffer 301 0, // Capacity 302 fStatus); 303 if (*fStatus == U_BUFFER_OVERFLOW_ERROR) { 304 *fStatus = U_ZERO_ERROR; 305 } 306 // RBBIDebugPrintf("Trie table size is %d\n", trieSize); 307 return fTrieSize; 308 } 309 310 311 //----------------------------------------------------------------------------------- 312 // 313 // serializeTrie() Put the serialized trie at the specified address. 314 // Trust the caller to have given us enough memory. 315 // getTrieSize() MUST be called first. 316 // 317 //----------------------------------------------------------------------------------- 318 void RBBISetBuilder::serializeTrie(uint8_t *where) { 319 utrie2_serialize(fTrie, 320 where, // Buffer 321 fTrieSize, // Capacity 322 fStatus); 323 } 324 325 //------------------------------------------------------------------------ 326 // 327 // addValToSets Add a runtime-mapped input value to each uset from a 328 // list of uset nodes. (val corresponds to a state table column.) 329 // For each of the original Unicode sets - which correspond 330 // directly to uset nodes - a logically equivalent expression 331 // is constructed in terms of the remapped runtime input 332 // symbol set. This function adds one runtime input symbol to 333 // a list of sets. 334 // 335 // The "logically equivalent expression" is the tree for an 336 // or-ing together of all of the symbols that go into the set. 337 // 338 //------------------------------------------------------------------------ 339 void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) { 340 int32_t ix; 341 342 for (ix=0; ix<sets->size(); ix++) { 343 RBBINode *usetNode = (RBBINode *)sets->elementAt(ix); 344 addValToSet(usetNode, val); 345 } 346 } 347 348 void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) { 349 RBBINode *leafNode = new RBBINode(RBBINode::leafChar); 350 if (leafNode == NULL) { 351 *fStatus = U_MEMORY_ALLOCATION_ERROR; 352 return; 353 } 354 leafNode->fVal = (unsigned short)val; 355 if (usetNode->fLeftChild == NULL) { 356 usetNode->fLeftChild = leafNode; 357 leafNode->fParent = usetNode; 358 } else { 359 // There are already input symbols present for this set. 360 // Set up an OR node, with the previous stuff as the left child 361 // and the new value as the right child. 362 RBBINode *orNode = new RBBINode(RBBINode::opOr); 363 if (orNode == NULL) { 364 *fStatus = U_MEMORY_ALLOCATION_ERROR; 365 return; 366 } 367 orNode->fLeftChild = usetNode->fLeftChild; 368 orNode->fRightChild = leafNode; 369 orNode->fLeftChild->fParent = orNode; 370 orNode->fRightChild->fParent = orNode; 371 usetNode->fLeftChild = orNode; 372 orNode->fParent = usetNode; 373 } 374 } 375 376 377 //------------------------------------------------------------------------ 378 // 379 // getNumCharCategories 380 // 381 //------------------------------------------------------------------------ 382 int32_t RBBISetBuilder::getNumCharCategories() const { 383 return fGroupCount + 3; 384 } 385 386 387 //------------------------------------------------------------------------ 388 // 389 // sawBOF 390 // 391 //------------------------------------------------------------------------ 392 UBool RBBISetBuilder::sawBOF() const { 393 return fSawBOF; 394 } 395 396 397 //------------------------------------------------------------------------ 398 // 399 // getFirstChar Given a runtime RBBI character category, find 400 // the first UChar32 that is in the set of chars 401 // in the category. 402 //------------------------------------------------------------------------ 403 UChar32 RBBISetBuilder::getFirstChar(int32_t category) const { 404 RangeDescriptor *rlRange; 405 UChar32 retVal = (UChar32)-1; 406 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { 407 if (rlRange->fNum == category) { 408 retVal = rlRange->fStartChar; 409 break; 410 } 411 } 412 return retVal; 413 } 414 415 416 417 //------------------------------------------------------------------------ 418 // 419 // printRanges A debugging function. 420 // dump out all of the range definitions. 421 // 422 //------------------------------------------------------------------------ 423 #ifdef RBBI_DEBUG 424 void RBBISetBuilder::printRanges() { 425 RangeDescriptor *rlRange; 426 int i; 427 428 RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n"); 429 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { 430 RBBIDebugPrintf("%2i %4x-%4x ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar); 431 432 for (i=0; i<rlRange->fIncludesSets->size(); i++) { 433 RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); 434 UnicodeString setName = UNICODE_STRING("anon", 4); 435 RBBINode *setRef = usetNode->fParent; 436 if (setRef != NULL) { 437 RBBINode *varRef = setRef->fParent; 438 if (varRef != NULL && varRef->fType == RBBINode::varRef) { 439 setName = varRef->fText; 440 } 441 } 442 RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); 443 } 444 RBBIDebugPrintf("\n"); 445 } 446 } 447 #endif 448 449 450 //------------------------------------------------------------------------ 451 // 452 // printRangeGroups A debugging function. 453 // dump out all of the range groups. 454 // 455 //------------------------------------------------------------------------ 456 #ifdef RBBI_DEBUG 457 void RBBISetBuilder::printRangeGroups() { 458 RangeDescriptor *rlRange; 459 RangeDescriptor *tRange; 460 int i; 461 int lastPrintedGroupNum = 0; 462 463 RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n"); 464 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { 465 int groupNum = rlRange->fNum & 0xbfff; 466 if (groupNum > lastPrintedGroupNum) { 467 lastPrintedGroupNum = groupNum; 468 RBBIDebugPrintf("%2i ", groupNum); 469 470 if (rlRange->fNum & DICT_BIT) { RBBIDebugPrintf(" <DICT> ");} 471 472 for (i=0; i<rlRange->fIncludesSets->size(); i++) { 473 RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); 474 UnicodeString setName = UNICODE_STRING("anon", 4); 475 RBBINode *setRef = usetNode->fParent; 476 if (setRef != NULL) { 477 RBBINode *varRef = setRef->fParent; 478 if (varRef != NULL && varRef->fType == RBBINode::varRef) { 479 setName = varRef->fText; 480 } 481 } 482 RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); 483 } 484 485 i = 0; 486 for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) { 487 if (tRange->fNum == rlRange->fNum) { 488 if (i++ % 5 == 0) { 489 RBBIDebugPrintf("\n "); 490 } 491 RBBIDebugPrintf(" %05x-%05x", tRange->fStartChar, tRange->fEndChar); 492 } 493 } 494 RBBIDebugPrintf("\n"); 495 } 496 } 497 RBBIDebugPrintf("\n"); 498 } 499 #endif 500 501 502 //------------------------------------------------------------------------ 503 // 504 // printSets A debugging function. 505 // dump out all of the set definitions. 506 // 507 //------------------------------------------------------------------------ 508 #ifdef RBBI_DEBUG 509 void RBBISetBuilder::printSets() { 510 int i; 511 512 RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n"); 513 for (i=0; ; i++) { 514 RBBINode *usetNode; 515 RBBINode *setRef; 516 RBBINode *varRef; 517 UnicodeString setName; 518 519 usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i); 520 if (usetNode == NULL) { 521 break; 522 } 523 524 RBBIDebugPrintf("%3d ", i); 525 setName = UNICODE_STRING("anonymous", 9); 526 setRef = usetNode->fParent; 527 if (setRef != NULL) { 528 varRef = setRef->fParent; 529 if (varRef != NULL && varRef->fType == RBBINode::varRef) { 530 setName = varRef->fText; 531 } 532 } 533 RBBI_DEBUG_printUnicodeString(setName); 534 RBBIDebugPrintf(" "); 535 RBBI_DEBUG_printUnicodeString(usetNode->fText); 536 RBBIDebugPrintf("\n"); 537 if (usetNode->fLeftChild != NULL) { 538 RBBINode::printTree(usetNode->fLeftChild, TRUE); 539 } 540 } 541 RBBIDebugPrintf("\n"); 542 } 543 #endif 544 545 546 547 //------------------------------------------------------------------------------------- 548 // 549 // RangeDescriptor copy constructor 550 // 551 //------------------------------------------------------------------------------------- 552 553 RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) { 554 int i; 555 556 this->fStartChar = other.fStartChar; 557 this->fEndChar = other.fEndChar; 558 this->fNum = other.fNum; 559 this->fNext = NULL; 560 UErrorCode oldstatus = status; 561 this->fIncludesSets = new UVector(status); 562 if (U_FAILURE(oldstatus)) { 563 status = oldstatus; 564 } 565 if (U_FAILURE(status)) { 566 return; 567 } 568 /* test for NULL */ 569 if (this->fIncludesSets == 0) { 570 status = U_MEMORY_ALLOCATION_ERROR; 571 return; 572 } 573 574 for (i=0; i<other.fIncludesSets->size(); i++) { 575 this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status); 576 } 577 } 578 579 580 //------------------------------------------------------------------------------------- 581 // 582 // RangeDesriptor default constructor 583 // 584 //------------------------------------------------------------------------------------- 585 RangeDescriptor::RangeDescriptor(UErrorCode &status) { 586 this->fStartChar = 0; 587 this->fEndChar = 0; 588 this->fNum = 0; 589 this->fNext = NULL; 590 UErrorCode oldstatus = status; 591 this->fIncludesSets = new UVector(status); 592 if (U_FAILURE(oldstatus)) { 593 status = oldstatus; 594 } 595 if (U_FAILURE(status)) { 596 return; 597 } 598 /* test for NULL */ 599 if(this->fIncludesSets == 0) { 600 status = U_MEMORY_ALLOCATION_ERROR; 601 return; 602 } 603 604 } 605 606 607 //------------------------------------------------------------------------------------- 608 // 609 // RangeDesriptor Destructor 610 // 611 //------------------------------------------------------------------------------------- 612 RangeDescriptor::~RangeDescriptor() { 613 delete fIncludesSets; 614 fIncludesSets = NULL; 615 } 616 617 //------------------------------------------------------------------------------------- 618 // 619 // RangeDesriptor::split() 620 // 621 //------------------------------------------------------------------------------------- 622 void RangeDescriptor::split(UChar32 where, UErrorCode &status) { 623 U_ASSERT(where>fStartChar && where<=fEndChar); 624 RangeDescriptor *nr = new RangeDescriptor(*this, status); 625 if(nr == 0) { 626 status = U_MEMORY_ALLOCATION_ERROR; 627 return; 628 } 629 if (U_FAILURE(status)) { 630 delete nr; 631 return; 632 } 633 // RangeDescriptor copy constructor copies all fields. 634 // Only need to update those that are different after the split. 635 nr->fStartChar = where; 636 this->fEndChar = where-1; 637 nr->fNext = this->fNext; 638 this->fNext = nr; 639 } 640 641 642 //------------------------------------------------------------------------------------- 643 // 644 // RangeDescriptor::setDictionaryFlag 645 // 646 // Character Category Numbers that include characters from 647 // the original Unicode Set named "dictionary" have bit 14 648 // set to 1. The RBBI runtime engine uses this to trigger 649 // use of the word dictionary. 650 // 651 // This function looks through the Unicode Sets that it 652 // (the range) includes, and sets the bit in fNum when 653 // "dictionary" is among them. 654 // 655 // TODO: a faster way would be to find the set node for 656 // "dictionary" just once, rather than looking it 657 // up by name every time. 658 // 659 //------------------------------------------------------------------------------------- 660 void RangeDescriptor::setDictionaryFlag() { 661 int i; 662 663 static const char16_t *dictionary = u"dictionary"; 664 for (i=0; i<fIncludesSets->size(); i++) { 665 RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i); 666 RBBINode *setRef = usetNode->fParent; 667 if (setRef != nullptr) { 668 RBBINode *varRef = setRef->fParent; 669 if (varRef && varRef->fType == RBBINode::varRef) { 670 const UnicodeString *setName = &varRef->fText; 671 if (setName->compare(dictionary, -1) == 0) { 672 fNum |= RBBISetBuilder::DICT_BIT; 673 break; 674 } 675 } 676 } 677 } 678 } 679 680 681 682 U_NAMESPACE_END 683 684 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 685