1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2007-2008, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * file name: bmpset.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2007jan29 14 * created by: Markus W. Scherer 15 */ 16 17 #include "unicode/utypes.h" 18 #include "unicode/uniset.h" 19 #include "cmemory.h" 20 #include "bmpset.h" 21 22 U_NAMESPACE_BEGIN 23 24 BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) : 25 list(parentList), listLength(parentListLength) { 26 uprv_memset(asciiBytes, 0, sizeof(asciiBytes)); 27 uprv_memset(table7FF, 0, sizeof(table7FF)); 28 uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits)); 29 30 /* 31 * Set the list indexes for binary searches for 32 * U+0800, U+1000, U+2000, .., U+F000, U+10000. 33 * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are 34 * looked up in the bit tables. 35 * The last pair of indexes is for finding supplementary code points. 36 */ 37 list4kStarts[0]=findCodePoint(0x800, 0, listLength-1); 38 int32_t i; 39 for(i=1; i<=0x10; ++i) { 40 list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1); 41 } 42 list4kStarts[0x11]=listLength-1; 43 44 initBits(); 45 overrideIllegal(); 46 } 47 48 BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) : 49 list(newParentList), listLength(newParentListLength) { 50 uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes)); 51 uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF)); 52 uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits)); 53 uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts)); 54 } 55 56 BMPSet::~BMPSet() { 57 } 58 59 /* 60 * Set bits in a bit rectangle in "vertical" bit organization. 61 * start<limit<=0x800 62 */ 63 static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) { 64 int32_t lead=start>>6; 65 int32_t trail=start&0x3f; 66 67 // Set one bit indicating an all-one block. 68 uint32_t bits=(uint32_t)1<<lead; 69 if((start+1)==limit) { // Single-character shortcut. 70 table[trail]|=bits; 71 return; 72 } 73 74 int32_t limitLead=limit>>6; 75 int32_t limitTrail=limit&0x3f; 76 77 if(lead==limitLead) { 78 // Partial vertical bit column. 79 while(trail<limitTrail) { 80 table[trail++]|=bits; 81 } 82 } else { 83 // Partial vertical bit column, 84 // followed by a bit rectangle, 85 // followed by another partial vertical bit column. 86 if(trail>0) { 87 do { 88 table[trail++]|=bits; 89 } while(trail<64); 90 ++lead; 91 } 92 if(lead<limitLead) { 93 bits=~((1<<lead)-1); 94 if(limitLead<0x20) { 95 bits&=(1<<limitLead)-1; 96 } 97 for(trail=0; trail<64; ++trail) { 98 table[trail]|=bits; 99 } 100 } 101 bits=1<<limitLead; 102 for(trail=0; trail<limitTrail; ++trail) { 103 table[trail]|=bits; 104 } 105 } 106 } 107 108 void BMPSet::initBits() { 109 UChar32 start, limit; 110 int32_t listIndex=0; 111 112 // Set asciiBytes[]. 113 do { 114 start=list[listIndex++]; 115 if(listIndex<listLength) { 116 limit=list[listIndex++]; 117 } else { 118 limit=0x110000; 119 } 120 if(start>=0x80) { 121 break; 122 } 123 do { 124 asciiBytes[start++]=1; 125 } while(start<limit && start<0x80); 126 } while(limit<=0x80); 127 128 // Set table7FF[]. 129 while(start<0x800) { 130 set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800); 131 if(limit>0x800) { 132 start=0x800; 133 break; 134 } 135 136 start=list[listIndex++]; 137 if(listIndex<listLength) { 138 limit=list[listIndex++]; 139 } else { 140 limit=0x110000; 141 } 142 } 143 144 // Set bmpBlockBits[]. 145 int32_t minStart=0x800; 146 while(start<0x10000) { 147 if(limit>0x10000) { 148 limit=0x10000; 149 } 150 151 if(start<minStart) { 152 start=minStart; 153 } 154 if(start<limit) { // Else: Another range entirely in a known mixed-value block. 155 if(start&0x3f) { 156 // Mixed-value block of 64 code points. 157 start>>=6; 158 bmpBlockBits[start&0x3f]|=0x10001<<(start>>6); 159 start=(start+1)<<6; // Round up to the next block boundary. 160 minStart=start; // Ignore further ranges in this block. 161 } 162 if(start<limit) { 163 if(start<(limit&~0x3f)) { 164 // Multiple all-ones blocks of 64 code points each. 165 set32x64Bits(bmpBlockBits, start>>6, limit>>6); 166 } 167 168 if(limit&0x3f) { 169 // Mixed-value block of 64 code points. 170 limit>>=6; 171 bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6); 172 limit=(limit+1)<<6; // Round up to the next block boundary. 173 minStart=limit; // Ignore further ranges in this block. 174 } 175 } 176 } 177 178 if(limit==0x10000) { 179 break; 180 } 181 182 start=list[listIndex++]; 183 if(listIndex<listLength) { 184 limit=list[listIndex++]; 185 } else { 186 limit=0x110000; 187 } 188 } 189 } 190 191 /* 192 * Override some bits and bytes to the result of contains(FFFD) 193 * for faster validity checking at runtime. 194 * No need to set 0 values where they were reset to 0 in the constructor 195 * and not modified by initBits(). 196 * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF) 197 * Need to set 0 values for surrogates D800..DFFF. 198 */ 199 void BMPSet::overrideIllegal() { 200 uint32_t bits, mask; 201 int32_t i; 202 203 if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) { 204 // contains(FFFD)==TRUE 205 for(i=0x80; i<0xc0; ++i) { 206 asciiBytes[i]=1; 207 } 208 209 bits=3; // Lead bytes 0xC0 and 0xC1. 210 for(i=0; i<64; ++i) { 211 table7FF[i]|=bits; 212 } 213 214 bits=1; // Lead byte 0xE0. 215 for(i=0; i<32; ++i) { // First half of 4k block. 216 bmpBlockBits[i]|=bits; 217 } 218 219 mask=~(0x10001<<0xd); // Lead byte 0xED. 220 bits=1<<0xd; 221 for(i=32; i<64; ++i) { // Second half of 4k block. 222 bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits; 223 } 224 } else { 225 // contains(FFFD)==FALSE 226 mask=~(0x10001<<0xd); // Lead byte 0xED. 227 for(i=32; i<64; ++i) { // Second half of 4k block. 228 bmpBlockBits[i]&=mask; 229 } 230 } 231 } 232 233 int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const { 234 /* Examples: 235 findCodePoint(c) 236 set list[] c=0 1 3 4 7 8 237 === ============== =========== 238 [] [110000] 0 0 0 0 0 0 239 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 240 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 241 [:Any:] [0, 110000] 1 1 1 1 1 1 242 */ 243 244 // Return the smallest i such that c < list[i]. Assume 245 // list[len - 1] == HIGH and that c is legal (0..HIGH-1). 246 if (c < list[lo]) 247 return lo; 248 // High runner test. c is often after the last range, so an 249 // initial check for this condition pays off. 250 if (lo >= hi || c >= list[hi-1]) 251 return hi; 252 // invariant: c >= list[lo] 253 // invariant: c < list[hi] 254 for (;;) { 255 int32_t i = (lo + hi) >> 1; 256 if (i == lo) { 257 break; // Found! 258 } else if (c < list[i]) { 259 hi = i; 260 } else { 261 lo = i; 262 } 263 } 264 return hi; 265 } 266 267 UBool 268 BMPSet::contains(UChar32 c) const { 269 if((uint32_t)c<=0x7f) { 270 return (UBool)asciiBytes[c]; 271 } else if((uint32_t)c<=0x7ff) { 272 return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0); 273 } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) { 274 int lead=c>>12; 275 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 276 if(twoBits<=1) { 277 // All 64 code points with the same bits 15..6 278 // are either in the set or not. 279 return (UBool)twoBits; 280 } else { 281 // Look up the code point in its 4k block of code points. 282 return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]); 283 } 284 } else if((uint32_t)c<=0x10ffff) { 285 // surrogate or supplementary code point 286 return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]); 287 } else { 288 // Out-of-range code points get FALSE, consistent with long-standing 289 // behavior of UnicodeSet::contains(c). 290 return FALSE; 291 } 292 } 293 294 /* 295 * Check for sufficient length for trail unit for each surrogate pair. 296 * Handle single surrogates as surrogate code points as usual in ICU. 297 */ 298 const UChar * 299 BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const { 300 UChar c, c2; 301 302 if(spanCondition) { 303 // span 304 do { 305 c=*s; 306 if(c<=0x7f) { 307 if(!asciiBytes[c]) { 308 break; 309 } 310 } else if(c<=0x7ff) { 311 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) { 312 break; 313 } 314 } else if(c<0xd800 || c>=0xe000) { 315 int lead=c>>12; 316 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 317 if(twoBits<=1) { 318 // All 64 code points with the same bits 15..6 319 // are either in the set or not. 320 if(twoBits==0) { 321 break; 322 } 323 } else { 324 // Look up the code point in its 4k block of code points. 325 if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { 326 break; 327 } 328 } 329 } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) { 330 // surrogate code point 331 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { 332 break; 333 } 334 } else { 335 // surrogate pair 336 if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) { 337 break; 338 } 339 ++s; 340 } 341 } while(++s<limit); 342 } else { 343 // span not 344 do { 345 c=*s; 346 if(c<=0x7f) { 347 if(asciiBytes[c]) { 348 break; 349 } 350 } else if(c<=0x7ff) { 351 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) { 352 break; 353 } 354 } else if(c<0xd800 || c>=0xe000) { 355 int lead=c>>12; 356 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 357 if(twoBits<=1) { 358 // All 64 code points with the same bits 15..6 359 // are either in the set or not. 360 if(twoBits!=0) { 361 break; 362 } 363 } else { 364 // Look up the code point in its 4k block of code points. 365 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { 366 break; 367 } 368 } 369 } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) { 370 // surrogate code point 371 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { 372 break; 373 } 374 } else { 375 // surrogate pair 376 if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) { 377 break; 378 } 379 ++s; 380 } 381 } while(++s<limit); 382 } 383 return s; 384 } 385 386 /* Symmetrical with span(). */ 387 const UChar * 388 BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const { 389 UChar c, c2; 390 391 if(spanCondition) { 392 // span 393 for(;;) { 394 c=*(--limit); 395 if(c<=0x7f) { 396 if(!asciiBytes[c]) { 397 break; 398 } 399 } else if(c<=0x7ff) { 400 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) { 401 break; 402 } 403 } else if(c<0xd800 || c>=0xe000) { 404 int lead=c>>12; 405 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 406 if(twoBits<=1) { 407 // All 64 code points with the same bits 15..6 408 // are either in the set or not. 409 if(twoBits==0) { 410 break; 411 } 412 } else { 413 // Look up the code point in its 4k block of code points. 414 if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { 415 break; 416 } 417 } 418 } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) { 419 // surrogate code point 420 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { 421 break; 422 } 423 } else { 424 // surrogate pair 425 if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) { 426 break; 427 } 428 --limit; 429 } 430 if(s==limit) { 431 return s; 432 } 433 } 434 } else { 435 // span not 436 for(;;) { 437 c=*(--limit); 438 if(c<=0x7f) { 439 if(asciiBytes[c]) { 440 break; 441 } 442 } else if(c<=0x7ff) { 443 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) { 444 break; 445 } 446 } else if(c<0xd800 || c>=0xe000) { 447 int lead=c>>12; 448 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 449 if(twoBits<=1) { 450 // All 64 code points with the same bits 15..6 451 // are either in the set or not. 452 if(twoBits!=0) { 453 break; 454 } 455 } else { 456 // Look up the code point in its 4k block of code points. 457 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { 458 break; 459 } 460 } 461 } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) { 462 // surrogate code point 463 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { 464 break; 465 } 466 } else { 467 // surrogate pair 468 if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) { 469 break; 470 } 471 --limit; 472 } 473 if(s==limit) { 474 return s; 475 } 476 } 477 } 478 return limit+1; 479 } 480 481 /* 482 * Precheck for sufficient trail bytes at end of string only once per span. 483 * Check validity. 484 */ 485 const uint8_t * 486 BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 487 const uint8_t *limit=s+length; 488 uint8_t b=*s; 489 if((int8_t)b>=0) { 490 // Initial all-ASCII span. 491 if(spanCondition) { 492 do { 493 if(!asciiBytes[b] || ++s==limit) { 494 return s; 495 } 496 b=*s; 497 } while((int8_t)b>=0); 498 } else { 499 do { 500 if(asciiBytes[b] || ++s==limit) { 501 return s; 502 } 503 b=*s; 504 } while((int8_t)b>=0); 505 } 506 length=(int32_t)(limit-s); 507 } 508 509 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 510 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 511 } 512 513 const uint8_t *limit0=limit; 514 515 /* 516 * Make sure that the last 1/2/3/4-byte sequence before limit is complete 517 * or runs into a lead byte. 518 * In the span loop compare s with limit only once 519 * per multi-byte character. 520 * 521 * Give a trailing illegal sequence the same value as the result of contains(FFFD), 522 * including it if that is part of the span, otherwise set limit0 to before 523 * the truncated sequence. 524 */ 525 b=*(limit-1); 526 if((int8_t)b<0) { 527 // b>=0x80: lead or trail byte 528 if(b<0xc0) { 529 // single trail byte, check for preceding 3- or 4-byte lead byte 530 if(length>=2 && (b=*(limit-2))>=0xe0) { 531 limit-=2; 532 if(asciiBytes[0x80]!=spanCondition) { 533 limit0=limit; 534 } 535 } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) { 536 // 4-byte lead byte with only two trail bytes 537 limit-=3; 538 if(asciiBytes[0x80]!=spanCondition) { 539 limit0=limit; 540 } 541 } 542 } else { 543 // lead byte with no trail bytes 544 --limit; 545 if(asciiBytes[0x80]!=spanCondition) { 546 limit0=limit; 547 } 548 } 549 } 550 551 uint8_t t1, t2, t3; 552 553 while(s<limit) { 554 b=*s; 555 if(b<0xc0) { 556 // ASCII; or trail bytes with the result of contains(FFFD). 557 if(spanCondition) { 558 do { 559 if(!asciiBytes[b]) { 560 return s; 561 } else if(++s==limit) { 562 return limit0; 563 } 564 b=*s; 565 } while(b<0xc0); 566 } else { 567 do { 568 if(asciiBytes[b]) { 569 return s; 570 } else if(++s==limit) { 571 return limit0; 572 } 573 b=*s; 574 } while(b<0xc0); 575 } 576 } 577 ++s; // Advance past the lead byte. 578 if(b>=0xe0) { 579 if(b<0xf0) { 580 if( /* handle U+0000..U+FFFF inline */ 581 (t1=(uint8_t)(s[0]-0x80)) <= 0x3f && 582 (t2=(uint8_t)(s[1]-0x80)) <= 0x3f 583 ) { 584 b&=0xf; 585 uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001; 586 if(twoBits<=1) { 587 // All 64 code points with this lead byte and middle trail byte 588 // are either in the set or not. 589 if(twoBits!=(uint32_t)spanCondition) { 590 return s-1; 591 } 592 } else { 593 // Look up the code point in its 4k block of code points. 594 UChar32 c=(b<<12)|(t1<<6)|t2; 595 if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) { 596 return s-1; 597 } 598 } 599 s+=2; 600 continue; 601 } 602 } else if( /* handle U+10000..U+10FFFF inline */ 603 (t1=(uint8_t)(s[0]-0x80)) <= 0x3f && 604 (t2=(uint8_t)(s[1]-0x80)) <= 0x3f && 605 (t3=(uint8_t)(s[2]-0x80)) <= 0x3f 606 ) { 607 // Give an illegal sequence the same value as the result of contains(FFFD). 608 UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3; 609 if( ( (0x10000<=c && c<=0x10ffff) ? 610 containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) : 611 asciiBytes[0x80] 612 ) != spanCondition 613 ) { 614 return s-1; 615 } 616 s+=3; 617 continue; 618 } 619 } else /* 0xc0<=b<0xe0 */ { 620 if( /* handle U+0000..U+07FF inline */ 621 (t1=(uint8_t)(*s-0x80)) <= 0x3f 622 ) { 623 if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) { 624 return s-1; 625 } 626 ++s; 627 continue; 628 } 629 } 630 631 // Give an illegal sequence the same value as the result of contains(FFFD). 632 // Handle each byte of an illegal sequence separately to simplify the code; 633 // no need to optimize error handling. 634 if(asciiBytes[0x80]!=spanCondition) { 635 return s-1; 636 } 637 } 638 639 return limit0; 640 } 641 642 /* 643 * While going backwards through UTF-8 optimize only for ASCII. 644 * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not 645 * possible to tell from the last byte in a multi-byte sequence how many 646 * preceding bytes there should be. Therefore, going backwards through UTF-8 647 * is much harder than going forward. 648 */ 649 int32_t 650 BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 651 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 652 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 653 } 654 655 uint8_t b; 656 657 do { 658 b=s[--length]; 659 if((int8_t)b>=0) { 660 // ASCII sub-span 661 if(spanCondition) { 662 do { 663 if(!asciiBytes[b]) { 664 return length+1; 665 } else if(length==0) { 666 return 0; 667 } 668 b=s[--length]; 669 } while((int8_t)b>=0); 670 } else { 671 do { 672 if(asciiBytes[b]) { 673 return length+1; 674 } else if(length==0) { 675 return 0; 676 } 677 b=s[--length]; 678 } while((int8_t)b>=0); 679 } 680 } 681 682 int32_t prev=length; 683 UChar32 c; 684 if(b<0xc0) { 685 // trail byte: collect a multi-byte character 686 c=utf8_prevCharSafeBody(s, 0, &length, b, -1); 687 if(c<0) { 688 c=0xfffd; 689 } 690 } else { 691 // lead byte in last-trail position 692 c=0xfffd; 693 } 694 // c is a valid code point, not ASCII, not a surrogate 695 if(c<=0x7ff) { 696 if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) { 697 return prev+1; 698 } 699 } else if(c<=0xffff) { 700 int lead=c>>12; 701 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 702 if(twoBits<=1) { 703 // All 64 code points with the same bits 15..6 704 // are either in the set or not. 705 if(twoBits!=(uint32_t)spanCondition) { 706 return prev+1; 707 } 708 } else { 709 // Look up the code point in its 4k block of code points. 710 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) { 711 return prev+1; 712 } 713 } 714 } else { 715 if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) { 716 return prev+1; 717 } 718 } 719 } while(length>0); 720 return 0; 721 } 722 723 U_NAMESPACE_END 724