Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2007-2008, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  bmpset.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2007jan29
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 #include "unicode/uniset.h"
     19 #include "cmemory.h"
     20 #include "bmpset.h"
     21 
     22 U_NAMESPACE_BEGIN
     23 
     24 BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
     25         list(parentList), listLength(parentListLength) {
     26     uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
     27     uprv_memset(table7FF, 0, sizeof(table7FF));
     28     uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
     29 
     30     /*
     31      * Set the list indexes for binary searches for
     32      * U+0800, U+1000, U+2000, .., U+F000, U+10000.
     33      * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
     34      * looked up in the bit tables.
     35      * The last pair of indexes is for finding supplementary code points.
     36      */
     37     list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
     38     int32_t i;
     39     for(i=1; i<=0x10; ++i) {
     40         list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
     41     }
     42     list4kStarts[0x11]=listLength-1;
     43 
     44     initBits();
     45     overrideIllegal();
     46 }
     47 
     48 BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
     49         list(newParentList), listLength(newParentListLength) {
     50     uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
     51     uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
     52     uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
     53     uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
     54 }
     55 
     56 BMPSet::~BMPSet() {
     57 }
     58 
     59 /*
     60  * Set bits in a bit rectangle in "vertical" bit organization.
     61  * start<limit<=0x800
     62  */
     63 static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
     64     int32_t lead=start>>6;
     65     int32_t trail=start&0x3f;
     66 
     67     // Set one bit indicating an all-one block.
     68     uint32_t bits=(uint32_t)1<<lead;
     69     if((start+1)==limit) {  // Single-character shortcut.
     70         table[trail]|=bits;
     71         return;
     72     }
     73 
     74     int32_t limitLead=limit>>6;
     75     int32_t limitTrail=limit&0x3f;
     76 
     77     if(lead==limitLead) {
     78         // Partial vertical bit column.
     79         while(trail<limitTrail) {
     80             table[trail++]|=bits;
     81         }
     82     } else {
     83         // Partial vertical bit column,
     84         // followed by a bit rectangle,
     85         // followed by another partial vertical bit column.
     86         if(trail>0) {
     87             do {
     88                 table[trail++]|=bits;
     89             } while(trail<64);
     90             ++lead;
     91         }
     92         if(lead<limitLead) {
     93             bits=~((1<<lead)-1);
     94             if(limitLead<0x20) {
     95                 bits&=(1<<limitLead)-1;
     96             }
     97             for(trail=0; trail<64; ++trail) {
     98                 table[trail]|=bits;
     99             }
    100         }
    101         bits=1<<limitLead;
    102         for(trail=0; trail<limitTrail; ++trail) {
    103             table[trail]|=bits;
    104         }
    105     }
    106 }
    107 
    108 void BMPSet::initBits() {
    109     UChar32 start, limit;
    110     int32_t listIndex=0;
    111 
    112     // Set asciiBytes[].
    113     do {
    114         start=list[listIndex++];
    115         if(listIndex<listLength) {
    116             limit=list[listIndex++];
    117         } else {
    118             limit=0x110000;
    119         }
    120         if(start>=0x80) {
    121             break;
    122         }
    123         do {
    124             asciiBytes[start++]=1;
    125         } while(start<limit && start<0x80);
    126     } while(limit<=0x80);
    127 
    128     // Set table7FF[].
    129     while(start<0x800) {
    130         set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
    131         if(limit>0x800) {
    132             start=0x800;
    133             break;
    134         }
    135 
    136         start=list[listIndex++];
    137         if(listIndex<listLength) {
    138             limit=list[listIndex++];
    139         } else {
    140             limit=0x110000;
    141         }
    142     }
    143 
    144     // Set bmpBlockBits[].
    145     int32_t minStart=0x800;
    146     while(start<0x10000) {
    147         if(limit>0x10000) {
    148             limit=0x10000;
    149         }
    150 
    151         if(start<minStart) {
    152             start=minStart;
    153         }
    154         if(start<limit) {  // Else: Another range entirely in a known mixed-value block.
    155             if(start&0x3f) {
    156                 // Mixed-value block of 64 code points.
    157                 start>>=6;
    158                 bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
    159                 start=(start+1)<<6;  // Round up to the next block boundary.
    160                 minStart=start;      // Ignore further ranges in this block.
    161             }
    162             if(start<limit) {
    163                 if(start<(limit&~0x3f)) {
    164                     // Multiple all-ones blocks of 64 code points each.
    165                     set32x64Bits(bmpBlockBits, start>>6, limit>>6);
    166                 }
    167 
    168                 if(limit&0x3f) {
    169                     // Mixed-value block of 64 code points.
    170                     limit>>=6;
    171                     bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
    172                     limit=(limit+1)<<6;  // Round up to the next block boundary.
    173                     minStart=limit;      // Ignore further ranges in this block.
    174                 }
    175             }
    176         }
    177 
    178         if(limit==0x10000) {
    179             break;
    180         }
    181 
    182         start=list[listIndex++];
    183         if(listIndex<listLength) {
    184             limit=list[listIndex++];
    185         } else {
    186             limit=0x110000;
    187         }
    188     }
    189 }
    190 
    191 /*
    192  * Override some bits and bytes to the result of contains(FFFD)
    193  * for faster validity checking at runtime.
    194  * No need to set 0 values where they were reset to 0 in the constructor
    195  * and not modified by initBits().
    196  * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
    197  * Need to set 0 values for surrogates D800..DFFF.
    198  */
    199 void BMPSet::overrideIllegal() {
    200     uint32_t bits, mask;
    201     int32_t i;
    202 
    203     if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
    204         // contains(FFFD)==TRUE
    205         for(i=0x80; i<0xc0; ++i) {
    206             asciiBytes[i]=1;
    207         }
    208 
    209         bits=3;                 // Lead bytes 0xC0 and 0xC1.
    210         for(i=0; i<64; ++i) {
    211             table7FF[i]|=bits;
    212         }
    213 
    214         bits=1;                 // Lead byte 0xE0.
    215         for(i=0; i<32; ++i) {   // First half of 4k block.
    216             bmpBlockBits[i]|=bits;
    217         }
    218 
    219         mask=~(0x10001<<0xd);   // Lead byte 0xED.
    220         bits=1<<0xd;
    221         for(i=32; i<64; ++i) {  // Second half of 4k block.
    222             bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
    223         }
    224     } else {
    225         // contains(FFFD)==FALSE
    226         mask=~(0x10001<<0xd);   // Lead byte 0xED.
    227         for(i=32; i<64; ++i) {  // Second half of 4k block.
    228             bmpBlockBits[i]&=mask;
    229         }
    230     }
    231 }
    232 
    233 int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
    234     /* Examples:
    235                                        findCodePoint(c)
    236        set              list[]         c=0 1 3 4 7 8
    237        ===              ==============   ===========
    238        []               [110000]         0 0 0 0 0 0
    239        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
    240        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
    241        [:Any:]          [0, 110000]      1 1 1 1 1 1
    242      */
    243 
    244     // Return the smallest i such that c < list[i].  Assume
    245     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
    246     if (c < list[lo])
    247         return lo;
    248     // High runner test.  c is often after the last range, so an
    249     // initial check for this condition pays off.
    250     if (lo >= hi || c >= list[hi-1])
    251         return hi;
    252     // invariant: c >= list[lo]
    253     // invariant: c < list[hi]
    254     for (;;) {
    255         int32_t i = (lo + hi) >> 1;
    256         if (i == lo) {
    257             break; // Found!
    258         } else if (c < list[i]) {
    259             hi = i;
    260         } else {
    261             lo = i;
    262         }
    263     }
    264     return hi;
    265 }
    266 
    267 UBool
    268 BMPSet::contains(UChar32 c) const {
    269     if((uint32_t)c<=0x7f) {
    270         return (UBool)asciiBytes[c];
    271     } else if((uint32_t)c<=0x7ff) {
    272         return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
    273     } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
    274         int lead=c>>12;
    275         uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
    276         if(twoBits<=1) {
    277             // All 64 code points with the same bits 15..6
    278             // are either in the set or not.
    279             return (UBool)twoBits;
    280         } else {
    281             // Look up the code point in its 4k block of code points.
    282             return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
    283         }
    284     } else if((uint32_t)c<=0x10ffff) {
    285         // surrogate or supplementary code point
    286         return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
    287     } else {
    288         // Out-of-range code points get FALSE, consistent with long-standing
    289         // behavior of UnicodeSet::contains(c).
    290         return FALSE;
    291     }
    292 }
    293 
    294 /*
    295  * Check for sufficient length for trail unit for each surrogate pair.
    296  * Handle single surrogates as surrogate code points as usual in ICU.
    297  */
    298 const UChar *
    299 BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
    300     UChar c, c2;
    301 
    302     if(spanCondition) {
    303         // span
    304         do {
    305             c=*s;
    306             if(c<=0x7f) {
    307                 if(!asciiBytes[c]) {
    308                     break;
    309                 }
    310             } else if(c<=0x7ff) {
    311                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
    312                     break;
    313                 }
    314             } else if(c<0xd800 || c>=0xe000) {
    315                 int lead=c>>12;
    316                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
    317                 if(twoBits<=1) {
    318                     // All 64 code points with the same bits 15..6
    319                     // are either in the set or not.
    320                     if(twoBits==0) {
    321                         break;
    322                     }
    323                 } else {
    324                     // Look up the code point in its 4k block of code points.
    325                     if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
    326                         break;
    327                     }
    328                 }
    329             } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
    330                 // surrogate code point
    331                 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
    332                     break;
    333                 }
    334             } else {
    335                 // surrogate pair
    336                 if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
    337                     break;
    338                 }
    339                 ++s;
    340             }
    341         } while(++s<limit);
    342     } else {
    343         // span not
    344         do {
    345             c=*s;
    346             if(c<=0x7f) {
    347                 if(asciiBytes[c]) {
    348                     break;
    349                 }
    350             } else if(c<=0x7ff) {
    351                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
    352                     break;
    353                 }
    354             } else if(c<0xd800 || c>=0xe000) {
    355                 int lead=c>>12;
    356                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
    357                 if(twoBits<=1) {
    358                     // All 64 code points with the same bits 15..6
    359                     // are either in the set or not.
    360                     if(twoBits!=0) {
    361                         break;
    362                     }
    363                 } else {
    364                     // Look up the code point in its 4k block of code points.
    365                     if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
    366                         break;
    367                     }
    368                 }
    369             } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
    370                 // surrogate code point
    371                 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
    372                     break;
    373                 }
    374             } else {
    375                 // surrogate pair
    376                 if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
    377                     break;
    378                 }
    379                 ++s;
    380             }
    381         } while(++s<limit);
    382     }
    383     return s;
    384 }
    385 
    386 /* Symmetrical with span(). */
    387 const UChar *
    388 BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
    389     UChar c, c2;
    390 
    391     if(spanCondition) {
    392         // span
    393         for(;;) {
    394             c=*(--limit);
    395             if(c<=0x7f) {
    396                 if(!asciiBytes[c]) {
    397                     break;
    398                 }
    399             } else if(c<=0x7ff) {
    400                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
    401                     break;
    402                 }
    403             } else if(c<0xd800 || c>=0xe000) {
    404                 int lead=c>>12;
    405                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
    406                 if(twoBits<=1) {
    407                     // All 64 code points with the same bits 15..6
    408                     // are either in the set or not.
    409                     if(twoBits==0) {
    410                         break;
    411                     }
    412                 } else {
    413                     // Look up the code point in its 4k block of code points.
    414                     if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
    415                         break;
    416                     }
    417                 }
    418             } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
    419                 // surrogate code point
    420                 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
    421                     break;
    422                 }
    423             } else {
    424                 // surrogate pair
    425                 if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
    426                     break;
    427                 }
    428                 --limit;
    429             }
    430             if(s==limit) {
    431                 return s;
    432             }
    433         }
    434     } else {
    435         // span not
    436         for(;;) {
    437             c=*(--limit);
    438             if(c<=0x7f) {
    439                 if(asciiBytes[c]) {
    440                     break;
    441                 }
    442             } else if(c<=0x7ff) {
    443                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
    444                     break;
    445                 }
    446             } else if(c<0xd800 || c>=0xe000) {
    447                 int lead=c>>12;
    448                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
    449                 if(twoBits<=1) {
    450                     // All 64 code points with the same bits 15..6
    451                     // are either in the set or not.
    452                     if(twoBits!=0) {
    453                         break;
    454                     }
    455                 } else {
    456                     // Look up the code point in its 4k block of code points.
    457                     if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
    458                         break;
    459                     }
    460                 }
    461             } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
    462                 // surrogate code point
    463                 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
    464                     break;
    465                 }
    466             } else {
    467                 // surrogate pair
    468                 if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
    469                     break;
    470                 }
    471                 --limit;
    472             }
    473             if(s==limit) {
    474                 return s;
    475             }
    476         }
    477     }
    478     return limit+1;
    479 }
    480 
    481 /*
    482  * Precheck for sufficient trail bytes at end of string only once per span.
    483  * Check validity.
    484  */
    485 const uint8_t *
    486 BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
    487     const uint8_t *limit=s+length;
    488     uint8_t b=*s;
    489     if((int8_t)b>=0) {
    490         // Initial all-ASCII span.
    491         if(spanCondition) {
    492             do {
    493                 if(!asciiBytes[b] || ++s==limit) {
    494                     return s;
    495                 }
    496                 b=*s;
    497             } while((int8_t)b>=0);
    498         } else {
    499             do {
    500                 if(asciiBytes[b] || ++s==limit) {
    501                     return s;
    502                 }
    503                 b=*s;
    504             } while((int8_t)b>=0);
    505         }
    506         length=(int32_t)(limit-s);
    507     }
    508 
    509     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
    510         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
    511     }
    512 
    513     const uint8_t *limit0=limit;
    514 
    515     /*
    516      * Make sure that the last 1/2/3/4-byte sequence before limit is complete
    517      * or runs into a lead byte.
    518      * In the span loop compare s with limit only once
    519      * per multi-byte character.
    520      *
    521      * Give a trailing illegal sequence the same value as the result of contains(FFFD),
    522      * including it if that is part of the span, otherwise set limit0 to before
    523      * the truncated sequence.
    524      */
    525     b=*(limit-1);
    526     if((int8_t)b<0) {
    527         // b>=0x80: lead or trail byte
    528         if(b<0xc0) {
    529             // single trail byte, check for preceding 3- or 4-byte lead byte
    530             if(length>=2 && (b=*(limit-2))>=0xe0) {
    531                 limit-=2;
    532                 if(asciiBytes[0x80]!=spanCondition) {
    533                     limit0=limit;
    534                 }
    535             } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
    536                 // 4-byte lead byte with only two trail bytes
    537                 limit-=3;
    538                 if(asciiBytes[0x80]!=spanCondition) {
    539                     limit0=limit;
    540                 }
    541             }
    542         } else {
    543             // lead byte with no trail bytes
    544             --limit;
    545             if(asciiBytes[0x80]!=spanCondition) {
    546                 limit0=limit;
    547             }
    548         }
    549     }
    550 
    551     uint8_t t1, t2, t3;
    552 
    553     while(s<limit) {
    554         b=*s;
    555         if(b<0xc0) {
    556             // ASCII; or trail bytes with the result of contains(FFFD).
    557             if(spanCondition) {
    558                 do {
    559                     if(!asciiBytes[b]) {
    560                         return s;
    561                     } else if(++s==limit) {
    562                         return limit0;
    563                     }
    564                     b=*s;
    565                 } while(b<0xc0);
    566             } else {
    567                 do {
    568                     if(asciiBytes[b]) {
    569                         return s;
    570                     } else if(++s==limit) {
    571                         return limit0;
    572                     }
    573                     b=*s;
    574                 } while(b<0xc0);
    575             }
    576         }
    577         ++s;  // Advance past the lead byte.
    578         if(b>=0xe0) {
    579             if(b<0xf0) {
    580                 if( /* handle U+0000..U+FFFF inline */
    581                     (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
    582                     (t2=(uint8_t)(s[1]-0x80)) <= 0x3f
    583                 ) {
    584                     b&=0xf;
    585                     uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
    586                     if(twoBits<=1) {
    587                         // All 64 code points with this lead byte and middle trail byte
    588                         // are either in the set or not.
    589                         if(twoBits!=(uint32_t)spanCondition) {
    590                             return s-1;
    591                         }
    592                     } else {
    593                         // Look up the code point in its 4k block of code points.
    594                         UChar32 c=(b<<12)|(t1<<6)|t2;
    595                         if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
    596                             return s-1;
    597                         }
    598                     }
    599                     s+=2;
    600                     continue;
    601                 }
    602             } else if( /* handle U+10000..U+10FFFF inline */
    603                 (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
    604                 (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
    605                 (t3=(uint8_t)(s[2]-0x80)) <= 0x3f
    606             ) {
    607                 // Give an illegal sequence the same value as the result of contains(FFFD).
    608                 UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
    609                 if( (   (0x10000<=c && c<=0x10ffff) ?
    610                             containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
    611                             asciiBytes[0x80]
    612                     ) != spanCondition
    613                 ) {
    614                     return s-1;
    615                 }
    616                 s+=3;
    617                 continue;
    618             }
    619         } else /* 0xc0<=b<0xe0 */ {
    620             if( /* handle U+0000..U+07FF inline */
    621                 (t1=(uint8_t)(*s-0x80)) <= 0x3f
    622             ) {
    623                 if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
    624                     return s-1;
    625                 }
    626                 ++s;
    627                 continue;
    628             }
    629         }
    630 
    631         // Give an illegal sequence the same value as the result of contains(FFFD).
    632         // Handle each byte of an illegal sequence separately to simplify the code;
    633         // no need to optimize error handling.
    634         if(asciiBytes[0x80]!=spanCondition) {
    635             return s-1;
    636         }
    637     }
    638 
    639     return limit0;
    640 }
    641 
    642 /*
    643  * While going backwards through UTF-8 optimize only for ASCII.
    644  * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
    645  * possible to tell from the last byte in a multi-byte sequence how many
    646  * preceding bytes there should be. Therefore, going backwards through UTF-8
    647  * is much harder than going forward.
    648  */
    649 int32_t
    650 BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
    651     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
    652         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
    653     }
    654 
    655     uint8_t b;
    656 
    657     do {
    658         b=s[--length];
    659         if((int8_t)b>=0) {
    660             // ASCII sub-span
    661             if(spanCondition) {
    662                 do {
    663                     if(!asciiBytes[b]) {
    664                         return length+1;
    665                     } else if(length==0) {
    666                         return 0;
    667                     }
    668                     b=s[--length];
    669                 } while((int8_t)b>=0);
    670             } else {
    671                 do {
    672                     if(asciiBytes[b]) {
    673                         return length+1;
    674                     } else if(length==0) {
    675                         return 0;
    676                     }
    677                     b=s[--length];
    678                 } while((int8_t)b>=0);
    679             }
    680         }
    681 
    682         int32_t prev=length;
    683         UChar32 c;
    684         if(b<0xc0) {
    685             // trail byte: collect a multi-byte character
    686             c=utf8_prevCharSafeBody(s, 0, &length, b, -1);
    687             if(c<0) {
    688                 c=0xfffd;
    689             }
    690         } else {
    691             // lead byte in last-trail position
    692             c=0xfffd;
    693         }
    694         // c is a valid code point, not ASCII, not a surrogate
    695         if(c<=0x7ff) {
    696             if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
    697                 return prev+1;
    698             }
    699         } else if(c<=0xffff) {
    700             int lead=c>>12;
    701             uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
    702             if(twoBits<=1) {
    703                 // All 64 code points with the same bits 15..6
    704                 // are either in the set or not.
    705                 if(twoBits!=(uint32_t)spanCondition) {
    706                     return prev+1;
    707                 }
    708             } else {
    709                 // Look up the code point in its 4k block of code points.
    710                 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
    711                     return prev+1;
    712                 }
    713             }
    714         } else {
    715             if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
    716                 return prev+1;
    717             }
    718         }
    719     } while(length>0);
    720     return 0;
    721 }
    722 
    723 U_NAMESPACE_END
    724