Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2007-2012, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  bmpset.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2007jan29
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 #include "unicode/uniset.h"
     19 #include "unicode/utf8.h"
     20 #include "unicode/utf16.h"
     21 #include "cmemory.h"
     22 #include "bmpset.h"
     23 #include "uassert.h"
     24 
     25 U_NAMESPACE_BEGIN
     26 
     27 BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
     28         list(parentList), listLength(parentListLength) {
     29     uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
     30     uprv_memset(table7FF, 0, sizeof(table7FF));
     31     uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
     32 
     33     /*
     34      * Set the list indexes for binary searches for
     35      * U+0800, U+1000, U+2000, .., U+F000, U+10000.
     36      * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
     37      * looked up in the bit tables.
     38      * The last pair of indexes is for finding supplementary code points.
     39      */
     40     list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
     41     int32_t i;
     42     for(i=1; i<=0x10; ++i) {
     43         list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
     44     }
     45     list4kStarts[0x11]=listLength-1;
     46 
     47     initBits();
     48     overrideIllegal();
     49 }
     50 
     51 BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
     52         list(newParentList), listLength(newParentListLength) {
     53     uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
     54     uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
     55     uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
     56     uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
     57 }
     58 
     59 BMPSet::~BMPSet() {
     60 }
     61 
     62 /*
     63  * Set bits in a bit rectangle in "vertical" bit organization.
     64  * start<limit<=0x800
     65  */
     66 static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
     67     U_ASSERT(start<limit);
     68     U_ASSERT(limit<=0x800);
     69 
     70     int32_t lead=start>>6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.
     71     int32_t trail=start&0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.
     72 
     73     // Set one bit indicating an all-one block.
     74     uint32_t bits=(uint32_t)1<<lead;
     75     if((start+1)==limit) {  // Single-character shortcut.
     76         table[trail]|=bits;
     77         return;
     78     }
     79 
     80     int32_t limitLead=limit>>6;
     81     int32_t limitTrail=limit&0x3f;
     82 
     83     if(lead==limitLead) {
     84         // Partial vertical bit column.
     85         while(trail<limitTrail) {
     86             table[trail++]|=bits;
     87         }
     88     } else {
     89         // Partial vertical bit column,
     90         // followed by a bit rectangle,
     91         // followed by another partial vertical bit column.
     92         if(trail>0) {
     93             do {
     94                 table[trail++]|=bits;
     95             } while(trail<64);
     96             ++lead;
     97         }
     98         if(lead<limitLead) {
     99             bits=~((1<<lead)-1);
    100             if(limitLead<0x20) {
    101                 bits&=(1<<limitLead)-1;
    102             }
    103             for(trail=0; trail<64; ++trail) {
    104                 table[trail]|=bits;
    105             }
    106         }
    107         // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
    108         // In that case, bits=1<<limitLead is undefined but the bits value
    109         // is not used because trail<limitTrail is already false.
    110         bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);
    111         for(trail=0; trail<limitTrail; ++trail) {
    112             table[trail]|=bits;
    113         }
    114     }
    115 }
    116 
    117 void BMPSet::initBits() {
    118     UChar32 start, limit;
    119     int32_t listIndex=0;
    120 
    121     // Set asciiBytes[].
    122     do {
    123         start=list[listIndex++];
    124         if(listIndex<listLength) {
    125             limit=list[listIndex++];
    126         } else {
    127             limit=0x110000;
    128         }
    129         if(start>=0x80) {
    130             break;
    131         }
    132         do {
    133             asciiBytes[start++]=1;
    134         } while(start<limit && start<0x80);
    135     } while(limit<=0x80);
    136 
    137     // Set table7FF[].
    138     while(start<0x800) {
    139         set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
    140         if(limit>0x800) {
    141             start=0x800;
    142             break;
    143         }
    144 
    145         start=list[listIndex++];
    146         if(listIndex<listLength) {
    147             limit=list[listIndex++];
    148         } else {
    149             limit=0x110000;
    150         }
    151     }
    152 
    153     // Set bmpBlockBits[].
    154     int32_t minStart=0x800;
    155     while(start<0x10000) {
    156         if(limit>0x10000) {
    157             limit=0x10000;
    158         }
    159 
    160         if(start<minStart) {
    161             start=minStart;
    162         }
    163         if(start<limit) {  // Else: Another range entirely in a known mixed-value block.
    164             if(start&0x3f) {
    165                 // Mixed-value block of 64 code points.
    166                 start>>=6;
    167                 bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
    168                 start=(start+1)<<6;  // Round up to the next block boundary.
    169                 minStart=start;      // Ignore further ranges in this block.
    170             }
    171             if(start<limit) {
    172                 if(start<(limit&~0x3f)) {
    173                     // Multiple all-ones blocks of 64 code points each.
    174                     set32x64Bits(bmpBlockBits, start>>6, limit>>6);
    175                 }
    176 
    177                 if(limit&0x3f) {
    178                     // Mixed-value block of 64 code points.
    179                     limit>>=6;
    180                     bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
    181                     limit=(limit+1)<<6;  // Round up to the next block boundary.
    182                     minStart=limit;      // Ignore further ranges in this block.
    183                 }
    184             }
    185         }
    186 
    187         if(limit==0x10000) {
    188             break;
    189         }
    190 
    191         start=list[listIndex++];
    192         if(listIndex<listLength) {
    193             limit=list[listIndex++];
    194         } else {
    195             limit=0x110000;
    196         }
    197     }
    198 }
    199 
    200 /*
    201  * Override some bits and bytes to the result of contains(FFFD)
    202  * for faster validity checking at runtime.
    203  * No need to set 0 values where they were reset to 0 in the constructor
    204  * and not modified by initBits().
    205  * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
    206  * Need to set 0 values for surrogates D800..DFFF.
    207  */
    208 void BMPSet::overrideIllegal() {
    209     uint32_t bits, mask;
    210     int32_t i;
    211 
    212     if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
    213         // contains(FFFD)==TRUE
    214         for(i=0x80; i<0xc0; ++i) {
    215             asciiBytes[i]=1;
    216         }
    217 
    218         bits=3;                 // Lead bytes 0xC0 and 0xC1.
    219         for(i=0; i<64; ++i) {
    220             table7FF[i]|=bits;
    221         }
    222 
    223         bits=1;                 // Lead byte 0xE0.
    224         for(i=0; i<32; ++i) {   // First half of 4k block.
    225             bmpBlockBits[i]|=bits;
    226         }
    227 
    228         mask=~(0x10001<<0xd);   // Lead byte 0xED.
    229         bits=1<<0xd;
    230         for(i=32; i<64; ++i) {  // Second half of 4k block.
    231             bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
    232         }
    233     } else {
    234         // contains(FFFD)==FALSE
    235         mask=~(0x10001<<0xd);   // Lead byte 0xED.
    236         for(i=32; i<64; ++i) {  // Second half of 4k block.
    237             bmpBlockBits[i]&=mask;
    238         }
    239     }
    240 }
    241 
    242 int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
    243     /* Examples:
    244                                        findCodePoint(c)
    245        set              list[]         c=0 1 3 4 7 8
    246        ===              ==============   ===========
    247        []               [110000]         0 0 0 0 0 0
    248        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
    249        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
    250        [:Any:]          [0, 110000]      1 1 1 1 1 1
    251      */
    252 
    253     // Return the smallest i such that c < list[i].  Assume
    254     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
    255     if (c < list[lo])
    256         return lo;
    257     // High runner test.  c is often after the last range, so an
    258     // initial check for this condition pays off.
    259     if (lo >= hi || c >= list[hi-1])
    260         return hi;
    261     // invariant: c >= list[lo]
    262     // invariant: c < list[hi]
    263     for (;;) {
    264         int32_t i = (lo + hi) >> 1;
    265         if (i == lo) {
    266             break; // Found!
    267         } else if (c < list[i]) {
    268             hi = i;
    269         } else {
    270             lo = i;
    271         }
    272     }
    273     return hi;
    274 }
    275 
    276 UBool
    277 BMPSet::contains(UChar32 c) const {
    278     if((uint32_t)c<=0x7f) {
    279         return (UBool)asciiBytes[c];
    280     } else if((uint32_t)c<=0x7ff) {
    281         return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
    282     } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
    283         int lead=c>>12;
    284         uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
    285         if(twoBits<=1) {
    286             // All 64 code points with the same bits 15..6
    287             // are either in the set or not.
    288             return (UBool)twoBits;
    289         } else {
    290             // Look up the code point in its 4k block of code points.
    291             return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
    292         }
    293     } else if((uint32_t)c<=0x10ffff) {
    294         // surrogate or supplementary code point
    295         return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
    296     } else {
    297         // Out-of-range code points get FALSE, consistent with long-standing
    298         // behavior of UnicodeSet::contains(c).
    299         return FALSE;
    300     }
    301 }
    302 
    303 /*
    304  * Check for sufficient length for trail unit for each surrogate pair.
    305  * Handle single surrogates as surrogate code points as usual in ICU.
    306  */
    307 const UChar *
    308 BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
    309     UChar c, c2;
    310 
    311     if(spanCondition) {
    312         // span
    313         do {
    314             c=*s;
    315             if(c<=0x7f) {
    316                 if(!asciiBytes[c]) {
    317                     break;
    318                 }
    319             } else if(c<=0x7ff) {
    320                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
    321                     break;
    322                 }
    323             } else if(c<0xd800 || c>=0xe000) {
    324                 int lead=c>>12;
    325                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
    326                 if(twoBits<=1) {
    327                     // All 64 code points with the same bits 15..6
    328                     // are either in the set or not.
    329                     if(twoBits==0) {
    330                         break;
    331                     }
    332                 } else {
    333                     // Look up the code point in its 4k block of code points.
    334                     if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
    335                         break;
    336                     }
    337                 }
    338             } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
    339                 // surrogate code point
    340                 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
    341                     break;
    342                 }
    343             } else {
    344                 // surrogate pair
    345                 if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
    346                     break;
    347                 }
    348                 ++s;
    349             }
    350         } while(++s<limit);
    351     } else {
    352         // span not
    353         do {
    354             c=*s;
    355             if(c<=0x7f) {
    356                 if(asciiBytes[c]) {
    357                     break;
    358                 }
    359             } else if(c<=0x7ff) {
    360                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
    361                     break;
    362                 }
    363             } else if(c<0xd800 || c>=0xe000) {
    364                 int lead=c>>12;
    365                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
    366                 if(twoBits<=1) {
    367                     // All 64 code points with the same bits 15..6
    368                     // are either in the set or not.
    369                     if(twoBits!=0) {
    370                         break;
    371                     }
    372                 } else {
    373                     // Look up the code point in its 4k block of code points.
    374                     if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
    375                         break;
    376                     }
    377                 }
    378             } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
    379                 // surrogate code point
    380                 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
    381                     break;
    382                 }
    383             } else {
    384                 // surrogate pair
    385                 if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
    386                     break;
    387                 }
    388                 ++s;
    389             }
    390         } while(++s<limit);
    391     }
    392     return s;
    393 }
    394 
    395 /* Symmetrical with span(). */
    396 const UChar *
    397 BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
    398     UChar c, c2;
    399 
    400     if(spanCondition) {
    401         // span
    402         for(;;) {
    403             c=*(--limit);
    404             if(c<=0x7f) {
    405                 if(!asciiBytes[c]) {
    406                     break;
    407                 }
    408             } else if(c<=0x7ff) {
    409                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
    410                     break;
    411                 }
    412             } else if(c<0xd800 || c>=0xe000) {
    413                 int lead=c>>12;
    414                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
    415                 if(twoBits<=1) {
    416                     // All 64 code points with the same bits 15..6
    417                     // are either in the set or not.
    418                     if(twoBits==0) {
    419                         break;
    420                     }
    421                 } else {
    422                     // Look up the code point in its 4k block of code points.
    423                     if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
    424                         break;
    425                     }
    426                 }
    427             } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
    428                 // surrogate code point
    429                 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
    430                     break;
    431                 }
    432             } else {
    433                 // surrogate pair
    434                 if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
    435                     break;
    436                 }
    437                 --limit;
    438             }
    439             if(s==limit) {
    440                 return s;
    441             }
    442         }
    443     } else {
    444         // span not
    445         for(;;) {
    446             c=*(--limit);
    447             if(c<=0x7f) {
    448                 if(asciiBytes[c]) {
    449                     break;
    450                 }
    451             } else if(c<=0x7ff) {
    452                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
    453                     break;
    454                 }
    455             } else if(c<0xd800 || c>=0xe000) {
    456                 int lead=c>>12;
    457                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
    458                 if(twoBits<=1) {
    459                     // All 64 code points with the same bits 15..6
    460                     // are either in the set or not.
    461                     if(twoBits!=0) {
    462                         break;
    463                     }
    464                 } else {
    465                     // Look up the code point in its 4k block of code points.
    466                     if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
    467                         break;
    468                     }
    469                 }
    470             } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
    471                 // surrogate code point
    472                 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
    473                     break;
    474                 }
    475             } else {
    476                 // surrogate pair
    477                 if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
    478                     break;
    479                 }
    480                 --limit;
    481             }
    482             if(s==limit) {
    483                 return s;
    484             }
    485         }
    486     }
    487     return limit+1;
    488 }
    489 
    490 /*
    491  * Precheck for sufficient trail bytes at end of string only once per span.
    492  * Check validity.
    493  */
    494 const uint8_t *
    495 BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
    496     const uint8_t *limit=s+length;
    497     uint8_t b=*s;
    498     if((int8_t)b>=0) {
    499         // Initial all-ASCII span.
    500         if(spanCondition) {
    501             do {
    502                 if(!asciiBytes[b] || ++s==limit) {
    503                     return s;
    504                 }
    505                 b=*s;
    506             } while((int8_t)b>=0);
    507         } else {
    508             do {
    509                 if(asciiBytes[b] || ++s==limit) {
    510                     return s;
    511                 }
    512                 b=*s;
    513             } while((int8_t)b>=0);
    514         }
    515         length=(int32_t)(limit-s);
    516     }
    517 
    518     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
    519         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
    520     }
    521 
    522     const uint8_t *limit0=limit;
    523 
    524     /*
    525      * Make sure that the last 1/2/3/4-byte sequence before limit is complete
    526      * or runs into a lead byte.
    527      * In the span loop compare s with limit only once
    528      * per multi-byte character.
    529      *
    530      * Give a trailing illegal sequence the same value as the result of contains(FFFD),
    531      * including it if that is part of the span, otherwise set limit0 to before
    532      * the truncated sequence.
    533      */
    534     b=*(limit-1);
    535     if((int8_t)b<0) {
    536         // b>=0x80: lead or trail byte
    537         if(b<0xc0) {
    538             // single trail byte, check for preceding 3- or 4-byte lead byte
    539             if(length>=2 && (b=*(limit-2))>=0xe0) {
    540                 limit-=2;
    541                 if(asciiBytes[0x80]!=spanCondition) {
    542                     limit0=limit;
    543                 }
    544             } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
    545                 // 4-byte lead byte with only two trail bytes
    546                 limit-=3;
    547                 if(asciiBytes[0x80]!=spanCondition) {
    548                     limit0=limit;
    549                 }
    550             }
    551         } else {
    552             // lead byte with no trail bytes
    553             --limit;
    554             if(asciiBytes[0x80]!=spanCondition) {
    555                 limit0=limit;
    556             }
    557         }
    558     }
    559 
    560     uint8_t t1, t2, t3;
    561 
    562     while(s<limit) {
    563         b=*s;
    564         if(b<0xc0) {
    565             // ASCII; or trail bytes with the result of contains(FFFD).
    566             if(spanCondition) {
    567                 do {
    568                     if(!asciiBytes[b]) {
    569                         return s;
    570                     } else if(++s==limit) {
    571                         return limit0;
    572                     }
    573                     b=*s;
    574                 } while(b<0xc0);
    575             } else {
    576                 do {
    577                     if(asciiBytes[b]) {
    578                         return s;
    579                     } else if(++s==limit) {
    580                         return limit0;
    581                     }
    582                     b=*s;
    583                 } while(b<0xc0);
    584             }
    585         }
    586         ++s;  // Advance past the lead byte.
    587         if(b>=0xe0) {
    588             if(b<0xf0) {
    589                 if( /* handle U+0000..U+FFFF inline */
    590                     (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
    591                     (t2=(uint8_t)(s[1]-0x80)) <= 0x3f
    592                 ) {
    593                     b&=0xf;
    594                     uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
    595                     if(twoBits<=1) {
    596                         // All 64 code points with this lead byte and middle trail byte
    597                         // are either in the set or not.
    598                         if(twoBits!=(uint32_t)spanCondition) {
    599                             return s-1;
    600                         }
    601                     } else {
    602                         // Look up the code point in its 4k block of code points.
    603                         UChar32 c=(b<<12)|(t1<<6)|t2;
    604                         if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
    605                             return s-1;
    606                         }
    607                     }
    608                     s+=2;
    609                     continue;
    610                 }
    611             } else if( /* handle U+10000..U+10FFFF inline */
    612                 (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
    613                 (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
    614                 (t3=(uint8_t)(s[2]-0x80)) <= 0x3f
    615             ) {
    616                 // Give an illegal sequence the same value as the result of contains(FFFD).
    617                 UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
    618                 if( (   (0x10000<=c && c<=0x10ffff) ?
    619                             containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
    620                             asciiBytes[0x80]
    621                     ) != spanCondition
    622                 ) {
    623                     return s-1;
    624                 }
    625                 s+=3;
    626                 continue;
    627             }
    628         } else /* 0xc0<=b<0xe0 */ {
    629             if( /* handle U+0000..U+07FF inline */
    630                 (t1=(uint8_t)(*s-0x80)) <= 0x3f
    631             ) {
    632                 if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
    633                     return s-1;
    634                 }
    635                 ++s;
    636                 continue;
    637             }
    638         }
    639 
    640         // Give an illegal sequence the same value as the result of contains(FFFD).
    641         // Handle each byte of an illegal sequence separately to simplify the code;
    642         // no need to optimize error handling.
    643         if(asciiBytes[0x80]!=spanCondition) {
    644             return s-1;
    645         }
    646     }
    647 
    648     return limit0;
    649 }
    650 
    651 /*
    652  * While going backwards through UTF-8 optimize only for ASCII.
    653  * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
    654  * possible to tell from the last byte in a multi-byte sequence how many
    655  * preceding bytes there should be. Therefore, going backwards through UTF-8
    656  * is much harder than going forward.
    657  */
    658 int32_t
    659 BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
    660     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
    661         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
    662     }
    663 
    664     uint8_t b;
    665 
    666     do {
    667         b=s[--length];
    668         if((int8_t)b>=0) {
    669             // ASCII sub-span
    670             if(spanCondition) {
    671                 do {
    672                     if(!asciiBytes[b]) {
    673                         return length+1;
    674                     } else if(length==0) {
    675                         return 0;
    676                     }
    677                     b=s[--length];
    678                 } while((int8_t)b>=0);
    679             } else {
    680                 do {
    681                     if(asciiBytes[b]) {
    682                         return length+1;
    683                     } else if(length==0) {
    684                         return 0;
    685                     }
    686                     b=s[--length];
    687                 } while((int8_t)b>=0);
    688             }
    689         }
    690 
    691         int32_t prev=length;
    692         UChar32 c;
    693         // trail byte: collect a multi-byte character
    694         // (or  lead byte in last-trail position)
    695         c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
    696         // c is a valid code point, not ASCII, not a surrogate
    697         if(c<=0x7ff) {
    698             if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
    699                 return prev+1;
    700             }
    701         } else if(c<=0xffff) {
    702             int lead=c>>12;
    703             uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
    704             if(twoBits<=1) {
    705                 // All 64 code points with the same bits 15..6
    706                 // are either in the set or not.
    707                 if(twoBits!=(uint32_t)spanCondition) {
    708                     return prev+1;
    709                 }
    710             } else {
    711                 // Look up the code point in its 4k block of code points.
    712                 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
    713                     return prev+1;
    714                 }
    715             }
    716         } else {
    717             if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
    718                 return prev+1;
    719             }
    720         }
    721     } while(length>0);
    722     return 0;
    723 }
    724 
    725 U_NAMESPACE_END
    726