Home | History | Annotate | Download | only in impl
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  ******************************************************************************
      5  *
      6  *   Copyright (C) 2009-2015, International Business Machines
      7  *   Corporation and others.  All Rights Reserved.
      8  *
      9  ******************************************************************************
     10  */
     11 
     12 package com.ibm.icu.impl;
     13 
     14 import com.ibm.icu.text.UnicodeSet.SpanCondition;
     15 import com.ibm.icu.util.OutputInt;
     16 
     17 /**
     18  * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.
     19  *
     20  * Latin-1: Look up bytes.
     21  * 2-byte characters: Bits organized vertically.
     22  * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges.
     23  * Supplementary characters: Binary search over
     24  * the supplementary part of the parent set's inversion list.
     25  */
     26 public final class BMPSet {
     27     public static int U16_SURROGATE_OFFSET = ((0xd800 << 10) + 0xdc00 - 0x10000);
     28 
     29     /**
     30      * One boolean ('true' or 'false') per Latin-1 character.
     31      */
     32     private boolean[] latin1Contains;
     33 
     34     /**
     35      * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points
     36      * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}
     37      * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)
     38      *
     39      * Bits for 0..FF are unused (0).
     40      */
     41     private int[] table7FF;
     42 
     43     /**
     44      * One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks
     45      * correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12}
     46      * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit
     47      * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed
     48      * and set.contains(c) must be called.
     49      *
     50      * Bits for 0..7FF are unused (0).
     51      */
     52     private int[] bmpBlockBits;
     53 
     54     /**
     55      * Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000,
     56      * U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
     57      * always looked up in the bit tables. The last pair of indexes is for finding supplementary code points.
     58      */
     59     private int[] list4kStarts;
     60 
     61     /**
     62      * The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for
     63      * supplementary code points. The list is terminated with list[listLength-1]=0x110000.
     64      */
     65     private final int[] list;
     66     private final int listLength; // length used; list may be longer to minimize reallocs
     67 
     68     public BMPSet(final int[] parentList, int parentListLength) {
     69         list = parentList;
     70         listLength = parentListLength;
     71         latin1Contains = new boolean[0x100];
     72         table7FF = new int[64];
     73         bmpBlockBits = new int[64];
     74         list4kStarts = new int[18];
     75 
     76         /*
     77          * Set the list indexes for binary searches for U+0800, U+1000, U+2000, .., U+F000, U+10000. U+0800 is the
     78          * first 3-byte-UTF-8 code point. Lower code points are looked up in the bit tables. The last pair of
     79          * indexes is for finding supplementary code points.
     80          */
     81         list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1);
     82         int i;
     83         for (i = 1; i <= 0x10; ++i) {
     84             list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1);
     85         }
     86         list4kStarts[0x11] = listLength - 1;
     87 
     88         initBits();
     89     }
     90 
     91     public BMPSet(final BMPSet otherBMPSet, final int[] newParentList, int newParentListLength) {
     92         list = newParentList;
     93         listLength = newParentListLength;
     94         latin1Contains = otherBMPSet.latin1Contains.clone();
     95         table7FF = otherBMPSet.table7FF.clone();
     96         bmpBlockBits = otherBMPSet.bmpBlockBits.clone();
     97         list4kStarts = otherBMPSet.list4kStarts.clone();
     98     }
     99 
    100     public boolean contains(int c) {
    101         if (c <= 0xff) {
    102             return (latin1Contains[c]);
    103         } else if (c <= 0x7ff) {
    104             return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0);
    105         } else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) {
    106             int lead = c >> 12;
    107             int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
    108             if (twoBits <= 1) {
    109                 // All 64 code points with the same bits 15..6
    110                 // are either in the set or not.
    111                 return (0 != twoBits);
    112             } else {
    113                 // Look up the code point in its 4k block of code points.
    114                 return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]);
    115             }
    116         } else if (c <= 0x10ffff) {
    117             // surrogate or supplementary code point
    118             return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
    119         } else {
    120             // Out-of-range code points get false, consistent with long-standing
    121             // behavior of UnicodeSet.contains(c).
    122             return false;
    123         }
    124     }
    125 
    126     /**
    127      * Span the initial substring for which each character c has spanCondition==contains(c). It must be
    128      * spanCondition==0 or 1.
    129      *
    130      * @param start The start index
    131      * @param outCount If not null: Receives the number of code points in the span.
    132      * @return the limit (exclusive end) of the span
    133      *
    134      * NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for
    135      * sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points
    136      * as usual in ICU.
    137      */
    138     public final int span(CharSequence s, int start, SpanCondition spanCondition,
    139             OutputInt outCount) {
    140         char c, c2;
    141         int i = start;
    142         int limit = s.length();
    143         int numSupplementary = 0;
    144         if (SpanCondition.NOT_CONTAINED != spanCondition) {
    145             // span
    146             while (i < limit) {
    147                 c = s.charAt(i);
    148                 if (c <= 0xff) {
    149                     if (!latin1Contains[c]) {
    150                         break;
    151                     }
    152                 } else if (c <= 0x7ff) {
    153                     if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
    154                         break;
    155                     }
    156                 } else if (c < 0xd800 ||
    157                            c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
    158                     int lead = c >> 12;
    159                     int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
    160                     if (twoBits <= 1) {
    161                         // All 64 code points with the same bits 15..6
    162                         // are either in the set or not.
    163                         if (twoBits == 0) {
    164                             break;
    165                         }
    166                     } else {
    167                         // Look up the code point in its 4k block of code points.
    168                         if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
    169                             break;
    170                         }
    171                     }
    172                 } else {
    173                     // surrogate pair
    174                     int supplementary = Character.toCodePoint(c, c2);
    175                     if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
    176                         break;
    177                     }
    178                     ++numSupplementary;
    179                     ++i;
    180                 }
    181                 ++i;
    182             }
    183         } else {
    184             // span not
    185             while (i < limit) {
    186                 c = s.charAt(i);
    187                 if (c <= 0xff) {
    188                     if (latin1Contains[c]) {
    189                         break;
    190                     }
    191                 } else if (c <= 0x7ff) {
    192                     if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
    193                         break;
    194                     }
    195                 } else if (c < 0xd800 ||
    196                            c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
    197                     int lead = c >> 12;
    198                     int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
    199                     if (twoBits <= 1) {
    200                         // All 64 code points with the same bits 15..6
    201                         // are either in the set or not.
    202                         if (twoBits != 0) {
    203                             break;
    204                         }
    205                     } else {
    206                         // Look up the code point in its 4k block of code points.
    207                         if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
    208                             break;
    209                         }
    210                     }
    211                 } else {
    212                     // surrogate pair
    213                     int supplementary = Character.toCodePoint(c, c2);
    214                     if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
    215                         break;
    216                     }
    217                     ++numSupplementary;
    218                     ++i;
    219                 }
    220                 ++i;
    221             }
    222         }
    223         if (outCount != null) {
    224             int spanLength = i - start;
    225             outCount.value = spanLength - numSupplementary;  // number of code points
    226         }
    227         return i;
    228     }
    229 
    230     /**
    231      * Symmetrical with span().
    232      * Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >=
    233      * limit and spanCondition==0 or 1.
    234      *
    235      * @return The string index which starts the span (i.e. inclusive).
    236      */
    237     public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
    238         char c, c2;
    239 
    240         if (SpanCondition.NOT_CONTAINED != spanCondition) {
    241             // span
    242             for (;;) {
    243                 c = s.charAt(--limit);
    244                 if (c <= 0xff) {
    245                     if (!latin1Contains[c]) {
    246                         break;
    247                     }
    248                 } else if (c <= 0x7ff) {
    249                     if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
    250                         break;
    251                     }
    252                 } else if (c < 0xd800 ||
    253                            c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
    254                     int lead = c >> 12;
    255                     int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
    256                     if (twoBits <= 1) {
    257                         // All 64 code points with the same bits 15..6
    258                         // are either in the set or not.
    259                         if (twoBits == 0) {
    260                             break;
    261                         }
    262                     } else {
    263                         // Look up the code point in its 4k block of code points.
    264                         if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
    265                             break;
    266                         }
    267                     }
    268                 } else {
    269                     // surrogate pair
    270                     int supplementary = Character.toCodePoint(c2, c);
    271                     if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
    272                         break;
    273                     }
    274                     --limit;
    275                 }
    276                 if (0 == limit) {
    277                     return 0;
    278                 }
    279             }
    280         } else {
    281             // span not
    282             for (;;) {
    283                 c = s.charAt(--limit);
    284                 if (c <= 0xff) {
    285                     if (latin1Contains[c]) {
    286                         break;
    287                     }
    288                 } else if (c <= 0x7ff) {
    289                     if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
    290                         break;
    291                     }
    292                 } else if (c < 0xd800 ||
    293                            c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
    294                     int lead = c >> 12;
    295                     int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
    296                     if (twoBits <= 1) {
    297                         // All 64 code points with the same bits 15..6
    298                         // are either in the set or not.
    299                         if (twoBits != 0) {
    300                             break;
    301                         }
    302                     } else {
    303                         // Look up the code point in its 4k block of code points.
    304                         if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
    305                             break;
    306                         }
    307                     }
    308                 } else {
    309                     // surrogate pair
    310                     int supplementary = Character.toCodePoint(c2, c);
    311                     if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
    312                         break;
    313                     }
    314                     --limit;
    315                 }
    316                 if (0 == limit) {
    317                     return 0;
    318                 }
    319             }
    320         }
    321         return limit + 1;
    322     }
    323 
    324     /**
    325      * Set bits in a bit rectangle in "vertical" bit organization. start<limit<=0x800
    326      */
    327     private static void set32x64Bits(int[] table, int start, int limit) {
    328         assert (64 == table.length);
    329         int lead = start >> 6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.
    330         int trail = start & 0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.
    331 
    332         // Set one bit indicating an all-one block.
    333         int bits = 1 << lead;
    334         if ((start + 1) == limit) { // Single-character shortcut.
    335             table[trail] |= bits;
    336             return;
    337         }
    338 
    339         int limitLead = limit >> 6;
    340         int limitTrail = limit & 0x3f;
    341 
    342         if (lead == limitLead) {
    343             // Partial vertical bit column.
    344             while (trail < limitTrail) {
    345                 table[trail++] |= bits;
    346             }
    347         } else {
    348             // Partial vertical bit column,
    349             // followed by a bit rectangle,
    350             // followed by another partial vertical bit column.
    351             if (trail > 0) {
    352                 do {
    353                     table[trail++] |= bits;
    354                 } while (trail < 64);
    355                 ++lead;
    356             }
    357             if (lead < limitLead) {
    358                 bits = ~((1 << lead) - 1);
    359                 if (limitLead < 0x20) {
    360                     bits &= (1 << limitLead) - 1;
    361                 }
    362                 for (trail = 0; trail < 64; ++trail) {
    363                     table[trail] |= bits;
    364                 }
    365             }
    366             // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
    367             // In that case, bits=1<<limitLead == 1<<0 == 1
    368             // (because Java << uses only the lower 5 bits of the shift operand)
    369             // but the bits value is not used because trail<limitTrail is already false.
    370             bits = 1 << limitLead;
    371             for (trail = 0; trail < limitTrail; ++trail) {
    372                 table[trail] |= bits;
    373             }
    374         }
    375     }
    376 
    377     private void initBits() {
    378         int start, limit;
    379         int listIndex = 0;
    380 
    381         // Set latin1Contains[].
    382         do {
    383             start = list[listIndex++];
    384             if (listIndex < listLength) {
    385                 limit = list[listIndex++];
    386             } else {
    387                 limit = 0x110000;
    388             }
    389             if (start >= 0x100) {
    390                 break;
    391             }
    392             do {
    393                 latin1Contains[start++] = true;
    394             } while (start < limit && start < 0x100);
    395         } while (limit <= 0x100);
    396 
    397         // Set table7FF[].
    398         while (start < 0x800) {
    399             set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800);
    400             if (limit > 0x800) {
    401                 start = 0x800;
    402                 break;
    403             }
    404 
    405             start = list[listIndex++];
    406             if (listIndex < listLength) {
    407                 limit = list[listIndex++];
    408             } else {
    409                 limit = 0x110000;
    410             }
    411         }
    412 
    413         // Set bmpBlockBits[].
    414         int minStart = 0x800;
    415         while (start < 0x10000) {
    416             if (limit > 0x10000) {
    417                 limit = 0x10000;
    418             }
    419 
    420             if (start < minStart) {
    421                 start = minStart;
    422             }
    423             if (start < limit) { // Else: Another range entirely in a known mixed-value block.
    424                 if (0 != (start & 0x3f)) {
    425                     // Mixed-value block of 64 code points.
    426                     start >>= 6;
    427                     bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6);
    428                     start = (start + 1) << 6; // Round up to the next block boundary.
    429                     minStart = start; // Ignore further ranges in this block.
    430                 }
    431                 if (start < limit) {
    432                     if (start < (limit & ~0x3f)) {
    433                         // Multiple all-ones blocks of 64 code points each.
    434                         set32x64Bits(bmpBlockBits, start >> 6, limit >> 6);
    435                     }
    436 
    437                     if (0 != (limit & 0x3f)) {
    438                         // Mixed-value block of 64 code points.
    439                         limit >>= 6;
    440                         bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6);
    441                         limit = (limit + 1) << 6; // Round up to the next block boundary.
    442                         minStart = limit; // Ignore further ranges in this block.
    443                     }
    444                 }
    445             }
    446 
    447             if (limit == 0x10000) {
    448                 break;
    449             }
    450 
    451             start = list[listIndex++];
    452             if (listIndex < listLength) {
    453                 limit = list[listIndex++];
    454             } else {
    455                 limit = 0x110000;
    456             }
    457         }
    458     }
    459 
    460 
    461     /**
    462      * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code
    463      * points in a certain range.
    464      *
    465      * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and
    466      * hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1.
    467      *
    468      * @param c
    469      *            a character in a subrange of MIN_VALUE..MAX_VALUE
    470      * @param lo
    471      *            The lowest index to be returned.
    472      * @param hi
    473      *            The highest index to be returned.
    474      * @return the smallest integer i in the range lo..hi, inclusive, such that c < list[i]
    475      */
    476     private int findCodePoint(int c, int lo, int hi) {
    477         /* Examples:
    478                                            findCodePoint(c)
    479            set              list[]         c=0 1 3 4 7 8
    480            ===              ==============   ===========
    481            []               [110000]         0 0 0 0 0 0
    482            [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
    483            [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
    484            [:Any:]          [0, 110000]      1 1 1 1 1 1
    485          */
    486 
    487         // Return the smallest i such that c < list[i]. Assume
    488         // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
    489         if (c < list[lo])
    490             return lo;
    491         // High runner test. c is often after the last range, so an
    492         // initial check for this condition pays off.
    493         if (lo >= hi || c >= list[hi - 1])
    494             return hi;
    495         // invariant: c >= list[lo]
    496         // invariant: c < list[hi]
    497         for (;;) {
    498             int i = (lo + hi) >>> 1;
    499             if (i == lo) {
    500                 break; // Found!
    501             } else if (c < list[i]) {
    502                 hi = i;
    503             } else {
    504                 lo = i;
    505             }
    506         }
    507         return hi;
    508     }
    509 
    510     private final boolean containsSlow(int c, int lo, int hi) {
    511         return (0 != (findCodePoint(c, lo, hi) & 1));
    512     }
    513 }
    514