Home | History | Annotate | Download | only in stringlib
      1 /* Finding the optimal width of unicode characters in a buffer */
      2 
/* This header only makes sense for the Unicode instantiations of
   stringlib; refuse to compile for any other one. */
#if !STRINGLIB_IS_UNICODE
# error "find_max_char.h is specific to Unicode"
#endif

/* Mask selecting the most significant bit (0x80) of every byte of a C
   'long'.  ANDing it with a 'long' loaded from a buffer of 1-byte chars
   gives a non-zero result exactly when at least one of those chars is
   non-ASCII (>= 0x80). */
#if (SIZEOF_LONG == 8)
# define UCS1_ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define UCS1_ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif

/* Variant for 1-byte characters: the answer can only be 127 or 255. */
#if STRINGLIB_SIZEOF_CHAR == 1
     18 
     19 Py_LOCAL_INLINE(Py_UCS4)
     20 STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
     21 {
     22     const unsigned char *p = (const unsigned char *) begin;
     23     const unsigned char *aligned_end =
     24             (const unsigned char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
     25 
     26     while (p < end) {
     27         if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
     28             /* Help register allocation */
     29             const unsigned char *_p = p;
     30             while (_p < aligned_end) {
     31                 unsigned long value = *(unsigned long *) _p;
     32                 if (value & UCS1_ASCII_CHAR_MASK)
     33                     return 255;
     34                 _p += SIZEOF_LONG;
     35             }
     36             p = _p;
     37             if (p == end)
     38                 break;
     39         }
     40         if (*p++ & 0x80)
     41             return 255;
     42     }
     43     return 127;
     44 }
     45 
     46 #undef ASCII_CHAR_MASK
     47 
     48 #else /* STRINGLIB_SIZEOF_CHAR == 1 */
     49 
     50 #define MASK_ASCII 0xFFFFFF80
     51 #define MASK_UCS1 0xFFFFFF00
     52 #define MASK_UCS2 0xFFFF0000
     53 
     54 #define MAX_CHAR_ASCII 0x7f
     55 #define MAX_CHAR_UCS1  0xff
     56 #define MAX_CHAR_UCS2  0xffff
     57 #define MAX_CHAR_UCS4  0x10ffff
     58 
     59 Py_LOCAL_INLINE(Py_UCS4)
     60 STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
     61 {
     62 #if STRINGLIB_SIZEOF_CHAR == 2
     63     const Py_UCS4 mask_limit = MASK_UCS1;
     64     const Py_UCS4 max_char_limit = MAX_CHAR_UCS2;
     65 #elif STRINGLIB_SIZEOF_CHAR == 4
     66     const Py_UCS4 mask_limit = MASK_UCS2;
     67     const Py_UCS4 max_char_limit = MAX_CHAR_UCS4;
     68 #else
     69 #error Invalid STRINGLIB_SIZEOF_CHAR (must be 1, 2 or 4)
     70 #endif
     71     Py_UCS4 mask;
     72     Py_ssize_t n = end - begin;
     73     const STRINGLIB_CHAR *p = begin;
     74     const STRINGLIB_CHAR *unrolled_end = begin + _Py_SIZE_ROUND_DOWN(n, 4);
     75     Py_UCS4 max_char;
     76 
     77     max_char = MAX_CHAR_ASCII;
     78     mask = MASK_ASCII;
     79     while (p < unrolled_end) {
     80         STRINGLIB_CHAR bits = p[0] | p[1] | p[2] | p[3];
     81         if (bits & mask) {
     82             if (mask == mask_limit) {
     83                 /* Limit reached */
     84                 return max_char_limit;
     85             }
     86             if (mask == MASK_ASCII) {
     87                 max_char = MAX_CHAR_UCS1;
     88                 mask = MASK_UCS1;
     89             }
     90             else {
     91                 /* mask can't be MASK_UCS2 because of mask_limit above */
     92                 assert(mask == MASK_UCS1);
     93                 max_char = MAX_CHAR_UCS2;
     94                 mask = MASK_UCS2;
     95             }
     96             /* We check the new mask on the same chars in the next iteration */
     97             continue;
     98         }
     99         p += 4;
    100     }
    101     while (p < end) {
    102         if (p[0] & mask) {
    103             if (mask == mask_limit) {
    104                 /* Limit reached */
    105                 return max_char_limit;
    106             }
    107             if (mask == MASK_ASCII) {
    108                 max_char = MAX_CHAR_UCS1;
    109                 mask = MASK_UCS1;
    110             }
    111             else {
    112                 /* mask can't be MASK_UCS2 because of mask_limit above */
    113                 assert(mask == MASK_UCS1);
    114                 max_char = MAX_CHAR_UCS2;
    115                 mask = MASK_UCS2;
    116             }
    117             /* We check the new mask on the same chars in the next iteration */
    118             continue;
    119         }
    120         p++;
    121     }
    122     return max_char;
    123 }
    124 
/* The range masks and limits are private to this header: undefine them so
   they do not leak into the including file. */
#undef MASK_ASCII
#undef MASK_UCS1
#undef MASK_UCS2
#undef MAX_CHAR_ASCII
#undef MAX_CHAR_UCS1
#undef MAX_CHAR_UCS2
#undef MAX_CHAR_UCS4

#endif /* STRINGLIB_SIZEOF_CHAR == 1 */
    134 
    135