Home | History | Annotate | Download | only in stringlib
      1 /* stringlib: fastsearch implementation */
      2 
      3 #ifndef STRINGLIB_FASTSEARCH_H
      4 #define STRINGLIB_FASTSEARCH_H
      5 
      6 /* fast search/count implementation, based on a mix between boyer-
      7    moore and horspool, with a few more bells and whistles on the top.
      8    for some more background, see: http://effbot.org/zone/stringlib.htm */
      9 
     10 /* note: fastsearch may access s[n], which isn't a problem when using
     11    Python's ordinary string types, but may cause problems if you're
     12    using this code in other contexts.  also, the count mode returns -1
     13    if there cannot possible be a match in the target string, and 0 if
     14    it has actually checked for matches, but didn't find any.  callers
     15    beware! */
     16 
     17 #define FAST_COUNT 0
     18 #define FAST_SEARCH 1
     19 #define FAST_RSEARCH 2
     20 
     21 #if LONG_BIT >= 128
     22 #define STRINGLIB_BLOOM_WIDTH 128
     23 #elif LONG_BIT >= 64
     24 #define STRINGLIB_BLOOM_WIDTH 64
     25 #elif LONG_BIT >= 32
     26 #define STRINGLIB_BLOOM_WIDTH 32
     27 #else
     28 #error "LONG_BIT is smaller than 32"
     29 #endif
     30 
     31 #define STRINGLIB_BLOOM_ADD(mask, ch) \
     32     ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))
     33 #define STRINGLIB_BLOOM(mask, ch)     \
     34     ((mask &  (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))
     35 
     36 Py_LOCAL_INLINE(Py_ssize_t)
     37 fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
     38            const STRINGLIB_CHAR* p, Py_ssize_t m,
     39            Py_ssize_t maxcount, int mode)
     40 {
     41     unsigned long mask;
     42     Py_ssize_t skip, count = 0;
     43     Py_ssize_t i, j, mlast, w;
     44 
     45     w = n - m;
     46 
     47     if (w < 0 || (mode == FAST_COUNT && maxcount == 0))
     48         return -1;
     49 
     50     /* look for special cases */
     51     if (m <= 1) {
     52         if (m <= 0)
     53             return -1;
     54         /* use special case for 1-character strings */
     55         if (mode == FAST_COUNT) {
     56             for (i = 0; i < n; i++)
     57                 if (s[i] == p[0]) {
     58                     count++;
     59                     if (count == maxcount)
     60                         return maxcount;
     61                 }
     62             return count;
     63         } else if (mode == FAST_SEARCH) {
     64             for (i = 0; i < n; i++)
     65                 if (s[i] == p[0])
     66                     return i;
     67         } else {    /* FAST_RSEARCH */
     68             for (i = n - 1; i > -1; i--)
     69                 if (s[i] == p[0])
     70                     return i;
     71         }
     72         return -1;
     73     }
     74 
     75     mlast = m - 1;
     76     skip = mlast - 1;
     77     mask = 0;
     78 
     79     if (mode != FAST_RSEARCH) {
     80 
     81         /* create compressed boyer-moore delta 1 table */
     82 
     83         /* process pattern[:-1] */
     84         for (i = 0; i < mlast; i++) {
     85             STRINGLIB_BLOOM_ADD(mask, p[i]);
     86             if (p[i] == p[mlast])
     87                 skip = mlast - i - 1;
     88         }
     89         /* process pattern[-1] outside the loop */
     90         STRINGLIB_BLOOM_ADD(mask, p[mlast]);
     91 
     92         for (i = 0; i <= w; i++) {
     93             /* note: using mlast in the skip path slows things down on x86 */
     94             if (s[i+m-1] == p[m-1]) {
     95                 /* candidate match */
     96                 for (j = 0; j < mlast; j++)
     97                     if (s[i+j] != p[j])
     98                         break;
     99                 if (j == mlast) {
    100                     /* got a match! */
    101                     if (mode != FAST_COUNT)
    102                         return i;
    103                     count++;
    104                     if (count == maxcount)
    105                         return maxcount;
    106                     i = i + mlast;
    107                     continue;
    108                 }
    109                 /* miss: check if next character is part of pattern */
    110                 if (!STRINGLIB_BLOOM(mask, s[i+m]))
    111                     i = i + m;
    112                 else
    113                     i = i + skip;
    114             } else {
    115                 /* skip: check if next character is part of pattern */
    116                 if (!STRINGLIB_BLOOM(mask, s[i+m]))
    117                     i = i + m;
    118             }
    119         }
    120     } else {    /* FAST_RSEARCH */
    121 
    122         /* create compressed boyer-moore delta 1 table */
    123 
    124         /* process pattern[0] outside the loop */
    125         STRINGLIB_BLOOM_ADD(mask, p[0]);
    126         /* process pattern[:0:-1] */
    127         for (i = mlast; i > 0; i--) {
    128             STRINGLIB_BLOOM_ADD(mask, p[i]);
    129             if (p[i] == p[0])
    130                 skip = i - 1;
    131         }
    132 
    133         for (i = w; i >= 0; i--) {
    134             if (s[i] == p[0]) {
    135                 /* candidate match */
    136                 for (j = mlast; j > 0; j--)
    137                     if (s[i+j] != p[j])
    138                         break;
    139                 if (j == 0)
    140                     /* got a match! */
    141                     return i;
    142                 /* miss: check if previous character is part of pattern */
    143                 if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1]))
    144                     i = i - m;
    145                 else
    146                     i = i - skip;
    147             } else {
    148                 /* skip: check if previous character is part of pattern */
    149                 if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1]))
    150                     i = i - m;
    151             }
    152         }
    153     }
    154 
    155     if (mode != FAST_COUNT)
    156         return -1;
    157     return count;
    158 }
    159 
    160 #endif
    161