1 /* stringlib: fastsearch implementation */ 2 3 #define STRINGLIB_FASTSEARCH_H 4 5 /* fast search/count implementation, based on a mix between boyer- 6 moore and horspool, with a few more bells and whistles on the top. 7 for some more background, see: http://effbot.org/zone/stringlib.htm */ 8 9 /* note: fastsearch may access s[n], which isn't a problem when using 10 Python's ordinary string types, but may cause problems if you're 11 using this code in other contexts. also, the count mode returns -1 12 if there cannot possible be a match in the target string, and 0 if 13 it has actually checked for matches, but didn't find any. callers 14 beware! */ 15 16 #define FAST_COUNT 0 17 #define FAST_SEARCH 1 18 #define FAST_RSEARCH 2 19 20 #if LONG_BIT >= 128 21 #define STRINGLIB_BLOOM_WIDTH 128 22 #elif LONG_BIT >= 64 23 #define STRINGLIB_BLOOM_WIDTH 64 24 #elif LONG_BIT >= 32 25 #define STRINGLIB_BLOOM_WIDTH 32 26 #else 27 #error "LONG_BIT is smaller than 32" 28 #endif 29 30 #define STRINGLIB_BLOOM_ADD(mask, ch) \ 31 ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) 32 #define STRINGLIB_BLOOM(mask, ch) \ 33 ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) 34 35 Py_LOCAL_INLINE(Py_ssize_t) 36 STRINGLIB(find_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) 37 { 38 const STRINGLIB_CHAR *p, *e; 39 40 p = s; 41 e = s + n; 42 if (n > 10) { 43 #if STRINGLIB_SIZEOF_CHAR == 1 44 p = memchr(s, ch, n); 45 if (p != NULL) 46 return (p - s); 47 return -1; 48 #else 49 /* use memchr if we can choose a needle without two many likely 50 false positives */ 51 unsigned char needle = ch & 0xff; 52 /* If looking for a multiple of 256, we'd have too 53 many false positives looking for the '\0' byte in UCS2 54 and UCS4 representations. */ 55 if (needle != 0) { 56 while (p < e) { 57 void *candidate = memchr(p, needle, 58 (e - p) * sizeof(STRINGLIB_CHAR)); 59 if (candidate == NULL) 60 return -1; 61 p = (const STRINGLIB_CHAR *) 62 _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); 63 if (*p == ch) 64 return (p - s); 65 /* False positive */ 66 p++; 67 } 68 return -1; 69 } 70 #endif 71 } 72 while (p < e) { 73 if (*p == ch) 74 return (p - s); 75 p++; 76 } 77 return -1; 78 } 79 80 Py_LOCAL_INLINE(Py_ssize_t) 81 STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) 82 { 83 const STRINGLIB_CHAR *p; 84 #ifdef HAVE_MEMRCHR 85 /* memrchr() is a GNU extension, available since glibc 2.1.91. 86 it doesn't seem as optimized as memchr(), but is still quite 87 faster than our hand-written loop below */ 88 89 if (n > 10) { 90 #if STRINGLIB_SIZEOF_CHAR == 1 91 p = memrchr(s, ch, n); 92 if (p != NULL) 93 return (p - s); 94 return -1; 95 #else 96 /* use memrchr if we can choose a needle without two many likely 97 false positives */ 98 unsigned char needle = ch & 0xff; 99 /* If looking for a multiple of 256, we'd have too 100 many false positives looking for the '\0' byte in UCS2 101 and UCS4 representations. */ 102 if (needle != 0) { 103 while (n > 0) { 104 void *candidate = memrchr(s, needle, 105 n * sizeof(STRINGLIB_CHAR)); 106 if (candidate == NULL) 107 return -1; 108 p = (const STRINGLIB_CHAR *) 109 _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); 110 n = p - s; 111 if (*p == ch) 112 return n; 113 /* False positive */ 114 } 115 return -1; 116 } 117 #endif 118 } 119 #endif /* HAVE_MEMRCHR */ 120 p = s + n; 121 while (p > s) { 122 p--; 123 if (*p == ch) 124 return (p - s); 125 } 126 return -1; 127 } 128 129 Py_LOCAL_INLINE(Py_ssize_t) 130 FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, 131 const STRINGLIB_CHAR* p, Py_ssize_t m, 132 Py_ssize_t maxcount, int mode) 133 { 134 unsigned long mask; 135 Py_ssize_t skip, count = 0; 136 Py_ssize_t i, j, mlast, w; 137 138 w = n - m; 139 140 if (w < 0 || (mode == FAST_COUNT && maxcount == 0)) 141 return -1; 142 143 /* look for special cases */ 144 if (m <= 1) { 145 if (m <= 0) 146 return -1; 147 /* use special case for 1-character strings */ 148 if (mode == FAST_SEARCH) 149 return STRINGLIB(find_char)(s, n, p[0]); 150 else if (mode == FAST_RSEARCH) 151 return STRINGLIB(rfind_char)(s, n, p[0]); 152 else { /* FAST_COUNT */ 153 for (i = 0; i < n; i++) 154 if (s[i] == p[0]) { 155 count++; 156 if (count == maxcount) 157 return maxcount; 158 } 159 return count; 160 } 161 return -1; 162 } 163 164 mlast = m - 1; 165 skip = mlast - 1; 166 mask = 0; 167 168 if (mode != FAST_RSEARCH) { 169 const STRINGLIB_CHAR *ss = s + m - 1; 170 const STRINGLIB_CHAR *pp = p + m - 1; 171 172 /* create compressed boyer-moore delta 1 table */ 173 174 /* process pattern[:-1] */ 175 for (i = 0; i < mlast; i++) { 176 STRINGLIB_BLOOM_ADD(mask, p[i]); 177 if (p[i] == p[mlast]) 178 skip = mlast - i - 1; 179 } 180 /* process pattern[-1] outside the loop */ 181 STRINGLIB_BLOOM_ADD(mask, p[mlast]); 182 183 for (i = 0; i <= w; i++) { 184 /* note: using mlast in the skip path slows things down on x86 */ 185 if (ss[i] == pp[0]) { 186 /* candidate match */ 187 for (j = 0; j < mlast; j++) 188 if (s[i+j] != p[j]) 189 break; 190 if (j == mlast) { 191 /* got a match! */ 192 if (mode != FAST_COUNT) 193 return i; 194 count++; 195 if (count == maxcount) 196 return maxcount; 197 i = i + mlast; 198 continue; 199 } 200 /* miss: check if next character is part of pattern */ 201 if (!STRINGLIB_BLOOM(mask, ss[i+1])) 202 i = i + m; 203 else 204 i = i + skip; 205 } else { 206 /* skip: check if next character is part of pattern */ 207 if (!STRINGLIB_BLOOM(mask, ss[i+1])) 208 i = i + m; 209 } 210 } 211 } else { /* FAST_RSEARCH */ 212 213 /* create compressed boyer-moore delta 1 table */ 214 215 /* process pattern[0] outside the loop */ 216 STRINGLIB_BLOOM_ADD(mask, p[0]); 217 /* process pattern[:0:-1] */ 218 for (i = mlast; i > 0; i--) { 219 STRINGLIB_BLOOM_ADD(mask, p[i]); 220 if (p[i] == p[0]) 221 skip = i - 1; 222 } 223 224 for (i = w; i >= 0; i--) { 225 if (s[i] == p[0]) { 226 /* candidate match */ 227 for (j = mlast; j > 0; j--) 228 if (s[i+j] != p[j]) 229 break; 230 if (j == 0) 231 /* got a match! */ 232 return i; 233 /* miss: check if previous character is part of pattern */ 234 if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) 235 i = i - m; 236 else 237 i = i - skip; 238 } else { 239 /* skip: check if previous character is part of pattern */ 240 if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) 241 i = i - m; 242 } 243 } 244 } 245 246 if (mode != FAST_COUNT) 247 return -1; 248 return count; 249 } 250 251