1 /* stringlib: split implementation */ 2 3 #ifndef STRINGLIB_FASTSEARCH_H 4 #error must include "stringlib/fastsearch.h" before including this module 5 #endif 6 7 /* Overallocate the initial list to reduce the number of reallocs for small 8 split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three 9 resizes, to sizes 4, 8, then 16. Most observed string splits are for human 10 text (roughly 11 words per line) and field delimited data (usually 1-10 11 fields). For large strings the split algorithms are bandwidth limited 12 so increasing the preallocation likely will not improve things.*/ 13 14 #define MAX_PREALLOC 12 15 16 /* 5 splits gives 6 elements */ 17 #define PREALLOC_SIZE(maxsplit) \ 18 (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1) 19 20 #define SPLIT_APPEND(data, left, right) \ 21 sub = STRINGLIB_NEW((data) + (left), \ 22 (right) - (left)); \ 23 if (sub == NULL) \ 24 goto onError; \ 25 if (PyList_Append(list, sub)) { \ 26 Py_DECREF(sub); \ 27 goto onError; \ 28 } \ 29 else \ 30 Py_DECREF(sub); 31 32 #define SPLIT_ADD(data, left, right) { \ 33 sub = STRINGLIB_NEW((data) + (left), \ 34 (right) - (left)); \ 35 if (sub == NULL) \ 36 goto onError; \ 37 if (count < MAX_PREALLOC) { \ 38 PyList_SET_ITEM(list, count, sub); \ 39 } else { \ 40 if (PyList_Append(list, sub)) { \ 41 Py_DECREF(sub); \ 42 goto onError; \ 43 } \ 44 else \ 45 Py_DECREF(sub); \ 46 } \ 47 count++; } 48 49 50 /* Always force the list to the expected size. */ 51 #define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count 52 53 Py_LOCAL_INLINE(PyObject *) 54 STRINGLIB(split_whitespace)(PyObject* str_obj, 55 const STRINGLIB_CHAR* str, Py_ssize_t str_len, 56 Py_ssize_t maxcount) 57 { 58 Py_ssize_t i, j, count=0; 59 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); 60 PyObject *sub; 61 62 if (list == NULL) 63 return NULL; 64 65 i = j = 0; 66 while (maxcount-- > 0) { 67 while (i < str_len && STRINGLIB_ISSPACE(str[i])) 68 i++; 69 if (i == str_len) break; 70 j = i; i++; 71 while (i < str_len && !STRINGLIB_ISSPACE(str[i])) 72 i++; 73 #ifndef STRINGLIB_MUTABLE 74 if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { 75 /* No whitespace in str_obj, so just use it as list[0] */ 76 Py_INCREF(str_obj); 77 PyList_SET_ITEM(list, 0, (PyObject *)str_obj); 78 count++; 79 break; 80 } 81 #endif 82 SPLIT_ADD(str, j, i); 83 } 84 85 if (i < str_len) { 86 /* Only occurs when maxcount was reached */ 87 /* Skip any remaining whitespace and copy to end of string */ 88 while (i < str_len && STRINGLIB_ISSPACE(str[i])) 89 i++; 90 if (i != str_len) 91 SPLIT_ADD(str, i, str_len); 92 } 93 FIX_PREALLOC_SIZE(list); 94 return list; 95 96 onError: 97 Py_DECREF(list); 98 return NULL; 99 } 100 101 Py_LOCAL_INLINE(PyObject *) 102 STRINGLIB(split_char)(PyObject* str_obj, 103 const STRINGLIB_CHAR* str, Py_ssize_t str_len, 104 const STRINGLIB_CHAR ch, 105 Py_ssize_t maxcount) 106 { 107 Py_ssize_t i, j, count=0; 108 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); 109 PyObject *sub; 110 111 if (list == NULL) 112 return NULL; 113 114 i = j = 0; 115 while ((j < str_len) && (maxcount-- > 0)) { 116 for(; j < str_len; j++) { 117 /* I found that using memchr makes no difference */ 118 if (str[j] == ch) { 119 SPLIT_ADD(str, i, j); 120 i = j = j + 1; 121 break; 122 } 123 } 124 } 125 #ifndef STRINGLIB_MUTABLE 126 if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { 127 /* ch not in str_obj, so just use str_obj as list[0] */ 128 Py_INCREF(str_obj); 129 PyList_SET_ITEM(list, 0, (PyObject *)str_obj); 130 count++; 131 } else 132 #endif 133 if (i <= str_len) { 134 SPLIT_ADD(str, i, str_len); 135 } 136 FIX_PREALLOC_SIZE(list); 137 return list; 138 139 onError: 140 Py_DECREF(list); 141 return NULL; 142 } 143 144 Py_LOCAL_INLINE(PyObject *) 145 STRINGLIB(split)(PyObject* str_obj, 146 const STRINGLIB_CHAR* str, Py_ssize_t str_len, 147 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, 148 Py_ssize_t maxcount) 149 { 150 Py_ssize_t i, j, pos, count=0; 151 PyObject *list, *sub; 152 153 if (sep_len == 0) { 154 PyErr_SetString(PyExc_ValueError, "empty separator"); 155 return NULL; 156 } 157 else if (sep_len == 1) 158 return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount); 159 160 list = PyList_New(PREALLOC_SIZE(maxcount)); 161 if (list == NULL) 162 return NULL; 163 164 i = j = 0; 165 while (maxcount-- > 0) { 166 pos = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); 167 if (pos < 0) 168 break; 169 j = i + pos; 170 SPLIT_ADD(str, i, j); 171 i = j + sep_len; 172 } 173 #ifndef STRINGLIB_MUTABLE 174 if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { 175 /* No match in str_obj, so just use it as list[0] */ 176 Py_INCREF(str_obj); 177 PyList_SET_ITEM(list, 0, (PyObject *)str_obj); 178 count++; 179 } else 180 #endif 181 { 182 SPLIT_ADD(str, i, str_len); 183 } 184 FIX_PREALLOC_SIZE(list); 185 return list; 186 187 onError: 188 Py_DECREF(list); 189 return NULL; 190 } 191 192 Py_LOCAL_INLINE(PyObject *) 193 STRINGLIB(rsplit_whitespace)(PyObject* str_obj, 194 const STRINGLIB_CHAR* str, Py_ssize_t str_len, 195 Py_ssize_t maxcount) 196 { 197 Py_ssize_t i, j, count=0; 198 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); 199 PyObject *sub; 200 201 if (list == NULL) 202 return NULL; 203 204 i = j = str_len - 1; 205 while (maxcount-- > 0) { 206 while (i >= 0 && STRINGLIB_ISSPACE(str[i])) 207 i--; 208 if (i < 0) break; 209 j = i; i--; 210 while (i >= 0 && !STRINGLIB_ISSPACE(str[i])) 211 i--; 212 #ifndef STRINGLIB_MUTABLE 213 if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) { 214 /* No whitespace in str_obj, so just use it as list[0] */ 215 Py_INCREF(str_obj); 216 PyList_SET_ITEM(list, 0, (PyObject *)str_obj); 217 count++; 218 break; 219 } 220 #endif 221 SPLIT_ADD(str, i + 1, j + 1); 222 } 223 224 if (i >= 0) { 225 /* Only occurs when maxcount was reached */ 226 /* Skip any remaining whitespace and copy to beginning of string */ 227 while (i >= 0 && STRINGLIB_ISSPACE(str[i])) 228 i--; 229 if (i >= 0) 230 SPLIT_ADD(str, 0, i + 1); 231 } 232 FIX_PREALLOC_SIZE(list); 233 if (PyList_Reverse(list) < 0) 234 goto onError; 235 return list; 236 237 onError: 238 Py_DECREF(list); 239 return NULL; 240 } 241 242 Py_LOCAL_INLINE(PyObject *) 243 STRINGLIB(rsplit_char)(PyObject* str_obj, 244 const STRINGLIB_CHAR* str, Py_ssize_t str_len, 245 const STRINGLIB_CHAR ch, 246 Py_ssize_t maxcount) 247 { 248 Py_ssize_t i, j, count=0; 249 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); 250 PyObject *sub; 251 252 if (list == NULL) 253 return NULL; 254 255 i = j = str_len - 1; 256 while ((i >= 0) && (maxcount-- > 0)) { 257 for(; i >= 0; i--) { 258 if (str[i] == ch) { 259 SPLIT_ADD(str, i + 1, j + 1); 260 j = i = i - 1; 261 break; 262 } 263 } 264 } 265 #ifndef STRINGLIB_MUTABLE 266 if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { 267 /* ch not in str_obj, so just use str_obj as list[0] */ 268 Py_INCREF(str_obj); 269 PyList_SET_ITEM(list, 0, (PyObject *)str_obj); 270 count++; 271 } else 272 #endif 273 if (j >= -1) { 274 SPLIT_ADD(str, 0, j + 1); 275 } 276 FIX_PREALLOC_SIZE(list); 277 if (PyList_Reverse(list) < 0) 278 goto onError; 279 return list; 280 281 onError: 282 Py_DECREF(list); 283 return NULL; 284 } 285 286 Py_LOCAL_INLINE(PyObject *) 287 STRINGLIB(rsplit)(PyObject* str_obj, 288 const STRINGLIB_CHAR* str, Py_ssize_t str_len, 289 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, 290 Py_ssize_t maxcount) 291 { 292 Py_ssize_t j, pos, count=0; 293 PyObject *list, *sub; 294 295 if (sep_len == 0) { 296 PyErr_SetString(PyExc_ValueError, "empty separator"); 297 return NULL; 298 } 299 else if (sep_len == 1) 300 return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount); 301 302 list = PyList_New(PREALLOC_SIZE(maxcount)); 303 if (list == NULL) 304 return NULL; 305 306 j = str_len; 307 while (maxcount-- > 0) { 308 pos = FASTSEARCH(str, j, sep, sep_len, -1, FAST_RSEARCH); 309 if (pos < 0) 310 break; 311 SPLIT_ADD(str, pos + sep_len, j); 312 j = pos; 313 } 314 #ifndef STRINGLIB_MUTABLE 315 if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { 316 /* No match in str_obj, so just use it as list[0] */ 317 Py_INCREF(str_obj); 318 PyList_SET_ITEM(list, 0, (PyObject *)str_obj); 319 count++; 320 } else 321 #endif 322 { 323 SPLIT_ADD(str, 0, j); 324 } 325 FIX_PREALLOC_SIZE(list); 326 if (PyList_Reverse(list) < 0) 327 goto onError; 328 return list; 329 330 onError: 331 Py_DECREF(list); 332 return NULL; 333 } 334 335 Py_LOCAL_INLINE(PyObject *) 336 STRINGLIB(splitlines)(PyObject* str_obj, 337 const STRINGLIB_CHAR* str, Py_ssize_t str_len, 338 int keepends) 339 { 340 /* This does not use the preallocated list because splitlines is 341 usually run with hundreds of newlines. The overhead of 342 switching between PyList_SET_ITEM and append causes about a 343 2-3% slowdown for that common case. A smarter implementation 344 could move the if check out, so the SET_ITEMs are done first 345 and the appends only done when the prealloc buffer is full. 346 That's too much work for little gain.*/ 347 348 Py_ssize_t i; 349 Py_ssize_t j; 350 PyObject *list = PyList_New(0); 351 PyObject *sub; 352 353 if (list == NULL) 354 return NULL; 355 356 for (i = j = 0; i < str_len; ) { 357 Py_ssize_t eol; 358 359 /* Find a line and append it */ 360 while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i])) 361 i++; 362 363 /* Skip the line break reading CRLF as one line break */ 364 eol = i; 365 if (i < str_len) { 366 if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n') 367 i += 2; 368 else 369 i++; 370 if (keepends) 371 eol = i; 372 } 373 #ifndef STRINGLIB_MUTABLE 374 if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { 375 /* No linebreak in str_obj, so just use it as list[0] */ 376 if (PyList_Append(list, str_obj)) 377 goto onError; 378 break; 379 } 380 #endif 381 SPLIT_APPEND(str, j, eol); 382 j = i; 383 } 384 return list; 385 386 onError: 387 Py_DECREF(list); 388 return NULL; 389 } 390 391