1 /* stringlib: split implementation */ 2 3 #ifndef STRINGLIB_SPLIT_H 4 #define STRINGLIB_SPLIT_H 5 6 #ifndef STRINGLIB_FASTSEARCH_H 7 #error must include "stringlib/fastsearch.h" before including this module 8 #endif 9 10 /* Overallocate the initial list to reduce the number of reallocs for small 11 split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three 12 resizes, to sizes 4, 8, then 16. Most observed string splits are for human 13 text (roughly 11 words per line) and field delimited data (usually 1-10 14 fields). For large strings the split algorithms are bandwidth limited 15 so increasing the preallocation likely will not improve things.*/ 16 17 #define MAX_PREALLOC 12 18 19 /* 5 splits gives 6 elements */ 20 #define PREALLOC_SIZE(maxsplit) \ 21 (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1) 22 23 #define SPLIT_APPEND(data, left, right) \ 24 sub = STRINGLIB_NEW((data) + (left), \ 25 (right) - (left)); \ 26 if (sub == NULL) \ 27 goto onError; \ 28 if (PyList_Append(list, sub)) { \ 29 Py_DECREF(sub); \ 30 goto onError; \ 31 } \ 32 else \ 33 Py_DECREF(sub); 34 35 #define SPLIT_ADD(data, left, right) { \ 36 sub = STRINGLIB_NEW((data) + (left), \ 37 (right) - (left)); \ 38 if (sub == NULL) \ 39 goto onError; \ 40 if (count < MAX_PREALLOC) { \ 41 PyList_SET_ITEM(list, count, sub); \ 42 } else { \ 43 if (PyList_Append(list, sub)) { \ 44 Py_DECREF(sub); \ 45 goto onError; \ 46 } \ 47 else \ 48 Py_DECREF(sub); \ 49 } \ 50 count++; } 51 52 53 /* Always force the list to the expected size. */ 54 #define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count 55 56 Py_LOCAL_INLINE(PyObject *) 57 stringlib_split_whitespace(PyObject* str_obj, 58 const STRINGLIB_CHAR* str, Py_ssize_t str_len, 59 Py_ssize_t maxcount) 60 { 61 Py_ssize_t i, j, count=0; 62 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); 63 PyObject *sub; 64 65 if (list == NULL) 66 return NULL; 67 68 i = j = 0; 69 while (maxcount-- > 0) { 70 while (i < str_len && STRINGLIB_ISSPACE(str[i])) 71 i++; 72 if (i == str_len) break; 73 j = i; i++; 74 while (i < str_len && !STRINGLIB_ISSPACE(str[i])) 75 i++; 76 #ifndef STRINGLIB_MUTABLE 77 if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { 78 /* No whitespace in str_obj, so just use it as list[0] */ 79 Py_INCREF(str_obj); 80 PyList_SET_ITEM(list, 0, (PyObject *)str_obj); 81 count++; 82 break; 83 } 84 #endif 85 SPLIT_ADD(str, j, i); 86 } 87 88 if (i < str_len) { 89 /* Only occurs when maxcount was reached */ 90 /* Skip any remaining whitespace and copy to end of string */ 91 while (i < str_len && STRINGLIB_ISSPACE(str[i])) 92 i++; 93 if (i != str_len) 94 SPLIT_ADD(str, i, str_len); 95 } 96 FIX_PREALLOC_SIZE(list); 97 return list; 98 99 onError: 100 Py_DECREF(list); 101 return NULL; 102 } 103 104 Py_LOCAL_INLINE(PyObject *) 105 stringlib_split_char(PyObject* str_obj, 106 const STRINGLIB_CHAR* str, Py_ssize_t str_len, 107 const STRINGLIB_CHAR ch, 108 Py_ssize_t maxcount) 109 { 110 Py_ssize_t i, j, count=0; 111 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); 112 PyObject *sub; 113 114 if (list == NULL) 115 return NULL; 116 117 i = j = 0; 118 while ((j < str_len) && (maxcount-- > 0)) { 119 for(; j < str_len; j++) { 120 /* I found that using memchr makes no difference */ 121 if (str[j] == ch) { 122 SPLIT_ADD(str, i, j); 123 i = j = j + 1; 124 break; 125 } 126 } 127 } 128 #ifndef STRINGLIB_MUTABLE 129 if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { 130 /* ch not in str_obj, so just use str_obj as list[0] */ 131 Py_INCREF(str_obj); 132 PyList_SET_ITEM(list, 0, (PyObject *)str_obj); 133 count++; 134 } else 135 #endif 136 if (i <= str_len) { 137 SPLIT_ADD(str, i, str_len); 138 } 139 FIX_PREALLOC_SIZE(list); 140 return list; 141 142 onError: 143 Py_DECREF(list); 144 return NULL; 145 } 146 147 Py_LOCAL_INLINE(PyObject *) 148 stringlib_split(PyObject* str_obj, 149 const STRINGLIB_CHAR* str, Py_ssize_t str_len, 150 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, 151 Py_ssize_t maxcount) 152 { 153 Py_ssize_t i, j, pos, count=0; 154 PyObject *list, *sub; 155 156 if (sep_len == 0) { 157 PyErr_SetString(PyExc_ValueError, "empty separator"); 158 return NULL; 159 } 160 else if (sep_len == 1) 161 return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount); 162 163 list = PyList_New(PREALLOC_SIZE(maxcount)); 164 if (list == NULL) 165 return NULL; 166 167 i = j = 0; 168 while (maxcount-- > 0) { 169 pos = fastsearch(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); 170 if (pos < 0) 171 break; 172 j = i + pos; 173 SPLIT_ADD(str, i, j); 174 i = j + sep_len; 175 } 176 #ifndef STRINGLIB_MUTABLE 177 if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { 178 /* No match in str_obj, so just use it as list[0] */ 179 Py_INCREF(str_obj); 180 PyList_SET_ITEM(list, 0, (PyObject *)str_obj); 181 count++; 182 } else 183 #endif 184 { 185 SPLIT_ADD(str, i, str_len); 186 } 187 FIX_PREALLOC_SIZE(list); 188 return list; 189 190 onError: 191 Py_DECREF(list); 192 return NULL; 193 } 194 195 Py_LOCAL_INLINE(PyObject *) 196 stringlib_rsplit_whitespace(PyObject* str_obj, 197 const STRINGLIB_CHAR* str, Py_ssize_t str_len, 198 Py_ssize_t maxcount) 199 { 200 Py_ssize_t i, j, count=0; 201 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); 202 PyObject *sub; 203 204 if (list == NULL) 205 return NULL; 206 207 i = j = str_len - 1; 208 while (maxcount-- > 0) { 209 while (i >= 0 && STRINGLIB_ISSPACE(str[i])) 210 i--; 211 if (i < 0) break; 212 j = i; i--; 213 while (i >= 0 && !STRINGLIB_ISSPACE(str[i])) 214 i--; 215 #ifndef STRINGLIB_MUTABLE 216 if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) { 217 /* No whitespace in str_obj, so just use it as list[0] */ 218 Py_INCREF(str_obj); 219 PyList_SET_ITEM(list, 0, (PyObject *)str_obj); 220 count++; 221 break; 222 } 223 #endif 224 SPLIT_ADD(str, i + 1, j + 1); 225 } 226 227 if (i >= 0) { 228 /* Only occurs when maxcount was reached */ 229 /* Skip any remaining whitespace and copy to beginning of string */ 230 while (i >= 0 && STRINGLIB_ISSPACE(str[i])) 231 i--; 232 if (i >= 0) 233 SPLIT_ADD(str, 0, i + 1); 234 } 235 FIX_PREALLOC_SIZE(list); 236 if (PyList_Reverse(list) < 0) 237 goto onError; 238 return list; 239 240 onError: 241 Py_DECREF(list); 242 return NULL; 243 } 244 245 Py_LOCAL_INLINE(PyObject *) 246 stringlib_rsplit_char(PyObject* str_obj, 247 const STRINGLIB_CHAR* str, Py_ssize_t str_len, 248 const STRINGLIB_CHAR ch, 249 Py_ssize_t maxcount) 250 { 251 Py_ssize_t i, j, count=0; 252 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); 253 PyObject *sub; 254 255 if (list == NULL) 256 return NULL; 257 258 i = j = str_len - 1; 259 while ((i >= 0) && (maxcount-- > 0)) { 260 for(; i >= 0; i--) { 261 if (str[i] == ch) { 262 SPLIT_ADD(str, i + 1, j + 1); 263 j = i = i - 1; 264 break; 265 } 266 } 267 } 268 #ifndef STRINGLIB_MUTABLE 269 if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { 270 /* ch not in str_obj, so just use str_obj as list[0] */ 271 Py_INCREF(str_obj); 272 PyList_SET_ITEM(list, 0, (PyObject *)str_obj); 273 count++; 274 } else 275 #endif 276 if (j >= -1) { 277 SPLIT_ADD(str, 0, j + 1); 278 } 279 FIX_PREALLOC_SIZE(list); 280 if (PyList_Reverse(list) < 0) 281 goto onError; 282 return list; 283 284 onError: 285 Py_DECREF(list); 286 return NULL; 287 } 288 289 Py_LOCAL_INLINE(PyObject *) 290 stringlib_rsplit(PyObject* str_obj, 291 const STRINGLIB_CHAR* str, Py_ssize_t str_len, 292 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, 293 Py_ssize_t maxcount) 294 { 295 Py_ssize_t j, pos, count=0; 296 PyObject *list, *sub; 297 298 if (sep_len == 0) { 299 PyErr_SetString(PyExc_ValueError, "empty separator"); 300 return NULL; 301 } 302 else if (sep_len == 1) 303 return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount); 304 305 list = PyList_New(PREALLOC_SIZE(maxcount)); 306 if (list == NULL) 307 return NULL; 308 309 j = str_len; 310 while (maxcount-- > 0) { 311 pos = fastsearch(str, j, sep, sep_len, -1, FAST_RSEARCH); 312 if (pos < 0) 313 break; 314 SPLIT_ADD(str, pos + sep_len, j); 315 j = pos; 316 } 317 #ifndef STRINGLIB_MUTABLE 318 if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { 319 /* No match in str_obj, so just use it as list[0] */ 320 Py_INCREF(str_obj); 321 PyList_SET_ITEM(list, 0, (PyObject *)str_obj); 322 count++; 323 } else 324 #endif 325 { 326 SPLIT_ADD(str, 0, j); 327 } 328 FIX_PREALLOC_SIZE(list); 329 if (PyList_Reverse(list) < 0) 330 goto onError; 331 return list; 332 333 onError: 334 Py_DECREF(list); 335 return NULL; 336 } 337 338 Py_LOCAL_INLINE(PyObject *) 339 stringlib_splitlines(PyObject* str_obj, 340 const STRINGLIB_CHAR* str, Py_ssize_t str_len, 341 int keepends) 342 { 343 /* This does not use the preallocated list because splitlines is 344 usually run with hundreds of newlines. The overhead of 345 switching between PyList_SET_ITEM and append causes about a 346 2-3% slowdown for that common case. A smarter implementation 347 could move the if check out, so the SET_ITEMs are done first 348 and the appends only done when the prealloc buffer is full. 349 That's too much work for little gain.*/ 350 351 register Py_ssize_t i; 352 register Py_ssize_t j; 353 PyObject *list = PyList_New(0); 354 PyObject *sub; 355 356 if (list == NULL) 357 return NULL; 358 359 for (i = j = 0; i < str_len; ) { 360 Py_ssize_t eol; 361 362 /* Find a line and append it */ 363 while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i])) 364 i++; 365 366 /* Skip the line break reading CRLF as one line break */ 367 eol = i; 368 if (i < str_len) { 369 if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n') 370 i += 2; 371 else 372 i++; 373 if (keepends) 374 eol = i; 375 } 376 #ifndef STRINGLIB_MUTABLE 377 if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { 378 /* No linebreak in str_obj, so just use it as list[0] */ 379 if (PyList_Append(list, str_obj)) 380 goto onError; 381 break; 382 } 383 #endif 384 SPLIT_APPEND(str, j, eol); 385 j = i; 386 } 387 return list; 388 389 onError: 390 Py_DECREF(list); 391 return NULL; 392 } 393 394 #endif 395