Home | History | Annotate | Download | only in stringlib
      1 /* stringlib: split implementation */
      2 
      3 #ifndef STRINGLIB_FASTSEARCH_H
      4 #error must include "stringlib/fastsearch.h" before including this module
      5 #endif
      6 
      7 /* Overallocate the initial list to reduce the number of reallocs for small
      8    split sizes.  Eg, "A A A A A A A A A A".split() (10 elements) has three
      9    resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
     10    text (roughly 11 words per line) and field delimited data (usually 1-10
     11    fields).  For large strings the split algorithms are bandwidth limited
     12    so increasing the preallocation likely will not improve things.*/
     13 
     14 #define MAX_PREALLOC 12
     15 
     16 /* 5 splits gives 6 elements */
     17 #define PREALLOC_SIZE(maxsplit) \
     18     (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
     19 
     20 #define SPLIT_APPEND(data, left, right)         \
     21     sub = STRINGLIB_NEW((data) + (left),        \
     22                         (right) - (left));      \
     23     if (sub == NULL)                            \
     24         goto onError;                           \
     25     if (PyList_Append(list, sub)) {             \
     26         Py_DECREF(sub);                         \
     27         goto onError;                           \
     28     }                                           \
     29     else                                        \
     30         Py_DECREF(sub);
     31 
     32 #define SPLIT_ADD(data, left, right) {          \
     33     sub = STRINGLIB_NEW((data) + (left),        \
     34                         (right) - (left));      \
     35     if (sub == NULL)                            \
     36         goto onError;                           \
     37     if (count < MAX_PREALLOC) {                 \
     38         PyList_SET_ITEM(list, count, sub);      \
     39     } else {                                    \
     40         if (PyList_Append(list, sub)) {         \
     41             Py_DECREF(sub);                     \
     42             goto onError;                       \
     43         }                                       \
     44         else                                    \
     45             Py_DECREF(sub);                     \
     46     }                                           \
     47     count++; }
     48 
     49 
     50 /* Always force the list to the expected size. */
     51 #define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
     52 
     53 Py_LOCAL_INLINE(PyObject *)
     54 STRINGLIB(split_whitespace)(PyObject* str_obj,
     55                            const STRINGLIB_CHAR* str, Py_ssize_t str_len,
     56                            Py_ssize_t maxcount)
     57 {
     58     Py_ssize_t i, j, count=0;
     59     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
     60     PyObject *sub;
     61 
     62     if (list == NULL)
     63         return NULL;
     64 
     65     i = j = 0;
     66     while (maxcount-- > 0) {
     67         while (i < str_len && STRINGLIB_ISSPACE(str[i]))
     68             i++;
     69         if (i == str_len) break;
     70         j = i; i++;
     71         while (i < str_len && !STRINGLIB_ISSPACE(str[i]))
     72             i++;
     73 #ifndef STRINGLIB_MUTABLE
     74         if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
     75             /* No whitespace in str_obj, so just use it as list[0] */
     76             Py_INCREF(str_obj);
     77             PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
     78             count++;
     79             break;
     80         }
     81 #endif
     82         SPLIT_ADD(str, j, i);
     83     }
     84 
     85     if (i < str_len) {
     86         /* Only occurs when maxcount was reached */
     87         /* Skip any remaining whitespace and copy to end of string */
     88         while (i < str_len && STRINGLIB_ISSPACE(str[i]))
     89             i++;
     90         if (i != str_len)
     91             SPLIT_ADD(str, i, str_len);
     92     }
     93     FIX_PREALLOC_SIZE(list);
     94     return list;
     95 
     96   onError:
     97     Py_DECREF(list);
     98     return NULL;
     99 }
    100 
    101 Py_LOCAL_INLINE(PyObject *)
    102 STRINGLIB(split_char)(PyObject* str_obj,
    103                      const STRINGLIB_CHAR* str, Py_ssize_t str_len,
    104                      const STRINGLIB_CHAR ch,
    105                      Py_ssize_t maxcount)
    106 {
    107     Py_ssize_t i, j, count=0;
    108     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
    109     PyObject *sub;
    110 
    111     if (list == NULL)
    112         return NULL;
    113 
    114     i = j = 0;
    115     while ((j < str_len) && (maxcount-- > 0)) {
    116         for(; j < str_len; j++) {
    117             /* I found that using memchr makes no difference */
    118             if (str[j] == ch) {
    119                 SPLIT_ADD(str, i, j);
    120                 i = j = j + 1;
    121                 break;
    122             }
    123         }
    124     }
    125 #ifndef STRINGLIB_MUTABLE
    126     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
    127         /* ch not in str_obj, so just use str_obj as list[0] */
    128         Py_INCREF(str_obj);
    129         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
    130         count++;
    131     } else
    132 #endif
    133     if (i <= str_len) {
    134         SPLIT_ADD(str, i, str_len);
    135     }
    136     FIX_PREALLOC_SIZE(list);
    137     return list;
    138 
    139   onError:
    140     Py_DECREF(list);
    141     return NULL;
    142 }
    143 
    144 Py_LOCAL_INLINE(PyObject *)
    145 STRINGLIB(split)(PyObject* str_obj,
    146                 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
    147                 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
    148                 Py_ssize_t maxcount)
    149 {
    150     Py_ssize_t i, j, pos, count=0;
    151     PyObject *list, *sub;
    152 
    153     if (sep_len == 0) {
    154         PyErr_SetString(PyExc_ValueError, "empty separator");
    155         return NULL;
    156     }
    157     else if (sep_len == 1)
    158         return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount);
    159 
    160     list = PyList_New(PREALLOC_SIZE(maxcount));
    161     if (list == NULL)
    162         return NULL;
    163 
    164     i = j = 0;
    165     while (maxcount-- > 0) {
    166         pos = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH);
    167         if (pos < 0)
    168             break;
    169         j = i + pos;
    170         SPLIT_ADD(str, i, j);
    171         i = j + sep_len;
    172     }
    173 #ifndef STRINGLIB_MUTABLE
    174     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
    175         /* No match in str_obj, so just use it as list[0] */
    176         Py_INCREF(str_obj);
    177         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
    178         count++;
    179     } else
    180 #endif
    181     {
    182         SPLIT_ADD(str, i, str_len);
    183     }
    184     FIX_PREALLOC_SIZE(list);
    185     return list;
    186 
    187   onError:
    188     Py_DECREF(list);
    189     return NULL;
    190 }
    191 
    192 Py_LOCAL_INLINE(PyObject *)
    193 STRINGLIB(rsplit_whitespace)(PyObject* str_obj,
    194                             const STRINGLIB_CHAR* str, Py_ssize_t str_len,
    195                             Py_ssize_t maxcount)
    196 {
    197     Py_ssize_t i, j, count=0;
    198     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
    199     PyObject *sub;
    200 
    201     if (list == NULL)
    202         return NULL;
    203 
    204     i = j = str_len - 1;
    205     while (maxcount-- > 0) {
    206         while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
    207             i--;
    208         if (i < 0) break;
    209         j = i; i--;
    210         while (i >= 0 && !STRINGLIB_ISSPACE(str[i]))
    211             i--;
    212 #ifndef STRINGLIB_MUTABLE
    213         if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
    214             /* No whitespace in str_obj, so just use it as list[0] */
    215             Py_INCREF(str_obj);
    216             PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
    217             count++;
    218             break;
    219         }
    220 #endif
    221         SPLIT_ADD(str, i + 1, j + 1);
    222     }
    223 
    224     if (i >= 0) {
    225         /* Only occurs when maxcount was reached */
    226         /* Skip any remaining whitespace and copy to beginning of string */
    227         while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
    228             i--;
    229         if (i >= 0)
    230             SPLIT_ADD(str, 0, i + 1);
    231     }
    232     FIX_PREALLOC_SIZE(list);
    233     if (PyList_Reverse(list) < 0)
    234         goto onError;
    235     return list;
    236 
    237   onError:
    238     Py_DECREF(list);
    239     return NULL;
    240 }
    241 
    242 Py_LOCAL_INLINE(PyObject *)
    243 STRINGLIB(rsplit_char)(PyObject* str_obj,
    244                       const STRINGLIB_CHAR* str, Py_ssize_t str_len,
    245                       const STRINGLIB_CHAR ch,
    246                       Py_ssize_t maxcount)
    247 {
    248     Py_ssize_t i, j, count=0;
    249     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
    250     PyObject *sub;
    251 
    252     if (list == NULL)
    253         return NULL;
    254 
    255     i = j = str_len - 1;
    256     while ((i >= 0) && (maxcount-- > 0)) {
    257         for(; i >= 0; i--) {
    258             if (str[i] == ch) {
    259                 SPLIT_ADD(str, i + 1, j + 1);
    260                 j = i = i - 1;
    261                 break;
    262             }
    263         }
    264     }
    265 #ifndef STRINGLIB_MUTABLE
    266     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
    267         /* ch not in str_obj, so just use str_obj as list[0] */
    268         Py_INCREF(str_obj);
    269         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
    270         count++;
    271     } else
    272 #endif
    273     if (j >= -1) {
    274         SPLIT_ADD(str, 0, j + 1);
    275     }
    276     FIX_PREALLOC_SIZE(list);
    277     if (PyList_Reverse(list) < 0)
    278         goto onError;
    279     return list;
    280 
    281   onError:
    282     Py_DECREF(list);
    283     return NULL;
    284 }
    285 
    286 Py_LOCAL_INLINE(PyObject *)
    287 STRINGLIB(rsplit)(PyObject* str_obj,
    288                  const STRINGLIB_CHAR* str, Py_ssize_t str_len,
    289                  const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
    290                  Py_ssize_t maxcount)
    291 {
    292     Py_ssize_t j, pos, count=0;
    293     PyObject *list, *sub;
    294 
    295     if (sep_len == 0) {
    296         PyErr_SetString(PyExc_ValueError, "empty separator");
    297         return NULL;
    298     }
    299     else if (sep_len == 1)
    300         return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount);
    301 
    302     list = PyList_New(PREALLOC_SIZE(maxcount));
    303     if (list == NULL)
    304         return NULL;
    305 
    306     j = str_len;
    307     while (maxcount-- > 0) {
    308         pos = FASTSEARCH(str, j, sep, sep_len, -1, FAST_RSEARCH);
    309         if (pos < 0)
    310             break;
    311         SPLIT_ADD(str, pos + sep_len, j);
    312         j = pos;
    313     }
    314 #ifndef STRINGLIB_MUTABLE
    315     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
    316         /* No match in str_obj, so just use it as list[0] */
    317         Py_INCREF(str_obj);
    318         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
    319         count++;
    320     } else
    321 #endif
    322     {
    323         SPLIT_ADD(str, 0, j);
    324     }
    325     FIX_PREALLOC_SIZE(list);
    326     if (PyList_Reverse(list) < 0)
    327         goto onError;
    328     return list;
    329 
    330   onError:
    331     Py_DECREF(list);
    332     return NULL;
    333 }
    334 
    335 Py_LOCAL_INLINE(PyObject *)
    336 STRINGLIB(splitlines)(PyObject* str_obj,
    337                      const STRINGLIB_CHAR* str, Py_ssize_t str_len,
    338                      int keepends)
    339 {
    340     /* This does not use the preallocated list because splitlines is
    341        usually run with hundreds of newlines.  The overhead of
    342        switching between PyList_SET_ITEM and append causes about a
    343        2-3% slowdown for that common case.  A smarter implementation
    344        could move the if check out, so the SET_ITEMs are done first
    345        and the appends only done when the prealloc buffer is full.
    346        That's too much work for little gain.*/
    347 
    348     Py_ssize_t i;
    349     Py_ssize_t j;
    350     PyObject *list = PyList_New(0);
    351     PyObject *sub;
    352 
    353     if (list == NULL)
    354         return NULL;
    355 
    356     for (i = j = 0; i < str_len; ) {
    357         Py_ssize_t eol;
    358 
    359         /* Find a line and append it */
    360         while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i]))
    361             i++;
    362 
    363         /* Skip the line break reading CRLF as one line break */
    364         eol = i;
    365         if (i < str_len) {
    366             if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n')
    367                 i += 2;
    368             else
    369                 i++;
    370             if (keepends)
    371                 eol = i;
    372         }
    373 #ifndef STRINGLIB_MUTABLE
    374         if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
    375             /* No linebreak in str_obj, so just use it as list[0] */
    376             if (PyList_Append(list, str_obj))
    377                 goto onError;
    378             break;
    379         }
    380 #endif
    381         SPLIT_APPEND(str, j, eol);
    382         j = i;
    383     }
    384     return list;
    385 
    386   onError:
    387     Py_DECREF(list);
    388     return NULL;
    389 }
    390 
    391