Home | History | Annotate | Download | only in stringlib
      1 /* stringlib: split implementation */
      2 
      3 #ifndef STRINGLIB_SPLIT_H
      4 #define STRINGLIB_SPLIT_H
      5 
      6 #ifndef STRINGLIB_FASTSEARCH_H
      7 #error must include "stringlib/fastsearch.h" before including this module
      8 #endif
      9 
     10 /* Overallocate the initial list to reduce the number of reallocs for small
     11    split sizes.  Eg, "A A A A A A A A A A".split() (10 elements) has three
     12    resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
     13    text (roughly 11 words per line) and field delimited data (usually 1-10
     14    fields).  For large strings the split algorithms are bandwidth limited
     15    so increasing the preallocation likely will not improve things.*/
     16 
     17 #define MAX_PREALLOC 12
     18 
     19 /* 5 splits gives 6 elements */
     20 #define PREALLOC_SIZE(maxsplit) \
     21     (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
     22 
     23 #define SPLIT_APPEND(data, left, right)         \
     24     sub = STRINGLIB_NEW((data) + (left),        \
     25                         (right) - (left));      \
     26     if (sub == NULL)                            \
     27         goto onError;                           \
     28     if (PyList_Append(list, sub)) {             \
     29         Py_DECREF(sub);                         \
     30         goto onError;                           \
     31     }                                           \
     32     else                                        \
     33         Py_DECREF(sub);
     34 
     35 #define SPLIT_ADD(data, left, right) {          \
     36     sub = STRINGLIB_NEW((data) + (left),        \
     37                         (right) - (left));      \
     38     if (sub == NULL)                            \
     39         goto onError;                           \
     40     if (count < MAX_PREALLOC) {                 \
     41         PyList_SET_ITEM(list, count, sub);      \
     42     } else {                                    \
     43         if (PyList_Append(list, sub)) {         \
     44             Py_DECREF(sub);                     \
     45             goto onError;                       \
     46         }                                       \
     47         else                                    \
     48             Py_DECREF(sub);                     \
     49     }                                           \
     50     count++; }
     51 
     52 
     53 /* Always force the list to the expected size. */
     54 #define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
     55 
     56 Py_LOCAL_INLINE(PyObject *)
     57 stringlib_split_whitespace(PyObject* str_obj,
     58                            const STRINGLIB_CHAR* str, Py_ssize_t str_len,
     59                            Py_ssize_t maxcount)
     60 {
     61     Py_ssize_t i, j, count=0;
     62     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
     63     PyObject *sub;
     64 
     65     if (list == NULL)
     66         return NULL;
     67 
     68     i = j = 0;
     69     while (maxcount-- > 0) {
     70         while (i < str_len && STRINGLIB_ISSPACE(str[i]))
     71             i++;
     72         if (i == str_len) break;
     73         j = i; i++;
     74         while (i < str_len && !STRINGLIB_ISSPACE(str[i]))
     75             i++;
     76 #ifndef STRINGLIB_MUTABLE
     77         if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
     78             /* No whitespace in str_obj, so just use it as list[0] */
     79             Py_INCREF(str_obj);
     80             PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
     81             count++;
     82             break;
     83         }
     84 #endif
     85         SPLIT_ADD(str, j, i);
     86     }
     87 
     88     if (i < str_len) {
     89         /* Only occurs when maxcount was reached */
     90         /* Skip any remaining whitespace and copy to end of string */
     91         while (i < str_len && STRINGLIB_ISSPACE(str[i]))
     92             i++;
     93         if (i != str_len)
     94             SPLIT_ADD(str, i, str_len);
     95     }
     96     FIX_PREALLOC_SIZE(list);
     97     return list;
     98 
     99   onError:
    100     Py_DECREF(list);
    101     return NULL;
    102 }
    103 
    104 Py_LOCAL_INLINE(PyObject *)
    105 stringlib_split_char(PyObject* str_obj,
    106                      const STRINGLIB_CHAR* str, Py_ssize_t str_len,
    107                      const STRINGLIB_CHAR ch,
    108                      Py_ssize_t maxcount)
    109 {
    110     Py_ssize_t i, j, count=0;
    111     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
    112     PyObject *sub;
    113 
    114     if (list == NULL)
    115         return NULL;
    116 
    117     i = j = 0;
    118     while ((j < str_len) && (maxcount-- > 0)) {
    119         for(; j < str_len; j++) {
    120             /* I found that using memchr makes no difference */
    121             if (str[j] == ch) {
    122                 SPLIT_ADD(str, i, j);
    123                 i = j = j + 1;
    124                 break;
    125             }
    126         }
    127     }
    128 #ifndef STRINGLIB_MUTABLE
    129     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
    130         /* ch not in str_obj, so just use str_obj as list[0] */
    131         Py_INCREF(str_obj);
    132         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
    133         count++;
    134     } else
    135 #endif
    136     if (i <= str_len) {
    137         SPLIT_ADD(str, i, str_len);
    138     }
    139     FIX_PREALLOC_SIZE(list);
    140     return list;
    141 
    142   onError:
    143     Py_DECREF(list);
    144     return NULL;
    145 }
    146 
    147 Py_LOCAL_INLINE(PyObject *)
    148 stringlib_split(PyObject* str_obj,
    149                 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
    150                 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
    151                 Py_ssize_t maxcount)
    152 {
    153     Py_ssize_t i, j, pos, count=0;
    154     PyObject *list, *sub;
    155 
    156     if (sep_len == 0) {
    157         PyErr_SetString(PyExc_ValueError, "empty separator");
    158         return NULL;
    159     }
    160     else if (sep_len == 1)
    161         return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount);
    162 
    163     list = PyList_New(PREALLOC_SIZE(maxcount));
    164     if (list == NULL)
    165         return NULL;
    166 
    167     i = j = 0;
    168     while (maxcount-- > 0) {
    169         pos = fastsearch(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH);
    170         if (pos < 0)
    171             break;
    172         j = i + pos;
    173         SPLIT_ADD(str, i, j);
    174         i = j + sep_len;
    175     }
    176 #ifndef STRINGLIB_MUTABLE
    177     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
    178         /* No match in str_obj, so just use it as list[0] */
    179         Py_INCREF(str_obj);
    180         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
    181         count++;
    182     } else
    183 #endif
    184     {
    185         SPLIT_ADD(str, i, str_len);
    186     }
    187     FIX_PREALLOC_SIZE(list);
    188     return list;
    189 
    190   onError:
    191     Py_DECREF(list);
    192     return NULL;
    193 }
    194 
    195 Py_LOCAL_INLINE(PyObject *)
    196 stringlib_rsplit_whitespace(PyObject* str_obj,
    197                             const STRINGLIB_CHAR* str, Py_ssize_t str_len,
    198                             Py_ssize_t maxcount)
    199 {
    200     Py_ssize_t i, j, count=0;
    201     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
    202     PyObject *sub;
    203 
    204     if (list == NULL)
    205         return NULL;
    206 
    207     i = j = str_len - 1;
    208     while (maxcount-- > 0) {
    209         while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
    210             i--;
    211         if (i < 0) break;
    212         j = i; i--;
    213         while (i >= 0 && !STRINGLIB_ISSPACE(str[i]))
    214             i--;
    215 #ifndef STRINGLIB_MUTABLE
    216         if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
    217             /* No whitespace in str_obj, so just use it as list[0] */
    218             Py_INCREF(str_obj);
    219             PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
    220             count++;
    221             break;
    222         }
    223 #endif
    224         SPLIT_ADD(str, i + 1, j + 1);
    225     }
    226 
    227     if (i >= 0) {
    228         /* Only occurs when maxcount was reached */
    229         /* Skip any remaining whitespace and copy to beginning of string */
    230         while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
    231             i--;
    232         if (i >= 0)
    233             SPLIT_ADD(str, 0, i + 1);
    234     }
    235     FIX_PREALLOC_SIZE(list);
    236     if (PyList_Reverse(list) < 0)
    237         goto onError;
    238     return list;
    239 
    240   onError:
    241     Py_DECREF(list);
    242     return NULL;
    243 }
    244 
    245 Py_LOCAL_INLINE(PyObject *)
    246 stringlib_rsplit_char(PyObject* str_obj,
    247                       const STRINGLIB_CHAR* str, Py_ssize_t str_len,
    248                       const STRINGLIB_CHAR ch,
    249                       Py_ssize_t maxcount)
    250 {
    251     Py_ssize_t i, j, count=0;
    252     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
    253     PyObject *sub;
    254 
    255     if (list == NULL)
    256         return NULL;
    257 
    258     i = j = str_len - 1;
    259     while ((i >= 0) && (maxcount-- > 0)) {
    260         for(; i >= 0; i--) {
    261             if (str[i] == ch) {
    262                 SPLIT_ADD(str, i + 1, j + 1);
    263                 j = i = i - 1;
    264                 break;
    265             }
    266         }
    267     }
    268 #ifndef STRINGLIB_MUTABLE
    269     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
    270         /* ch not in str_obj, so just use str_obj as list[0] */
    271         Py_INCREF(str_obj);
    272         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
    273         count++;
    274     } else
    275 #endif
    276     if (j >= -1) {
    277         SPLIT_ADD(str, 0, j + 1);
    278     }
    279     FIX_PREALLOC_SIZE(list);
    280     if (PyList_Reverse(list) < 0)
    281         goto onError;
    282     return list;
    283 
    284   onError:
    285     Py_DECREF(list);
    286     return NULL;
    287 }
    288 
    289 Py_LOCAL_INLINE(PyObject *)
    290 stringlib_rsplit(PyObject* str_obj,
    291                  const STRINGLIB_CHAR* str, Py_ssize_t str_len,
    292                  const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
    293                  Py_ssize_t maxcount)
    294 {
    295     Py_ssize_t j, pos, count=0;
    296     PyObject *list, *sub;
    297 
    298     if (sep_len == 0) {
    299         PyErr_SetString(PyExc_ValueError, "empty separator");
    300         return NULL;
    301     }
    302     else if (sep_len == 1)
    303         return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount);
    304 
    305     list = PyList_New(PREALLOC_SIZE(maxcount));
    306     if (list == NULL)
    307         return NULL;
    308 
    309     j = str_len;
    310     while (maxcount-- > 0) {
    311         pos = fastsearch(str, j, sep, sep_len, -1, FAST_RSEARCH);
    312         if (pos < 0)
    313             break;
    314         SPLIT_ADD(str, pos + sep_len, j);
    315         j = pos;
    316     }
    317 #ifndef STRINGLIB_MUTABLE
    318     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
    319         /* No match in str_obj, so just use it as list[0] */
    320         Py_INCREF(str_obj);
    321         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
    322         count++;
    323     } else
    324 #endif
    325     {
    326         SPLIT_ADD(str, 0, j);
    327     }
    328     FIX_PREALLOC_SIZE(list);
    329     if (PyList_Reverse(list) < 0)
    330         goto onError;
    331     return list;
    332 
    333   onError:
    334     Py_DECREF(list);
    335     return NULL;
    336 }
    337 
    338 Py_LOCAL_INLINE(PyObject *)
    339 stringlib_splitlines(PyObject* str_obj,
    340                      const STRINGLIB_CHAR* str, Py_ssize_t str_len,
    341                      int keepends)
    342 {
    343     /* This does not use the preallocated list because splitlines is
    344        usually run with hundreds of newlines.  The overhead of
    345        switching between PyList_SET_ITEM and append causes about a
    346        2-3% slowdown for that common case.  A smarter implementation
    347        could move the if check out, so the SET_ITEMs are done first
    348        and the appends only done when the prealloc buffer is full.
    349        That's too much work for little gain.*/
    350 
    351     register Py_ssize_t i;
    352     register Py_ssize_t j;
    353     PyObject *list = PyList_New(0);
    354     PyObject *sub;
    355 
    356     if (list == NULL)
    357         return NULL;
    358 
    359     for (i = j = 0; i < str_len; ) {
    360         Py_ssize_t eol;
    361 
    362         /* Find a line and append it */
    363         while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i]))
    364             i++;
    365 
    366         /* Skip the line break reading CRLF as one line break */
    367         eol = i;
    368         if (i < str_len) {
    369             if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n')
    370                 i += 2;
    371             else
    372                 i++;
    373             if (keepends)
    374                 eol = i;
    375         }
    376 #ifndef STRINGLIB_MUTABLE
    377         if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
    378             /* No linebreak in str_obj, so just use it as list[0] */
    379             if (PyList_Append(list, str_obj))
    380                 goto onError;
    381             break;
    382         }
    383 #endif
    384         SPLIT_APPEND(str, j, eol);
    385         j = i;
    386     }
    387     return list;
    388 
    389   onError:
    390     Py_DECREF(list);
    391     return NULL;
    392 }
    393 
    394 #endif
    395