1 /* 2 * Secret Labs' Regular Expression Engine 3 * 4 * regular expression matching engine 5 * 6 * partial history: 7 * 1999-10-24 fl created (based on existing template matcher code) 8 * 2000-03-06 fl first alpha, sort of 9 * 2000-08-01 fl fixes for 1.6b1 10 * 2000-08-07 fl use PyOS_CheckStack() if available 11 * 2000-09-20 fl added expand method 12 * 2001-03-20 fl lots of fixes for 2.1b2 13 * 2001-04-15 fl export copyright as Python attribute, not global 14 * 2001-04-28 fl added __copy__ methods (work in progress) 15 * 2001-05-14 fl fixes for 1.5.2 compatibility 16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis) 17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller) 18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1 19 * 2001-10-21 fl added sub/subn primitive 20 * 2001-10-24 fl added finditer primitive (for 2.2 only) 21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum) 22 * 2002-11-09 fl fixed empty sub/subn return type 23 * 2003-04-18 mvl fully support 4-byte codes 24 * 2003-10-17 gn implemented non recursive scheme 25 * 2013-02-04 mrab added fullmatch primitive 26 * 27 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. 28 * 29 * This version of the SRE library can be redistributed under CNRI's 30 * Python 1.6 license. For any other use, please contact Secret Labs 31 * AB (info (at) pythonware.com). 32 * 33 * Portions of this engine have been developed in cooperation with 34 * CNRI. Hewlett-Packard provided funding for 1.6 integration and 35 * other compatibility work. 
 */

static const char copyright[] =
    " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";

#define PY_SSIZE_T_CLEAN

#include "Python.h"
#include "structmember.h" /* offsetof */

#include "sre.h"

/* number of bits in one SRE_CODE unit */
#define SRE_CODE_BITS (8 * sizeof(SRE_CODE))

#include <ctype.h>

/* name of this module, minus the leading underscore */
#if !defined(SRE_MODULE)
#define SRE_MODULE "sre"
#endif

/* name of the Python module passed to call() when calling back into
   Python-level helpers */
#define SRE_PY_MODULE "re"

/* defining this one enables tracing */
#undef VERBOSE

/* -------------------------------------------------------------------- */

#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#else
#define LOCAL(type) static inline type
#endif

/* error codes */
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
#define SRE_ERROR_STATE -2 /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
#define SRE_ERROR_MEMORY -9 /* out of memory */
#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */

/* TRACE((fmt, ...)) prints via printf when VERBOSE is defined and expands
   to nothing otherwise; note the double parentheses at call sites */
#if defined(VERBOSE)
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif

/* -------------------------------------------------------------------- */
/* search engine state */

/* ASCII-only character predicates: code points >= 128 never match */
#define SRE_IS_DIGIT(ch)\
    ((ch) < 128 && Py_ISDIGIT(ch))
#define SRE_IS_SPACE(ch)\
    ((ch) < 128 && Py_ISSPACE(ch))
#define SRE_IS_LINEBREAK(ch)\
    ((ch) == '\n')
#define SRE_IS_ALNUM(ch)\
    ((ch) < 128 && Py_ISALNUM(ch))
#define SRE_IS_WORD(ch)\
    ((ch) < 128 && (Py_ISALNUM(ch) || (ch) == '_'))

/* Lowercase ch if it is below 128; other values are returned unchanged. */
static unsigned int sre_lower_ascii(unsigned int ch)
{
    return ((ch) < 128 ? Py_TOLOWER(ch) : ch);
}

/* Uppercase ch if it is below 128; other values are returned unchanged. */
static unsigned int sre_upper_ascii(unsigned int ch)
{
    return ((ch) < 128 ? Py_TOUPPER(ch) : ch);
}

/* locale-specific character predicates */
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
 * warnings when c's type supports only numbers < N+1 */
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')

/* Lowercase ch through the C library's tolower() when it is below 256;
   larger values are returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
}

/* Uppercase ch through the C library's toupper() when it is below 256;
   larger values are returned unchanged. */
static unsigned int sre_upper_locale(unsigned int ch)
{
    return ((ch) < 256 ? (unsigned int)toupper((ch)) : ch);
}

/* unicode-specific character predicates */

#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(ch)
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch)
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')

/* Lowercase ch using Py_UNICODE_TOLOWER. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    return (unsigned int) Py_UNICODE_TOLOWER(ch);
}

/* Uppercase ch using Py_UNICODE_TOUPPER. */
static unsigned int sre_upper_unicode(unsigned int ch)
{
    return (unsigned int) Py_UNICODE_TOUPPER(ch);
}

/* Test ch against the character class selected by category (one of the
   SRE_CATEGORY_* codes).  Returns nonzero when ch belongs to the class and
   0 otherwise; a category value not listed below also yields 0. */
LOCAL(int)
sre_category(SRE_CODE category, unsigned int ch)
{
    switch (category) {

    /* ASCII classes */
    case SRE_CATEGORY_DIGIT:
        return SRE_IS_DIGIT(ch);
    case SRE_CATEGORY_NOT_DIGIT:
        return !SRE_IS_DIGIT(ch);
    case SRE_CATEGORY_SPACE:
        return SRE_IS_SPACE(ch);
    case SRE_CATEGORY_NOT_SPACE:
        return !SRE_IS_SPACE(ch);
    case SRE_CATEGORY_WORD:
        return SRE_IS_WORD(ch);
    case SRE_CATEGORY_NOT_WORD:
        return !SRE_IS_WORD(ch);
    case SRE_CATEGORY_LINEBREAK:
        return SRE_IS_LINEBREAK(ch);
    case SRE_CATEGORY_NOT_LINEBREAK:
        return !SRE_IS_LINEBREAK(ch);

    /* locale classes */
    case SRE_CATEGORY_LOC_WORD:
        return SRE_LOC_IS_WORD(ch);
    case SRE_CATEGORY_LOC_NOT_WORD:
        return !SRE_LOC_IS_WORD(ch);

    /* unicode classes */
    case SRE_CATEGORY_UNI_DIGIT:
        return SRE_UNI_IS_DIGIT(ch);
    case SRE_CATEGORY_UNI_NOT_DIGIT:
        return !SRE_UNI_IS_DIGIT(ch);
    case SRE_CATEGORY_UNI_SPACE:
        return SRE_UNI_IS_SPACE(ch);
    case SRE_CATEGORY_UNI_NOT_SPACE:
        return !SRE_UNI_IS_SPACE(ch);
    case SRE_CATEGORY_UNI_WORD:
        return SRE_UNI_IS_WORD(ch);
    case SRE_CATEGORY_UNI_NOT_WORD:
        return !SRE_UNI_IS_WORD(ch);
    case SRE_CATEGORY_UNI_LINEBREAK:
        return SRE_UNI_IS_LINEBREAK(ch);
    case SRE_CATEGORY_UNI_NOT_LINEBREAK:
        return !SRE_UNI_IS_LINEBREAK(ch);
    }
    return 0;
}

/* Case-insensitive comparison under the current C locale: true when ch
   itself, its locale lowercase or its locale uppercase equals pattern. */
LOCAL(int)
char_loc_ignore(SRE_CODE pattern, SRE_CODE ch)
{
    return ch == pattern
        || (SRE_CODE) sre_lower_locale(ch) == pattern
        || (SRE_CODE) sre_upper_locale(ch) == pattern;
}


/* helpers */

/* Free the state's data stack (if any) and reset its size and base to 0. */
static void
data_stack_dealloc(SRE_STATE* state)
{
    if (state->data_stack) {
        PyMem_FREE(state->data_stack);
        state->data_stack = NULL;
    }
    state->data_stack_size = state->data_stack_base = 0;
}

/* Make sure the data stack can hold `size` more bytes above
   data_stack_base.  Returns 0 on success.  If reallocation fails the whole
   stack is released and SRE_ERROR_MEMORY is returned. */
static int
data_stack_grow(SRE_STATE* state, Py_ssize_t size)
{
    Py_ssize_t minsize, cursize;
    minsize = state->data_stack_base+size;
    cursize = state->data_stack_size;
    if (cursize < minsize) {
        void* stack;
        /* grow by 25% plus a fixed 1024 bytes beyond what is needed now */
        cursize = minsize+minsize/4+1024;
        TRACE(("allocate/grow stack %" PY_FORMAT_SIZE_T "d\n", cursize));
        stack = PyMem_REALLOC(state->data_stack, cursize);
        if (!stack) {
            data_stack_dealloc(state);
            return SRE_ERROR_MEMORY;
        }
        state->data_stack = (char *)stack;
        state->data_stack_size = cursize;
    }
    return 0;
}

/* generate 8-bit version */

#define SRE_CHAR Py_UCS1
#define SIZEOF_SRE_CHAR 1
#define SRE(F) sre_ucs1_##F
#include "sre_lib.h"

/* generate 16-bit unicode version */

#define SRE_CHAR Py_UCS2
#define SIZEOF_SRE_CHAR 2
#define SRE(F) sre_ucs2_##F
#include "sre_lib.h"

/* generate 32-bit unicode version */

#define SRE_CHAR Py_UCS4
#define SIZEOF_SRE_CHAR 4
#define SRE(F) sre_ucs4_##F
#include "sre_lib.h"

/* -------------------------------------------------------------------- */
/* factories and destructors */

/* see sre.h for object declarations */
static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, Py_ssize_t);
static PyObject *pattern_scanner(PatternObject *, PyObject *, Py_ssize_t, Py_ssize_t);


/*[clinic input]
module _sre
class _sre.SRE_Pattern "PatternObject *" "&Pattern_Type"
class _sre.SRE_Match "MatchObject *" "&Match_Type"
class _sre.SRE_Scanner "ScannerObject *" "&Scanner_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=b0230ec19a0deac8]*/

static PyTypeObject Pattern_Type;
static PyTypeObject Match_Type;
static PyTypeObject Scanner_Type;

/*[clinic input]
_sre.getcodesize -> int
[clinic start generated code]*/

static int
_sre_getcodesize_impl(PyObject *module)
/*[clinic end generated code: output=e0db7ce34a6dd7b1 input=bd6f6ecf4916bb2b]*/
{
    /* size in bytes of one compiled pattern code unit */
    return sizeof(SRE_CODE);
}

/*[clinic input]
_sre.ascii_iscased -> bool

    character: int
    /

[clinic start generated code]*/

static int
_sre_ascii_iscased_impl(PyObject *module, int character)
/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
{
    /* cased == the ASCII lower or upper mapping changes the character */
    unsigned int ch = (unsigned int)character;
    return ch != sre_lower_ascii(ch) || ch != sre_upper_ascii(ch);
}

/*[clinic input]
_sre.unicode_iscased -> bool

    character: int
    /

[clinic start generated code]*/

static int
_sre_unicode_iscased_impl(PyObject *module, int character)
/*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
{
    /* cased == the Unicode lower or upper mapping changes the character */
    unsigned int ch = (unsigned int)character;
    return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch);
}

/*[clinic input]
_sre.ascii_tolower -> int

    character: int
    /

[clinic start generated code]*/

static int
_sre_ascii_tolower_impl(PyObject *module, int character)
/*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/
{
    /* thin wrapper exposing sre_lower_ascii() */
    return sre_lower_ascii(character);
}

/*[clinic input]
_sre.unicode_tolower -> int

    character: int
    /

[clinic start generated code]*/

static int
_sre_unicode_tolower_impl(PyObject *module, int character)
/*[clinic end generated code: output=6422272d7d7fee65 input=91d708c5f3c2045a]*/
{
    /* thin wrapper exposing sre_lower_unicode() */
    return sre_lower_unicode(character);
}

/* Prepare `state` for another search over the same string: forget the
   last mark/index and the repeat context, and free the data stack.  The
   string, buffer and mark array set up by state_init() are kept. */
LOCAL(void)
state_reset(SRE_STATE* state)
{
    /* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
    /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/

    state->lastmark = -1;
    state->lastindex = -1;

    state->repeat = NULL;

    data_stack_dealloc(state);
}

/* Obtain the raw character data of `string`.

   On success returns the data pointer and fills in *p_length (in
   characters), *p_isbytes (0 for str, 1 for bytes-like) and *p_charsize.
   For bytes-like objects the buffer is acquired into *view and is still
   held when this function returns successfully; `view` is not touched for
   str objects.

   On failure returns NULL with an exception set; in that case the output
   parameters may not have been written. */
static void*
getstring(PyObject* string, Py_ssize_t* p_length,
          int* p_isbytes, int* p_charsize,
          Py_buffer *view)
{
    /* given a python object, return a data pointer, a length (in
       characters), and a character size.  return NULL if the object
       is not a string (or not compatible) */

    /* Unicode objects do not support the buffer API. So, get the data
       directly instead. */
    if (PyUnicode_Check(string)) {
        if (PyUnicode_READY(string) == -1)
            return NULL;
        *p_length = PyUnicode_GET_LENGTH(string);
        *p_charsize = PyUnicode_KIND(string);
        *p_isbytes = 0;
        return PyUnicode_DATA(string);
    }

    /* get pointer to byte string buffer */
    if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
        /* replaces whatever error GetBuffer raised with a TypeError */
        PyErr_SetString(PyExc_TypeError, "expected string or bytes-like object");
        return NULL;
    }

    *p_length = view->len;
    *p_charsize = 1;
    *p_isbytes = 1;

    if (view->buf == NULL) {
        PyErr_SetString(PyExc_ValueError, "Buffer is NULL");
        PyBuffer_Release(view);
        /* reset so callers testing view->buf do not release it again */
        view->buf = NULL;
        return NULL;
    }
    return view->buf;
}

/* Initialise `state` for matching `pattern` against `string` within
   [start, end).

   Allocates the mark array (2 * pattern->groups pointers), acquires the
   string data through getstring(), rejects a str/bytes mismatch between
   pattern and string, clamps start and end to [0, length], and stores a
   new reference to `string` in the state.

   Returns `string` on success.  On failure returns NULL with an exception
   set, after freeing the mark array and releasing any acquired buffer. */
LOCAL(PyObject*)
state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
           Py_ssize_t start, Py_ssize_t end)
{
    /* prepare state object */

    Py_ssize_t length;
    int isbytes, charsize;
    void* ptr;

    memset(state, 0, sizeof(SRE_STATE));

    state->mark = PyMem_New(void *, pattern->groups * 2);
    if (!state->mark) {
        PyErr_NoMemory();
        goto err;
    }
    state->lastmark = -1;
    state->lastindex = -1;

    /* buffer.buf != NULL is what the cleanup code uses to decide whether
       the buffer must be released */
    state->buffer.buf = NULL;
    ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer);
    if (!ptr)
        goto err;

    /* pattern->isbytes: 0 is rejected for bytes input, > 0 is rejected for
       str input; any other value passes both checks */
    if (isbytes && pattern->isbytes == 0) {
        PyErr_SetString(PyExc_TypeError,
                        "cannot use a string pattern on a bytes-like object");
        goto err;
    }
    if (!isbytes && pattern->isbytes > 0) {
        PyErr_SetString(PyExc_TypeError,
                        "cannot use a bytes pattern on a string-like object");
        goto err;
    }

    /* adjust boundaries */
    if (start < 0)
        start = 0;
    else if (start > length)
        start = length;

    if (end < 0)
        end = 0;
    else if (end > length)
        end = length;

    state->isbytes = isbytes;
    state->charsize = charsize;
    state->match_all = 0;
    state->must_advance = 0;

    state->beginning = ptr;

    /* start/end are stored as raw pointers into the character data */
    state->start = (void*) ((char*) ptr + start * state->charsize);
    state->end = (void*) ((char*) ptr + end * state->charsize);

    Py_INCREF(string);
    state->string = string;
    state->pos = start;
    state->endpos = end;

    return string;
  err:
    PyMem_Del(state->mark);
    state->mark = NULL;
    if (state->buffer.buf)
        PyBuffer_Release(&state->buffer);
    return NULL;
}

/* Release everything state_init() acquired: the buffer (if one is held),
   the reference to the string, the data stack and the mark array. */
LOCAL(void)
state_fini(SRE_STATE* state)
{
    if (state->buffer.buf)
        PyBuffer_Release(&state->buffer);
    Py_XDECREF(state->string);
    data_stack_dealloc(state);
    PyMem_Del(state->mark);
    state->mark = NULL;
}

/* calculate offset from start of string */
#define STATE_OFFSET(state, member)\
    (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)

/* Return string[start:end] as a new reference.

   For bytes input the slice is copied from `ptr` (the start of the byte
   data), except that an exact bytes object sliced over its full length is
   returned itself.  For str input the slice is taken from `string` and
   `ptr` is not used. */
LOCAL(PyObject*)
getslice(int isbytes, const void *ptr,
         PyObject* string, Py_ssize_t start, Py_ssize_t end)
{
    if (isbytes) {
        if (PyBytes_CheckExact(string) &&
            start == 0 && end == PyBytes_GET_SIZE(string)) {
            Py_INCREF(string);
            return string;
        }
        return PyBytes_FromStringAndSize(
                (const char *)ptr + start, end - start);
    }
    else {
        return PyUnicode_Substring(string, start, end);
    }
}

/* Return the text matched by group `index` (1-based) as a new reference.

   When `string` is None, or the group's marks are unset or beyond
   state->lastmark, the result is an empty slice if `empty` is true and
   None otherwise. */
LOCAL(PyObject*)
state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
{
    Py_ssize_t i, j;

    /* group N uses mark slots 2*(N-1) and 2*(N-1)+1 */
    index = (index - 1) * 2;

    if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
        if (empty)
            /* want empty string */
            i = j = 0;
        else {
            Py_RETURN_NONE;
        }
    } else {
        i = STATE_OFFSET(state, state->mark[index]);
        j = STATE_OFFSET(state, state->mark[index+1]);
    }

    return getslice(state->isbytes, state->beginning, string, i, j);
}

/* Translate a negative engine status code into a Python exception.
   SRE_ERROR_INTERRUPTED sets nothing because an exception is already
   pending. */
static void
pattern_error(Py_ssize_t status)
{
    switch (status) {
    case SRE_ERROR_RECURSION_LIMIT:
        /* This error code seems to be unused. */
        PyErr_SetString(
            PyExc_RecursionError,
            "maximum recursion limit exceeded"
            );
        break;
    case SRE_ERROR_MEMORY:
        PyErr_NoMemory();
        break;
    case SRE_ERROR_INTERRUPTED:
    /* An exception has already been raised, so let it fly */
        break;
    default:
        /* other error codes indicate compiler/engine bugs */
        PyErr_SetString(
            PyExc_RuntimeError,
            "internal error in regular expression engine"
            );
    }
}

/* Destructor for pattern objects: clears weak references, drops the
   references held in pattern, groupindex and indexgroup, and frees the
   object. */
static void
pattern_dealloc(PatternObject* self)
{
    if (self->weakreflist != NULL)
        PyObject_ClearWeakRefs((PyObject *) self);
    Py_XDECREF(self->pattern);
    Py_XDECREF(self->groupindex);
    Py_XDECREF(self->indexgroup);
    PyObject_DEL(self);
}

/* Run the match routine generated for the state's character width
   (1, 2 or 4 bytes per character) and return its status. */
LOCAL(Py_ssize_t)
sre_match(SRE_STATE* state, SRE_CODE* pattern)
{
    if (state->charsize == 1)
        return sre_ucs1_match(state, pattern, 1);
    if (state->charsize == 2)
        return sre_ucs2_match(state, pattern, 1);
    assert(state->charsize == 4);
    return sre_ucs4_match(state, pattern, 1);
}

/* Run the search routine generated for the state's character width
   (1, 2 or 4 bytes per character) and return its status. */
LOCAL(Py_ssize_t)
sre_search(SRE_STATE* state, SRE_CODE* pattern)
{
    if (state->charsize == 1)
        return sre_ucs1_search(state, pattern);
    if (state->charsize == 2)
        return sre_ucs2_search(state, pattern);
    assert(state->charsize == 4);
    return sre_ucs4_search(state, pattern);
}

/*[clinic input]
_sre.SRE_Pattern.match

    string: object
    pos: Py_ssize_t = 0
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize

Matches zero or more characters at the beginning of the string.
593 [clinic start generated code]*/ 594 595 static PyObject * 596 _sre_SRE_Pattern_match_impl(PatternObject *self, PyObject *string, 597 Py_ssize_t pos, Py_ssize_t endpos) 598 /*[clinic end generated code: output=ea2d838888510661 input=a2ba191647abebe5]*/ 599 { 600 SRE_STATE state; 601 Py_ssize_t status; 602 PyObject *match; 603 604 if (!state_init(&state, (PatternObject *)self, string, pos, endpos)) 605 return NULL; 606 607 state.ptr = state.start; 608 609 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr)); 610 611 status = sre_match(&state, PatternObject_GetCode(self)); 612 613 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); 614 if (PyErr_Occurred()) { 615 state_fini(&state); 616 return NULL; 617 } 618 619 match = pattern_new_match(self, &state, status); 620 state_fini(&state); 621 return match; 622 } 623 624 /*[clinic input] 625 _sre.SRE_Pattern.fullmatch 626 627 string: object 628 pos: Py_ssize_t = 0 629 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize 630 631 Matches against all of the string. 
632 [clinic start generated code]*/ 633 634 static PyObject * 635 _sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyObject *string, 636 Py_ssize_t pos, Py_ssize_t endpos) 637 /*[clinic end generated code: output=5833c47782a35f4a input=d9fb03a7625b5828]*/ 638 { 639 SRE_STATE state; 640 Py_ssize_t status; 641 PyObject *match; 642 643 if (!state_init(&state, self, string, pos, endpos)) 644 return NULL; 645 646 state.ptr = state.start; 647 648 TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr)); 649 650 state.match_all = 1; 651 status = sre_match(&state, PatternObject_GetCode(self)); 652 653 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); 654 if (PyErr_Occurred()) { 655 state_fini(&state); 656 return NULL; 657 } 658 659 match = pattern_new_match(self, &state, status); 660 state_fini(&state); 661 return match; 662 } 663 664 /*[clinic input] 665 _sre.SRE_Pattern.search 666 667 string: object 668 pos: Py_ssize_t = 0 669 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize 670 671 Scan through string looking for a match, and return a corresponding match object instance. 672 673 Return None if no position in the string matches. 
674 [clinic start generated code]*/ 675 676 static PyObject * 677 _sre_SRE_Pattern_search_impl(PatternObject *self, PyObject *string, 678 Py_ssize_t pos, Py_ssize_t endpos) 679 /*[clinic end generated code: output=25f302a644e951e8 input=4ae5cb7dc38fed1b]*/ 680 { 681 SRE_STATE state; 682 Py_ssize_t status; 683 PyObject *match; 684 685 if (!state_init(&state, self, string, pos, endpos)) 686 return NULL; 687 688 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr)); 689 690 status = sre_search(&state, PatternObject_GetCode(self)); 691 692 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); 693 694 if (PyErr_Occurred()) { 695 state_fini(&state); 696 return NULL; 697 } 698 699 match = pattern_new_match(self, &state, status); 700 state_fini(&state); 701 return match; 702 } 703 704 static PyObject* 705 call(const char* module, const char* function, PyObject* args) 706 { 707 PyObject* name; 708 PyObject* mod; 709 PyObject* func; 710 PyObject* result; 711 712 if (!args) 713 return NULL; 714 name = PyUnicode_FromString(module); 715 if (!name) 716 return NULL; 717 mod = PyImport_Import(name); 718 Py_DECREF(name); 719 if (!mod) 720 return NULL; 721 func = PyObject_GetAttrString(mod, function); 722 Py_DECREF(mod); 723 if (!func) 724 return NULL; 725 result = PyObject_CallObject(func, args); 726 Py_DECREF(func); 727 Py_DECREF(args); 728 return result; 729 } 730 731 /*[clinic input] 732 _sre.SRE_Pattern.findall 733 734 string: object 735 pos: Py_ssize_t = 0 736 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize 737 738 Return a list of all non-overlapping matches of pattern in string. 
739 [clinic start generated code]*/ 740 741 static PyObject * 742 _sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string, 743 Py_ssize_t pos, Py_ssize_t endpos) 744 /*[clinic end generated code: output=f4966baceea60aca input=5b6a4ee799741563]*/ 745 { 746 SRE_STATE state; 747 PyObject* list; 748 Py_ssize_t status; 749 Py_ssize_t i, b, e; 750 751 if (!state_init(&state, self, string, pos, endpos)) 752 return NULL; 753 754 list = PyList_New(0); 755 if (!list) { 756 state_fini(&state); 757 return NULL; 758 } 759 760 while (state.start <= state.end) { 761 762 PyObject* item; 763 764 state_reset(&state); 765 766 state.ptr = state.start; 767 768 status = sre_search(&state, PatternObject_GetCode(self)); 769 if (PyErr_Occurred()) 770 goto error; 771 772 if (status <= 0) { 773 if (status == 0) 774 break; 775 pattern_error(status); 776 goto error; 777 } 778 779 /* don't bother to build a match object */ 780 switch (self->groups) { 781 case 0: 782 b = STATE_OFFSET(&state, state.start); 783 e = STATE_OFFSET(&state, state.ptr); 784 item = getslice(state.isbytes, state.beginning, 785 string, b, e); 786 if (!item) 787 goto error; 788 break; 789 case 1: 790 item = state_getslice(&state, 1, string, 1); 791 if (!item) 792 goto error; 793 break; 794 default: 795 item = PyTuple_New(self->groups); 796 if (!item) 797 goto error; 798 for (i = 0; i < self->groups; i++) { 799 PyObject* o = state_getslice(&state, i+1, string, 1); 800 if (!o) { 801 Py_DECREF(item); 802 goto error; 803 } 804 PyTuple_SET_ITEM(item, i, o); 805 } 806 break; 807 } 808 809 status = PyList_Append(list, item); 810 Py_DECREF(item); 811 if (status < 0) 812 goto error; 813 814 state.must_advance = (state.ptr == state.start); 815 state.start = state.ptr; 816 } 817 818 state_fini(&state); 819 return list; 820 821 error: 822 Py_DECREF(list); 823 state_fini(&state); 824 return NULL; 825 826 } 827 828 /*[clinic input] 829 _sre.SRE_Pattern.finditer 830 831 string: object 832 pos: Py_ssize_t = 0 833 endpos: 
Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize 834 835 Return an iterator over all non-overlapping matches for the RE pattern in string. 836 837 For each match, the iterator returns a match object. 838 [clinic start generated code]*/ 839 840 static PyObject * 841 _sre_SRE_Pattern_finditer_impl(PatternObject *self, PyObject *string, 842 Py_ssize_t pos, Py_ssize_t endpos) 843 /*[clinic end generated code: output=0bbb1a0aeb38bb14 input=612aab69e9fe08e4]*/ 844 { 845 PyObject* scanner; 846 PyObject* search; 847 PyObject* iterator; 848 849 scanner = pattern_scanner(self, string, pos, endpos); 850 if (!scanner) 851 return NULL; 852 853 search = PyObject_GetAttrString(scanner, "search"); 854 Py_DECREF(scanner); 855 if (!search) 856 return NULL; 857 858 iterator = PyCallIter_New(search, Py_None); 859 Py_DECREF(search); 860 861 return iterator; 862 } 863 864 /*[clinic input] 865 _sre.SRE_Pattern.scanner 866 867 string: object 868 pos: Py_ssize_t = 0 869 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize 870 871 [clinic start generated code]*/ 872 873 static PyObject * 874 _sre_SRE_Pattern_scanner_impl(PatternObject *self, PyObject *string, 875 Py_ssize_t pos, Py_ssize_t endpos) 876 /*[clinic end generated code: output=54ea548aed33890b input=3aacdbde77a3a637]*/ 877 { 878 return pattern_scanner(self, string, pos, endpos); 879 } 880 881 /*[clinic input] 882 _sre.SRE_Pattern.split 883 884 string: object 885 maxsplit: Py_ssize_t = 0 886 887 Split string by the occurrences of pattern. 
888 [clinic start generated code]*/ 889 890 static PyObject * 891 _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string, 892 Py_ssize_t maxsplit) 893 /*[clinic end generated code: output=7ac66f381c45e0be input=1eeeb10dafc9947a]*/ 894 { 895 SRE_STATE state; 896 PyObject* list; 897 PyObject* item; 898 Py_ssize_t status; 899 Py_ssize_t n; 900 Py_ssize_t i; 901 void* last; 902 903 assert(self->codesize != 0); 904 905 if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX)) 906 return NULL; 907 908 list = PyList_New(0); 909 if (!list) { 910 state_fini(&state); 911 return NULL; 912 } 913 914 n = 0; 915 last = state.start; 916 917 while (!maxsplit || n < maxsplit) { 918 919 state_reset(&state); 920 921 state.ptr = state.start; 922 923 status = sre_search(&state, PatternObject_GetCode(self)); 924 if (PyErr_Occurred()) 925 goto error; 926 927 if (status <= 0) { 928 if (status == 0) 929 break; 930 pattern_error(status); 931 goto error; 932 } 933 934 /* get segment before this match */ 935 item = getslice(state.isbytes, state.beginning, 936 string, STATE_OFFSET(&state, last), 937 STATE_OFFSET(&state, state.start) 938 ); 939 if (!item) 940 goto error; 941 status = PyList_Append(list, item); 942 Py_DECREF(item); 943 if (status < 0) 944 goto error; 945 946 /* add groups (if any) */ 947 for (i = 0; i < self->groups; i++) { 948 item = state_getslice(&state, i+1, string, 0); 949 if (!item) 950 goto error; 951 status = PyList_Append(list, item); 952 Py_DECREF(item); 953 if (status < 0) 954 goto error; 955 } 956 957 n = n + 1; 958 state.must_advance = (state.ptr == state.start); 959 last = state.start = state.ptr; 960 961 } 962 963 /* get segment following last match (even if empty) */ 964 item = getslice(state.isbytes, state.beginning, 965 string, STATE_OFFSET(&state, last), state.endpos 966 ); 967 if (!item) 968 goto error; 969 status = PyList_Append(list, item); 970 Py_DECREF(item); 971 if (status < 0) 972 goto error; 973 974 state_fini(&state); 975 return list; 976 
977 error: 978 Py_DECREF(list); 979 state_fini(&state); 980 return NULL; 981 982 } 983 984 static PyObject* 985 pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string, 986 Py_ssize_t count, Py_ssize_t subn) 987 { 988 SRE_STATE state; 989 PyObject* list; 990 PyObject* joiner; 991 PyObject* item; 992 PyObject* filter; 993 PyObject* match; 994 void* ptr; 995 Py_ssize_t status; 996 Py_ssize_t n; 997 Py_ssize_t i, b, e; 998 int isbytes, charsize; 999 int filter_is_callable; 1000 Py_buffer view; 1001 1002 if (PyCallable_Check(ptemplate)) { 1003 /* sub/subn takes either a function or a template */ 1004 filter = ptemplate; 1005 Py_INCREF(filter); 1006 filter_is_callable = 1; 1007 } else { 1008 /* if not callable, check if it's a literal string */ 1009 int literal; 1010 view.buf = NULL; 1011 ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view); 1012 b = charsize; 1013 if (ptr) { 1014 if (charsize == 1) 1015 literal = memchr(ptr, '\\', n) == NULL; 1016 else 1017 literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1; 1018 } else { 1019 PyErr_Clear(); 1020 literal = 0; 1021 } 1022 if (view.buf) 1023 PyBuffer_Release(&view); 1024 if (literal) { 1025 filter = ptemplate; 1026 Py_INCREF(filter); 1027 filter_is_callable = 0; 1028 } else { 1029 /* not a literal; hand it over to the template compiler */ 1030 filter = call( 1031 SRE_PY_MODULE, "_subx", 1032 PyTuple_Pack(2, self, ptemplate) 1033 ); 1034 if (!filter) 1035 return NULL; 1036 filter_is_callable = PyCallable_Check(filter); 1037 } 1038 } 1039 1040 if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX)) { 1041 Py_DECREF(filter); 1042 return NULL; 1043 } 1044 1045 list = PyList_New(0); 1046 if (!list) { 1047 Py_DECREF(filter); 1048 state_fini(&state); 1049 return NULL; 1050 } 1051 1052 n = i = 0; 1053 1054 while (!count || n < count) { 1055 1056 state_reset(&state); 1057 1058 state.ptr = state.start; 1059 1060 status = sre_search(&state, PatternObject_GetCode(self)); 1061 if (PyErr_Occurred()) 
1062 goto error; 1063 1064 if (status <= 0) { 1065 if (status == 0) 1066 break; 1067 pattern_error(status); 1068 goto error; 1069 } 1070 1071 b = STATE_OFFSET(&state, state.start); 1072 e = STATE_OFFSET(&state, state.ptr); 1073 1074 if (i < b) { 1075 /* get segment before this match */ 1076 item = getslice(state.isbytes, state.beginning, 1077 string, i, b); 1078 if (!item) 1079 goto error; 1080 status = PyList_Append(list, item); 1081 Py_DECREF(item); 1082 if (status < 0) 1083 goto error; 1084 1085 } 1086 1087 if (filter_is_callable) { 1088 /* pass match object through filter */ 1089 match = pattern_new_match(self, &state, 1); 1090 if (!match) 1091 goto error; 1092 item = PyObject_CallFunctionObjArgs(filter, match, NULL); 1093 Py_DECREF(match); 1094 if (!item) 1095 goto error; 1096 } else { 1097 /* filter is literal string */ 1098 item = filter; 1099 Py_INCREF(item); 1100 } 1101 1102 /* add to list */ 1103 if (item != Py_None) { 1104 status = PyList_Append(list, item); 1105 Py_DECREF(item); 1106 if (status < 0) 1107 goto error; 1108 } 1109 1110 i = e; 1111 n = n + 1; 1112 state.must_advance = (state.ptr == state.start); 1113 state.start = state.ptr; 1114 } 1115 1116 /* get segment following last match */ 1117 if (i < state.endpos) { 1118 item = getslice(state.isbytes, state.beginning, 1119 string, i, state.endpos); 1120 if (!item) 1121 goto error; 1122 status = PyList_Append(list, item); 1123 Py_DECREF(item); 1124 if (status < 0) 1125 goto error; 1126 } 1127 1128 state_fini(&state); 1129 1130 Py_DECREF(filter); 1131 1132 /* convert list to single string (also removes list) */ 1133 joiner = getslice(state.isbytes, state.beginning, string, 0, 0); 1134 if (!joiner) { 1135 Py_DECREF(list); 1136 return NULL; 1137 } 1138 if (PyList_GET_SIZE(list) == 0) { 1139 Py_DECREF(list); 1140 item = joiner; 1141 } 1142 else { 1143 if (state.isbytes) 1144 item = _PyBytes_Join(joiner, list); 1145 else 1146 item = PyUnicode_Join(joiner, list); 1147 Py_DECREF(joiner); 1148 
Py_DECREF(list); 1149 if (!item) 1150 return NULL; 1151 } 1152 1153 if (subn) 1154 return Py_BuildValue("Nn", item, n); 1155 1156 return item; 1157 1158 error: 1159 Py_DECREF(list); 1160 state_fini(&state); 1161 Py_DECREF(filter); 1162 return NULL; 1163 1164 } 1165 1166 /*[clinic input] 1167 _sre.SRE_Pattern.sub 1168 1169 repl: object 1170 string: object 1171 count: Py_ssize_t = 0 1172 1173 Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl. 1174 [clinic start generated code]*/ 1175 1176 static PyObject * 1177 _sre_SRE_Pattern_sub_impl(PatternObject *self, PyObject *repl, 1178 PyObject *string, Py_ssize_t count) 1179 /*[clinic end generated code: output=1dbf2ec3479cba00 input=c53d70be0b3caf86]*/ 1180 { 1181 return pattern_subx(self, repl, string, count, 0); 1182 } 1183 1184 /*[clinic input] 1185 _sre.SRE_Pattern.subn 1186 1187 repl: object 1188 string: object 1189 count: Py_ssize_t = 0 1190 1191 Return the tuple (new_string, number_of_subs_made) found by replacing the leftmost non-overlapping occurrences of pattern with the replacement repl. 
1192 [clinic start generated code]*/ 1193 1194 static PyObject * 1195 _sre_SRE_Pattern_subn_impl(PatternObject *self, PyObject *repl, 1196 PyObject *string, Py_ssize_t count) 1197 /*[clinic end generated code: output=0d9522cd529e9728 input=e7342d7ce6083577]*/ 1198 { 1199 return pattern_subx(self, repl, string, count, 1); 1200 } 1201 1202 /*[clinic input] 1203 _sre.SRE_Pattern.__copy__ 1204 1205 [clinic start generated code]*/ 1206 1207 static PyObject * 1208 _sre_SRE_Pattern___copy___impl(PatternObject *self) 1209 /*[clinic end generated code: output=85dedc2db1bd8694 input=a730a59d863bc9f5]*/ 1210 { 1211 Py_INCREF(self); 1212 return (PyObject *)self; 1213 } 1214 1215 /*[clinic input] 1216 _sre.SRE_Pattern.__deepcopy__ 1217 1218 memo: object 1219 / 1220 1221 [clinic start generated code]*/ 1222 1223 static PyObject * 1224 _sre_SRE_Pattern___deepcopy__(PatternObject *self, PyObject *memo) 1225 /*[clinic end generated code: output=2ad25679c1f1204a input=a465b1602f997bed]*/ 1226 { 1227 Py_INCREF(self); 1228 return (PyObject *)self; 1229 } 1230 1231 static PyObject * 1232 pattern_repr(PatternObject *obj) 1233 { 1234 static const struct { 1235 const char *name; 1236 int value; 1237 } flag_names[] = { 1238 {"re.TEMPLATE", SRE_FLAG_TEMPLATE}, 1239 {"re.IGNORECASE", SRE_FLAG_IGNORECASE}, 1240 {"re.LOCALE", SRE_FLAG_LOCALE}, 1241 {"re.MULTILINE", SRE_FLAG_MULTILINE}, 1242 {"re.DOTALL", SRE_FLAG_DOTALL}, 1243 {"re.UNICODE", SRE_FLAG_UNICODE}, 1244 {"re.VERBOSE", SRE_FLAG_VERBOSE}, 1245 {"re.DEBUG", SRE_FLAG_DEBUG}, 1246 {"re.ASCII", SRE_FLAG_ASCII}, 1247 }; 1248 PyObject *result = NULL; 1249 PyObject *flag_items; 1250 size_t i; 1251 int flags = obj->flags; 1252 1253 /* Omit re.UNICODE for valid string patterns. 
*/ 1254 if (obj->isbytes == 0 && 1255 (flags & (SRE_FLAG_LOCALE|SRE_FLAG_UNICODE|SRE_FLAG_ASCII)) == 1256 SRE_FLAG_UNICODE) 1257 flags &= ~SRE_FLAG_UNICODE; 1258 1259 flag_items = PyList_New(0); 1260 if (!flag_items) 1261 return NULL; 1262 1263 for (i = 0; i < Py_ARRAY_LENGTH(flag_names); i++) { 1264 if (flags & flag_names[i].value) { 1265 PyObject *item = PyUnicode_FromString(flag_names[i].name); 1266 if (!item) 1267 goto done; 1268 1269 if (PyList_Append(flag_items, item) < 0) { 1270 Py_DECREF(item); 1271 goto done; 1272 } 1273 Py_DECREF(item); 1274 flags &= ~flag_names[i].value; 1275 } 1276 } 1277 if (flags) { 1278 PyObject *item = PyUnicode_FromFormat("0x%x", flags); 1279 if (!item) 1280 goto done; 1281 1282 if (PyList_Append(flag_items, item) < 0) { 1283 Py_DECREF(item); 1284 goto done; 1285 } 1286 Py_DECREF(item); 1287 } 1288 1289 if (PyList_Size(flag_items) > 0) { 1290 PyObject *flags_result; 1291 PyObject *sep = PyUnicode_FromString("|"); 1292 if (!sep) 1293 goto done; 1294 flags_result = PyUnicode_Join(sep, flag_items); 1295 Py_DECREF(sep); 1296 if (!flags_result) 1297 goto done; 1298 result = PyUnicode_FromFormat("re.compile(%.200R, %S)", 1299 obj->pattern, flags_result); 1300 Py_DECREF(flags_result); 1301 } 1302 else { 1303 result = PyUnicode_FromFormat("re.compile(%.200R)", obj->pattern); 1304 } 1305 1306 done: 1307 Py_DECREF(flag_items); 1308 return result; 1309 } 1310 1311 PyDoc_STRVAR(pattern_doc, "Compiled regular expression object."); 1312 1313 /* PatternObject's 'groupindex' method. 
*/ 1314 static PyObject * 1315 pattern_groupindex(PatternObject *self, void *Py_UNUSED(ignored)) 1316 { 1317 if (self->groupindex == NULL) 1318 return PyDict_New(); 1319 return PyDictProxy_New(self->groupindex); 1320 } 1321 1322 static int _validate(PatternObject *self); /* Forward */ 1323 1324 /*[clinic input] 1325 _sre.compile 1326 1327 pattern: object 1328 flags: int 1329 code: object(subclass_of='&PyList_Type') 1330 groups: Py_ssize_t 1331 groupindex: object(subclass_of='&PyDict_Type') 1332 indexgroup: object(subclass_of='&PyTuple_Type') 1333 1334 [clinic start generated code]*/ 1335 1336 static PyObject * 1337 _sre_compile_impl(PyObject *module, PyObject *pattern, int flags, 1338 PyObject *code, Py_ssize_t groups, PyObject *groupindex, 1339 PyObject *indexgroup) 1340 /*[clinic end generated code: output=ef9c2b3693776404 input=0a68476dbbe5db30]*/ 1341 { 1342 /* "compile" pattern descriptor to pattern object */ 1343 1344 PatternObject* self; 1345 Py_ssize_t i, n; 1346 1347 n = PyList_GET_SIZE(code); 1348 /* coverity[ampersand_in_size] */ 1349 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n); 1350 if (!self) 1351 return NULL; 1352 self->weakreflist = NULL; 1353 self->pattern = NULL; 1354 self->groupindex = NULL; 1355 self->indexgroup = NULL; 1356 1357 self->codesize = n; 1358 1359 for (i = 0; i < n; i++) { 1360 PyObject *o = PyList_GET_ITEM(code, i); 1361 unsigned long value = PyLong_AsUnsignedLong(o); 1362 self->code[i] = (SRE_CODE) value; 1363 if ((unsigned long) self->code[i] != value) { 1364 PyErr_SetString(PyExc_OverflowError, 1365 "regular expression code size limit exceeded"); 1366 break; 1367 } 1368 } 1369 1370 if (PyErr_Occurred()) { 1371 Py_DECREF(self); 1372 return NULL; 1373 } 1374 1375 if (pattern == Py_None) { 1376 self->isbytes = -1; 1377 } 1378 else { 1379 Py_ssize_t p_length; 1380 int charsize; 1381 Py_buffer view; 1382 view.buf = NULL; 1383 if (!getstring(pattern, &p_length, &self->isbytes, 1384 &charsize, &view)) { 1385 Py_DECREF(self); 
1386 return NULL; 1387 } 1388 if (view.buf) 1389 PyBuffer_Release(&view); 1390 } 1391 1392 Py_INCREF(pattern); 1393 self->pattern = pattern; 1394 1395 self->flags = flags; 1396 1397 self->groups = groups; 1398 1399 if (PyDict_GET_SIZE(groupindex) > 0) { 1400 Py_INCREF(groupindex); 1401 self->groupindex = groupindex; 1402 if (PyTuple_GET_SIZE(indexgroup) > 0) { 1403 Py_INCREF(indexgroup); 1404 self->indexgroup = indexgroup; 1405 } 1406 } 1407 1408 if (!_validate(self)) { 1409 Py_DECREF(self); 1410 return NULL; 1411 } 1412 1413 return (PyObject*) self; 1414 } 1415 1416 /* -------------------------------------------------------------------- */ 1417 /* Code validation */ 1418 1419 /* To learn more about this code, have a look at the _compile() function in 1420 Lib/sre_compile.py. The validation functions below checks the code array 1421 for conformance with the code patterns generated there. 1422 1423 The nice thing about the generated code is that it is position-independent: 1424 all jumps are relative jumps forward. Also, jumps don't cross each other: 1425 the target of a later jump is always earlier than the target of an earlier 1426 jump. IOW, this is okay: 1427 1428 J---------J-------T--------T 1429 \ \_____/ / 1430 \______________________/ 1431 1432 but this is not: 1433 1434 J---------J-------T--------T 1435 \_________\_____/ / 1436 \____________/ 1437 1438 It also helps that SRE_CODE is always an unsigned type. 
1439 */ 1440 1441 /* Defining this one enables tracing of the validator */ 1442 #undef VVERBOSE 1443 1444 /* Trace macro for the validator */ 1445 #if defined(VVERBOSE) 1446 #define VTRACE(v) printf v 1447 #else 1448 #define VTRACE(v) do {} while(0) /* do nothing */ 1449 #endif 1450 1451 /* Report failure */ 1452 #define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0) 1453 1454 /* Extract opcode, argument, or skip count from code array */ 1455 #define GET_OP \ 1456 do { \ 1457 VTRACE(("%p: ", code)); \ 1458 if (code >= end) FAIL; \ 1459 op = *code++; \ 1460 VTRACE(("%lu (op)\n", (unsigned long)op)); \ 1461 } while (0) 1462 #define GET_ARG \ 1463 do { \ 1464 VTRACE(("%p= ", code)); \ 1465 if (code >= end) FAIL; \ 1466 arg = *code++; \ 1467 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \ 1468 } while (0) 1469 #define GET_SKIP_ADJ(adj) \ 1470 do { \ 1471 VTRACE(("%p= ", code)); \ 1472 if (code >= end) FAIL; \ 1473 skip = *code; \ 1474 VTRACE(("%lu (skip to %p)\n", \ 1475 (unsigned long)skip, code+skip)); \ 1476 if (skip-adj > (uintptr_t)(end - code)) \ 1477 FAIL; \ 1478 code++; \ 1479 } while (0) 1480 #define GET_SKIP GET_SKIP_ADJ(0) 1481 1482 static int 1483 _validate_charset(SRE_CODE *code, SRE_CODE *end) 1484 { 1485 /* Some variables are manipulated by the macros above */ 1486 SRE_CODE op; 1487 SRE_CODE arg; 1488 SRE_CODE offset; 1489 int i; 1490 1491 while (code < end) { 1492 GET_OP; 1493 switch (op) { 1494 1495 case SRE_OP_NEGATE: 1496 break; 1497 1498 case SRE_OP_LITERAL: 1499 GET_ARG; 1500 break; 1501 1502 case SRE_OP_RANGE: 1503 case SRE_OP_RANGE_UNI_IGNORE: 1504 GET_ARG; 1505 GET_ARG; 1506 break; 1507 1508 case SRE_OP_CHARSET: 1509 offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */ 1510 if (offset > (uintptr_t)(end - code)) 1511 FAIL; 1512 code += offset; 1513 break; 1514 1515 case SRE_OP_BIGCHARSET: 1516 GET_ARG; /* Number of blocks */ 1517 offset = 256/sizeof(SRE_CODE); /* 256-byte table */ 1518 if (offset > (uintptr_t)(end - code)) 1519 
FAIL; 1520 /* Make sure that each byte points to a valid block */ 1521 for (i = 0; i < 256; i++) { 1522 if (((unsigned char *)code)[i] >= arg) 1523 FAIL; 1524 } 1525 code += offset; 1526 offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */ 1527 if (offset > (uintptr_t)(end - code)) 1528 FAIL; 1529 code += offset; 1530 break; 1531 1532 case SRE_OP_CATEGORY: 1533 GET_ARG; 1534 switch (arg) { 1535 case SRE_CATEGORY_DIGIT: 1536 case SRE_CATEGORY_NOT_DIGIT: 1537 case SRE_CATEGORY_SPACE: 1538 case SRE_CATEGORY_NOT_SPACE: 1539 case SRE_CATEGORY_WORD: 1540 case SRE_CATEGORY_NOT_WORD: 1541 case SRE_CATEGORY_LINEBREAK: 1542 case SRE_CATEGORY_NOT_LINEBREAK: 1543 case SRE_CATEGORY_LOC_WORD: 1544 case SRE_CATEGORY_LOC_NOT_WORD: 1545 case SRE_CATEGORY_UNI_DIGIT: 1546 case SRE_CATEGORY_UNI_NOT_DIGIT: 1547 case SRE_CATEGORY_UNI_SPACE: 1548 case SRE_CATEGORY_UNI_NOT_SPACE: 1549 case SRE_CATEGORY_UNI_WORD: 1550 case SRE_CATEGORY_UNI_NOT_WORD: 1551 case SRE_CATEGORY_UNI_LINEBREAK: 1552 case SRE_CATEGORY_UNI_NOT_LINEBREAK: 1553 break; 1554 default: 1555 FAIL; 1556 } 1557 break; 1558 1559 default: 1560 FAIL; 1561 1562 } 1563 } 1564 1565 return 1; 1566 } 1567 1568 static int 1569 _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) 1570 { 1571 /* Some variables are manipulated by the macros above */ 1572 SRE_CODE op; 1573 SRE_CODE arg; 1574 SRE_CODE skip; 1575 1576 VTRACE(("code=%p, end=%p\n", code, end)); 1577 1578 if (code > end) 1579 FAIL; 1580 1581 while (code < end) { 1582 GET_OP; 1583 switch (op) { 1584 1585 case SRE_OP_MARK: 1586 /* We don't check whether marks are properly nested; the 1587 sre_match() code is robust even if they don't, and the worst 1588 you can get is nonsensical match results. 
*/ 1589 GET_ARG; 1590 if (arg > 2 * (size_t)groups + 1) { 1591 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups)); 1592 FAIL; 1593 } 1594 break; 1595 1596 case SRE_OP_LITERAL: 1597 case SRE_OP_NOT_LITERAL: 1598 case SRE_OP_LITERAL_IGNORE: 1599 case SRE_OP_NOT_LITERAL_IGNORE: 1600 case SRE_OP_LITERAL_UNI_IGNORE: 1601 case SRE_OP_NOT_LITERAL_UNI_IGNORE: 1602 case SRE_OP_LITERAL_LOC_IGNORE: 1603 case SRE_OP_NOT_LITERAL_LOC_IGNORE: 1604 GET_ARG; 1605 /* The arg is just a character, nothing to check */ 1606 break; 1607 1608 case SRE_OP_SUCCESS: 1609 case SRE_OP_FAILURE: 1610 /* Nothing to check; these normally end the matching process */ 1611 break; 1612 1613 case SRE_OP_AT: 1614 GET_ARG; 1615 switch (arg) { 1616 case SRE_AT_BEGINNING: 1617 case SRE_AT_BEGINNING_STRING: 1618 case SRE_AT_BEGINNING_LINE: 1619 case SRE_AT_END: 1620 case SRE_AT_END_LINE: 1621 case SRE_AT_END_STRING: 1622 case SRE_AT_BOUNDARY: 1623 case SRE_AT_NON_BOUNDARY: 1624 case SRE_AT_LOC_BOUNDARY: 1625 case SRE_AT_LOC_NON_BOUNDARY: 1626 case SRE_AT_UNI_BOUNDARY: 1627 case SRE_AT_UNI_NON_BOUNDARY: 1628 break; 1629 default: 1630 FAIL; 1631 } 1632 break; 1633 1634 case SRE_OP_ANY: 1635 case SRE_OP_ANY_ALL: 1636 /* These have no operands */ 1637 break; 1638 1639 case SRE_OP_IN: 1640 case SRE_OP_IN_IGNORE: 1641 case SRE_OP_IN_UNI_IGNORE: 1642 case SRE_OP_IN_LOC_IGNORE: 1643 GET_SKIP; 1644 /* Stop 1 before the end; we check the FAILURE below */ 1645 if (!_validate_charset(code, code+skip-2)) 1646 FAIL; 1647 if (code[skip-2] != SRE_OP_FAILURE) 1648 FAIL; 1649 code += skip-1; 1650 break; 1651 1652 case SRE_OP_INFO: 1653 { 1654 /* A minimal info field is 1655 <INFO> <1=skip> <2=flags> <3=min> <4=max>; 1656 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags, 1657 more follows. 
*/ 1658 SRE_CODE flags, i; 1659 SRE_CODE *newcode; 1660 GET_SKIP; 1661 newcode = code+skip-1; 1662 GET_ARG; flags = arg; 1663 GET_ARG; 1664 GET_ARG; 1665 /* Check that only valid flags are present */ 1666 if ((flags & ~(SRE_INFO_PREFIX | 1667 SRE_INFO_LITERAL | 1668 SRE_INFO_CHARSET)) != 0) 1669 FAIL; 1670 /* PREFIX and CHARSET are mutually exclusive */ 1671 if ((flags & SRE_INFO_PREFIX) && 1672 (flags & SRE_INFO_CHARSET)) 1673 FAIL; 1674 /* LITERAL implies PREFIX */ 1675 if ((flags & SRE_INFO_LITERAL) && 1676 !(flags & SRE_INFO_PREFIX)) 1677 FAIL; 1678 /* Validate the prefix */ 1679 if (flags & SRE_INFO_PREFIX) { 1680 SRE_CODE prefix_len; 1681 GET_ARG; prefix_len = arg; 1682 GET_ARG; 1683 /* Here comes the prefix string */ 1684 if (prefix_len > (uintptr_t)(newcode - code)) 1685 FAIL; 1686 code += prefix_len; 1687 /* And here comes the overlap table */ 1688 if (prefix_len > (uintptr_t)(newcode - code)) 1689 FAIL; 1690 /* Each overlap value should be < prefix_len */ 1691 for (i = 0; i < prefix_len; i++) { 1692 if (code[i] >= prefix_len) 1693 FAIL; 1694 } 1695 code += prefix_len; 1696 } 1697 /* Validate the charset */ 1698 if (flags & SRE_INFO_CHARSET) { 1699 if (!_validate_charset(code, newcode-1)) 1700 FAIL; 1701 if (newcode[-1] != SRE_OP_FAILURE) 1702 FAIL; 1703 code = newcode; 1704 } 1705 else if (code != newcode) { 1706 VTRACE(("code=%p, newcode=%p\n", code, newcode)); 1707 FAIL; 1708 } 1709 } 1710 break; 1711 1712 case SRE_OP_BRANCH: 1713 { 1714 SRE_CODE *target = NULL; 1715 for (;;) { 1716 GET_SKIP; 1717 if (skip == 0) 1718 break; 1719 /* Stop 2 before the end; we check the JUMP below */ 1720 if (!_validate_inner(code, code+skip-3, groups)) 1721 FAIL; 1722 code += skip-3; 1723 /* Check that it ends with a JUMP, and that each JUMP 1724 has the same target */ 1725 GET_OP; 1726 if (op != SRE_OP_JUMP) 1727 FAIL; 1728 GET_SKIP; 1729 if (target == NULL) 1730 target = code+skip-1; 1731 else if (code+skip-1 != target) 1732 FAIL; 1733 } 1734 } 1735 break; 1736 1737 
case SRE_OP_REPEAT_ONE: 1738 case SRE_OP_MIN_REPEAT_ONE: 1739 { 1740 SRE_CODE min, max; 1741 GET_SKIP; 1742 GET_ARG; min = arg; 1743 GET_ARG; max = arg; 1744 if (min > max) 1745 FAIL; 1746 if (max > SRE_MAXREPEAT) 1747 FAIL; 1748 if (!_validate_inner(code, code+skip-4, groups)) 1749 FAIL; 1750 code += skip-4; 1751 GET_OP; 1752 if (op != SRE_OP_SUCCESS) 1753 FAIL; 1754 } 1755 break; 1756 1757 case SRE_OP_REPEAT: 1758 { 1759 SRE_CODE min, max; 1760 GET_SKIP; 1761 GET_ARG; min = arg; 1762 GET_ARG; max = arg; 1763 if (min > max) 1764 FAIL; 1765 if (max > SRE_MAXREPEAT) 1766 FAIL; 1767 if (!_validate_inner(code, code+skip-3, groups)) 1768 FAIL; 1769 code += skip-3; 1770 GET_OP; 1771 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL) 1772 FAIL; 1773 } 1774 break; 1775 1776 case SRE_OP_GROUPREF: 1777 case SRE_OP_GROUPREF_IGNORE: 1778 case SRE_OP_GROUPREF_UNI_IGNORE: 1779 case SRE_OP_GROUPREF_LOC_IGNORE: 1780 GET_ARG; 1781 if (arg >= (size_t)groups) 1782 FAIL; 1783 break; 1784 1785 case SRE_OP_GROUPREF_EXISTS: 1786 /* The regex syntax for this is: '(?(group)then|else)', where 1787 'group' is either an integer group number or a group name, 1788 'then' and 'else' are sub-regexes, and 'else' is optional. */ 1789 GET_ARG; 1790 if (arg >= (size_t)groups) 1791 FAIL; 1792 GET_SKIP_ADJ(1); 1793 code--; /* The skip is relative to the first arg! */ 1794 /* There are two possibilities here: if there is both a 'then' 1795 part and an 'else' part, the generated code looks like: 1796 1797 GROUPREF_EXISTS 1798 <group> 1799 <skipyes> 1800 ...then part... 1801 JUMP 1802 <skipno> 1803 (<skipyes> jumps here) 1804 ...else part... 1805 (<skipno> jumps here) 1806 1807 If there is only a 'then' part, it looks like: 1808 1809 GROUPREF_EXISTS 1810 <group> 1811 <skip> 1812 ...then part... 
1813 (<skip> jumps here) 1814 1815 There is no direct way to decide which it is, and we don't want 1816 to allow arbitrary jumps anywhere in the code; so we just look 1817 for a JUMP opcode preceding our skip target. 1818 */ 1819 if (skip >= 3 && skip-3 < (uintptr_t)(end - code) && 1820 code[skip-3] == SRE_OP_JUMP) 1821 { 1822 VTRACE(("both then and else parts present\n")); 1823 if (!_validate_inner(code+1, code+skip-3, groups)) 1824 FAIL; 1825 code += skip-2; /* Position after JUMP, at <skipno> */ 1826 GET_SKIP; 1827 if (!_validate_inner(code, code+skip-1, groups)) 1828 FAIL; 1829 code += skip-1; 1830 } 1831 else { 1832 VTRACE(("only a then part present\n")); 1833 if (!_validate_inner(code+1, code+skip-1, groups)) 1834 FAIL; 1835 code += skip-1; 1836 } 1837 break; 1838 1839 case SRE_OP_ASSERT: 1840 case SRE_OP_ASSERT_NOT: 1841 GET_SKIP; 1842 GET_ARG; /* 0 for lookahead, width for lookbehind */ 1843 code--; /* Back up over arg to simplify math below */ 1844 if (arg & 0x80000000) 1845 FAIL; /* Width too large */ 1846 /* Stop 1 before the end; we check the SUCCESS below */ 1847 if (!_validate_inner(code+1, code+skip-2, groups)) 1848 FAIL; 1849 code += skip-2; 1850 GET_OP; 1851 if (op != SRE_OP_SUCCESS) 1852 FAIL; 1853 break; 1854 1855 default: 1856 FAIL; 1857 1858 } 1859 } 1860 1861 VTRACE(("okay\n")); 1862 return 1; 1863 } 1864 1865 static int 1866 _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) 1867 { 1868 if (groups < 0 || (size_t)groups > SRE_MAXGROUPS || 1869 code >= end || end[-1] != SRE_OP_SUCCESS) 1870 FAIL; 1871 return _validate_inner(code, end-1, groups); 1872 } 1873 1874 static int 1875 _validate(PatternObject *self) 1876 { 1877 if (!_validate_outer(self->code, self->code+self->codesize, self->groups)) 1878 { 1879 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code"); 1880 return 0; 1881 } 1882 else 1883 VTRACE(("Success!\n")); 1884 return 1; 1885 } 1886 1887 /* -------------------------------------------------------------------- */ 
1888 /* match methods */ 1889 1890 static void 1891 match_dealloc(MatchObject* self) 1892 { 1893 Py_XDECREF(self->regs); 1894 Py_XDECREF(self->string); 1895 Py_DECREF(self->pattern); 1896 PyObject_DEL(self); 1897 } 1898 1899 static PyObject* 1900 match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def) 1901 { 1902 Py_ssize_t length; 1903 int isbytes, charsize; 1904 Py_buffer view; 1905 PyObject *result; 1906 void* ptr; 1907 Py_ssize_t i, j; 1908 1909 if (index < 0 || index >= self->groups) { 1910 /* raise IndexError if we were given a bad group number */ 1911 PyErr_SetString( 1912 PyExc_IndexError, 1913 "no such group" 1914 ); 1915 return NULL; 1916 } 1917 1918 index *= 2; 1919 1920 if (self->string == Py_None || self->mark[index] < 0) { 1921 /* return default value if the string or group is undefined */ 1922 Py_INCREF(def); 1923 return def; 1924 } 1925 1926 ptr = getstring(self->string, &length, &isbytes, &charsize, &view); 1927 if (ptr == NULL) 1928 return NULL; 1929 1930 i = self->mark[index]; 1931 j = self->mark[index+1]; 1932 i = Py_MIN(i, length); 1933 j = Py_MIN(j, length); 1934 result = getslice(isbytes, ptr, self->string, i, j); 1935 if (isbytes && view.buf != NULL) 1936 PyBuffer_Release(&view); 1937 return result; 1938 } 1939 1940 static Py_ssize_t 1941 match_getindex(MatchObject* self, PyObject* index) 1942 { 1943 Py_ssize_t i; 1944 1945 if (index == NULL) 1946 /* Default value */ 1947 return 0; 1948 1949 if (PyIndex_Check(index)) { 1950 return PyNumber_AsSsize_t(index, NULL); 1951 } 1952 1953 i = -1; 1954 1955 if (self->pattern->groupindex) { 1956 index = PyDict_GetItem(self->pattern->groupindex, index); 1957 if (index && PyLong_Check(index)) { 1958 i = PyLong_AsSsize_t(index); 1959 } 1960 } 1961 1962 return i; 1963 } 1964 1965 static PyObject* 1966 match_getslice(MatchObject* self, PyObject* index, PyObject* def) 1967 { 1968 return match_getslice_by_index(self, match_getindex(self, index), def); 1969 } 1970 1971 /*[clinic input] 1972 
_sre.SRE_Match.expand 1973 1974 template: object 1975 1976 Return the string obtained by doing backslash substitution on the string template, as done by the sub() method. 1977 [clinic start generated code]*/ 1978 1979 static PyObject * 1980 _sre_SRE_Match_expand_impl(MatchObject *self, PyObject *template) 1981 /*[clinic end generated code: output=931b58ccc323c3a1 input=4bfdb22c2f8b146a]*/ 1982 { 1983 /* delegate to Python code */ 1984 return call( 1985 SRE_PY_MODULE, "_expand", 1986 PyTuple_Pack(3, self->pattern, self, template) 1987 ); 1988 } 1989 1990 static PyObject* 1991 match_group(MatchObject* self, PyObject* args) 1992 { 1993 PyObject* result; 1994 Py_ssize_t i, size; 1995 1996 size = PyTuple_GET_SIZE(args); 1997 1998 switch (size) { 1999 case 0: 2000 result = match_getslice(self, _PyLong_Zero, Py_None); 2001 break; 2002 case 1: 2003 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None); 2004 break; 2005 default: 2006 /* fetch multiple items */ 2007 result = PyTuple_New(size); 2008 if (!result) 2009 return NULL; 2010 for (i = 0; i < size; i++) { 2011 PyObject* item = match_getslice( 2012 self, PyTuple_GET_ITEM(args, i), Py_None 2013 ); 2014 if (!item) { 2015 Py_DECREF(result); 2016 return NULL; 2017 } 2018 PyTuple_SET_ITEM(result, i, item); 2019 } 2020 break; 2021 } 2022 return result; 2023 } 2024 2025 static PyObject* 2026 match_getitem(MatchObject* self, PyObject* name) 2027 { 2028 return match_getslice(self, name, Py_None); 2029 } 2030 2031 /*[clinic input] 2032 _sre.SRE_Match.groups 2033 2034 default: object = None 2035 Is used for groups that did not participate in the match. 2036 2037 Return a tuple containing all the subgroups of the match, from 1. 
2038 [clinic start generated code]*/ 2039 2040 static PyObject * 2041 _sre_SRE_Match_groups_impl(MatchObject *self, PyObject *default_value) 2042 /*[clinic end generated code: output=daf8e2641537238a input=bb069ef55dabca91]*/ 2043 { 2044 PyObject* result; 2045 Py_ssize_t index; 2046 2047 result = PyTuple_New(self->groups-1); 2048 if (!result) 2049 return NULL; 2050 2051 for (index = 1; index < self->groups; index++) { 2052 PyObject* item; 2053 item = match_getslice_by_index(self, index, default_value); 2054 if (!item) { 2055 Py_DECREF(result); 2056 return NULL; 2057 } 2058 PyTuple_SET_ITEM(result, index-1, item); 2059 } 2060 2061 return result; 2062 } 2063 2064 /*[clinic input] 2065 _sre.SRE_Match.groupdict 2066 2067 default: object = None 2068 Is used for groups that did not participate in the match. 2069 2070 Return a dictionary containing all the named subgroups of the match, keyed by the subgroup name. 2071 [clinic start generated code]*/ 2072 2073 static PyObject * 2074 _sre_SRE_Match_groupdict_impl(MatchObject *self, PyObject *default_value) 2075 /*[clinic end generated code: output=29917c9073e41757 input=0ded7960b23780aa]*/ 2076 { 2077 PyObject *result; 2078 PyObject *key; 2079 PyObject *value; 2080 Py_ssize_t pos = 0; 2081 Py_hash_t hash; 2082 2083 result = PyDict_New(); 2084 if (!result || !self->pattern->groupindex) 2085 return result; 2086 2087 while (_PyDict_Next(self->pattern->groupindex, &pos, &key, &value, &hash)) { 2088 int status; 2089 Py_INCREF(key); 2090 value = match_getslice(self, key, default_value); 2091 if (!value) { 2092 Py_DECREF(key); 2093 goto failed; 2094 } 2095 status = _PyDict_SetItem_KnownHash(result, key, value, hash); 2096 Py_DECREF(value); 2097 Py_DECREF(key); 2098 if (status < 0) 2099 goto failed; 2100 } 2101 2102 return result; 2103 2104 failed: 2105 Py_DECREF(result); 2106 return NULL; 2107 } 2108 2109 /*[clinic input] 2110 _sre.SRE_Match.start -> Py_ssize_t 2111 2112 group: object(c_default="NULL") = 0 2113 / 2114 2115 Return 
index of the start of the substring matched by group. 2116 [clinic start generated code]*/ 2117 2118 static Py_ssize_t 2119 _sre_SRE_Match_start_impl(MatchObject *self, PyObject *group) 2120 /*[clinic end generated code: output=3f6e7f9df2fb5201 input=ced8e4ed4b33ee6c]*/ 2121 { 2122 Py_ssize_t index = match_getindex(self, group); 2123 2124 if (index < 0 || index >= self->groups) { 2125 PyErr_SetString( 2126 PyExc_IndexError, 2127 "no such group" 2128 ); 2129 return -1; 2130 } 2131 2132 /* mark is -1 if group is undefined */ 2133 return self->mark[index*2]; 2134 } 2135 2136 /*[clinic input] 2137 _sre.SRE_Match.end -> Py_ssize_t 2138 2139 group: object(c_default="NULL") = 0 2140 / 2141 2142 Return index of the end of the substring matched by group. 2143 [clinic start generated code]*/ 2144 2145 static Py_ssize_t 2146 _sre_SRE_Match_end_impl(MatchObject *self, PyObject *group) 2147 /*[clinic end generated code: output=f4240b09911f7692 input=1b799560c7f3d7e6]*/ 2148 { 2149 Py_ssize_t index = match_getindex(self, group); 2150 2151 if (index < 0 || index >= self->groups) { 2152 PyErr_SetString( 2153 PyExc_IndexError, 2154 "no such group" 2155 ); 2156 return -1; 2157 } 2158 2159 /* mark is -1 if group is undefined */ 2160 return self->mark[index*2+1]; 2161 } 2162 2163 LOCAL(PyObject*) 2164 _pair(Py_ssize_t i1, Py_ssize_t i2) 2165 { 2166 PyObject* pair; 2167 PyObject* item; 2168 2169 pair = PyTuple_New(2); 2170 if (!pair) 2171 return NULL; 2172 2173 item = PyLong_FromSsize_t(i1); 2174 if (!item) 2175 goto error; 2176 PyTuple_SET_ITEM(pair, 0, item); 2177 2178 item = PyLong_FromSsize_t(i2); 2179 if (!item) 2180 goto error; 2181 PyTuple_SET_ITEM(pair, 1, item); 2182 2183 return pair; 2184 2185 error: 2186 Py_DECREF(pair); 2187 return NULL; 2188 } 2189 2190 /*[clinic input] 2191 _sre.SRE_Match.span 2192 2193 group: object(c_default="NULL") = 0 2194 / 2195 2196 For match object m, return the 2-tuple (m.start(group), m.end(group)). 
2197 [clinic start generated code]*/ 2198 2199 static PyObject * 2200 _sre_SRE_Match_span_impl(MatchObject *self, PyObject *group) 2201 /*[clinic end generated code: output=f02ae40594d14fe6 input=8fa6014e982d71d4]*/ 2202 { 2203 Py_ssize_t index = match_getindex(self, group); 2204 2205 if (index < 0 || index >= self->groups) { 2206 PyErr_SetString( 2207 PyExc_IndexError, 2208 "no such group" 2209 ); 2210 return NULL; 2211 } 2212 2213 /* marks are -1 if group is undefined */ 2214 return _pair(self->mark[index*2], self->mark[index*2+1]); 2215 } 2216 2217 static PyObject* 2218 match_regs(MatchObject* self) 2219 { 2220 PyObject* regs; 2221 PyObject* item; 2222 Py_ssize_t index; 2223 2224 regs = PyTuple_New(self->groups); 2225 if (!regs) 2226 return NULL; 2227 2228 for (index = 0; index < self->groups; index++) { 2229 item = _pair(self->mark[index*2], self->mark[index*2+1]); 2230 if (!item) { 2231 Py_DECREF(regs); 2232 return NULL; 2233 } 2234 PyTuple_SET_ITEM(regs, index, item); 2235 } 2236 2237 Py_INCREF(regs); 2238 self->regs = regs; 2239 2240 return regs; 2241 } 2242 2243 /*[clinic input] 2244 _sre.SRE_Match.__copy__ 2245 2246 [clinic start generated code]*/ 2247 2248 static PyObject * 2249 _sre_SRE_Match___copy___impl(MatchObject *self) 2250 /*[clinic end generated code: output=a779c5fc8b5b4eb4 input=3bb4d30b6baddb5b]*/ 2251 { 2252 Py_INCREF(self); 2253 return (PyObject *)self; 2254 } 2255 2256 /*[clinic input] 2257 _sre.SRE_Match.__deepcopy__ 2258 2259 memo: object 2260 / 2261 2262 [clinic start generated code]*/ 2263 2264 static PyObject * 2265 _sre_SRE_Match___deepcopy__(MatchObject *self, PyObject *memo) 2266 /*[clinic end generated code: output=ba7cb46d655e4ee2 input=779d12a31c2c325e]*/ 2267 { 2268 Py_INCREF(self); 2269 return (PyObject *)self; 2270 } 2271 2272 PyDoc_STRVAR(match_doc, 2273 "The result of re.match() and re.search().\n\ 2274 Match objects always have a boolean value of True."); 2275 2276 PyDoc_STRVAR(match_group_doc, 2277 "group([group1, ...]) -> 
str or tuple.\n\ 2278 Return subgroup(s) of the match by indices or names.\n\ 2279 For 0 returns the entire match."); 2280 2281 static PyObject * 2282 match_lastindex_get(MatchObject *self, void *Py_UNUSED(ignored)) 2283 { 2284 if (self->lastindex >= 0) 2285 return PyLong_FromSsize_t(self->lastindex); 2286 Py_RETURN_NONE; 2287 } 2288 2289 static PyObject * 2290 match_lastgroup_get(MatchObject *self, void *Py_UNUSED(ignored)) 2291 { 2292 if (self->pattern->indexgroup && 2293 self->lastindex >= 0 && 2294 self->lastindex < PyTuple_GET_SIZE(self->pattern->indexgroup)) 2295 { 2296 PyObject *result = PyTuple_GET_ITEM(self->pattern->indexgroup, 2297 self->lastindex); 2298 Py_INCREF(result); 2299 return result; 2300 } 2301 Py_RETURN_NONE; 2302 } 2303 2304 static PyObject * 2305 match_regs_get(MatchObject *self, void *Py_UNUSED(ignored)) 2306 { 2307 if (self->regs) { 2308 Py_INCREF(self->regs); 2309 return self->regs; 2310 } else 2311 return match_regs(self); 2312 } 2313 2314 static PyObject * 2315 match_repr(MatchObject *self) 2316 { 2317 PyObject *result; 2318 PyObject *group0 = match_getslice_by_index(self, 0, Py_None); 2319 if (group0 == NULL) 2320 return NULL; 2321 result = PyUnicode_FromFormat( 2322 "<%s object; span=(%zd, %zd), match=%.50R>", 2323 Py_TYPE(self)->tp_name, 2324 self->mark[0], self->mark[1], group0); 2325 Py_DECREF(group0); 2326 return result; 2327 } 2328 2329 2330 static PyObject* 2331 pattern_new_match(PatternObject* pattern, SRE_STATE* state, Py_ssize_t status) 2332 { 2333 /* create match object (from state object) */ 2334 2335 MatchObject* match; 2336 Py_ssize_t i, j; 2337 char* base; 2338 int n; 2339 2340 if (status > 0) { 2341 2342 /* create match object (with room for extra group marks) */ 2343 /* coverity[ampersand_in_size] */ 2344 match = PyObject_NEW_VAR(MatchObject, &Match_Type, 2345 2*(pattern->groups+1)); 2346 if (!match) 2347 return NULL; 2348 2349 Py_INCREF(pattern); 2350 match->pattern = pattern; 2351 2352 Py_INCREF(state->string); 2353 
match->string = state->string; 2354 2355 match->regs = NULL; 2356 match->groups = pattern->groups+1; 2357 2358 /* fill in group slices */ 2359 2360 base = (char*) state->beginning; 2361 n = state->charsize; 2362 2363 match->mark[0] = ((char*) state->start - base) / n; 2364 match->mark[1] = ((char*) state->ptr - base) / n; 2365 2366 for (i = j = 0; i < pattern->groups; i++, j+=2) 2367 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) { 2368 match->mark[j+2] = ((char*) state->mark[j] - base) / n; 2369 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n; 2370 } else 2371 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */ 2372 2373 match->pos = state->pos; 2374 match->endpos = state->endpos; 2375 2376 match->lastindex = state->lastindex; 2377 2378 return (PyObject*) match; 2379 2380 } else if (status == 0) { 2381 2382 /* no match */ 2383 Py_RETURN_NONE; 2384 2385 } 2386 2387 /* internal error */ 2388 pattern_error(status); 2389 return NULL; 2390 } 2391 2392 2393 /* -------------------------------------------------------------------- */ 2394 /* scanner methods (experimental) */ 2395 2396 static void 2397 scanner_dealloc(ScannerObject* self) 2398 { 2399 state_fini(&self->state); 2400 Py_XDECREF(self->pattern); 2401 PyObject_DEL(self); 2402 } 2403 2404 /*[clinic input] 2405 _sre.SRE_Scanner.match 2406 2407 [clinic start generated code]*/ 2408 2409 static PyObject * 2410 _sre_SRE_Scanner_match_impl(ScannerObject *self) 2411 /*[clinic end generated code: output=936b30c63d4b81eb input=881a0154f8c13d9a]*/ 2412 { 2413 SRE_STATE* state = &self->state; 2414 PyObject* match; 2415 Py_ssize_t status; 2416 2417 if (state->start == NULL) 2418 Py_RETURN_NONE; 2419 2420 state_reset(state); 2421 2422 state->ptr = state->start; 2423 2424 status = sre_match(state, PatternObject_GetCode(self->pattern)); 2425 if (PyErr_Occurred()) 2426 return NULL; 2427 2428 match = pattern_new_match((PatternObject*) self->pattern, 2429 state, status); 2430 2431 if (status == 0) 
2432 state->start = NULL; 2433 else { 2434 state->must_advance = (state->ptr == state->start); 2435 state->start = state->ptr; 2436 } 2437 2438 return match; 2439 } 2440 2441 2442 /*[clinic input] 2443 _sre.SRE_Scanner.search 2444 2445 [clinic start generated code]*/ 2446 2447 static PyObject * 2448 _sre_SRE_Scanner_search_impl(ScannerObject *self) 2449 /*[clinic end generated code: output=7dc211986088f025 input=161223ee92ef9270]*/ 2450 { 2451 SRE_STATE* state = &self->state; 2452 PyObject* match; 2453 Py_ssize_t status; 2454 2455 if (state->start == NULL) 2456 Py_RETURN_NONE; 2457 2458 state_reset(state); 2459 2460 state->ptr = state->start; 2461 2462 status = sre_search(state, PatternObject_GetCode(self->pattern)); 2463 if (PyErr_Occurred()) 2464 return NULL; 2465 2466 match = pattern_new_match((PatternObject*) self->pattern, 2467 state, status); 2468 2469 if (status == 0) 2470 state->start = NULL; 2471 else { 2472 state->must_advance = (state->ptr == state->start); 2473 state->start = state->ptr; 2474 } 2475 2476 return match; 2477 } 2478 2479 static PyObject * 2480 pattern_scanner(PatternObject *self, PyObject *string, Py_ssize_t pos, Py_ssize_t endpos) 2481 { 2482 ScannerObject* scanner; 2483 2484 /* create scanner object */ 2485 scanner = PyObject_NEW(ScannerObject, &Scanner_Type); 2486 if (!scanner) 2487 return NULL; 2488 scanner->pattern = NULL; 2489 2490 /* create search state object */ 2491 if (!state_init(&scanner->state, self, string, pos, endpos)) { 2492 Py_DECREF(scanner); 2493 return NULL; 2494 } 2495 2496 Py_INCREF(self); 2497 scanner->pattern = (PyObject*) self; 2498 2499 return (PyObject*) scanner; 2500 } 2501 2502 static Py_hash_t 2503 pattern_hash(PatternObject *self) 2504 { 2505 Py_hash_t hash, hash2; 2506 2507 hash = PyObject_Hash(self->pattern); 2508 if (hash == -1) { 2509 return -1; 2510 } 2511 2512 hash2 = _Py_HashBytes(self->code, sizeof(self->code[0]) * self->codesize); 2513 hash ^= hash2; 2514 2515 hash ^= self->flags; 2516 hash ^= 
self->isbytes; 2517 hash ^= self->codesize; 2518 2519 if (hash == -1) { 2520 hash = -2; 2521 } 2522 return hash; 2523 } 2524 2525 static PyObject* 2526 pattern_richcompare(PyObject *lefto, PyObject *righto, int op) 2527 { 2528 PatternObject *left, *right; 2529 int cmp; 2530 2531 if (op != Py_EQ && op != Py_NE) { 2532 Py_RETURN_NOTIMPLEMENTED; 2533 } 2534 2535 if (Py_TYPE(lefto) != &Pattern_Type || Py_TYPE(righto) != &Pattern_Type) { 2536 Py_RETURN_NOTIMPLEMENTED; 2537 } 2538 2539 if (lefto == righto) { 2540 /* a pattern is equal to itself */ 2541 return PyBool_FromLong(op == Py_EQ); 2542 } 2543 2544 left = (PatternObject *)lefto; 2545 right = (PatternObject *)righto; 2546 2547 cmp = (left->flags == right->flags 2548 && left->isbytes == right->isbytes 2549 && left->codesize == right->codesize); 2550 if (cmp) { 2551 /* Compare the code and the pattern because the same pattern can 2552 produce different codes depending on the locale used to compile the 2553 pattern when the re.LOCALE flag is used. Don't compare groups, 2554 indexgroup nor groupindex: they are derivated from the pattern. 
*/ 2555 cmp = (memcmp(left->code, right->code, 2556 sizeof(left->code[0]) * left->codesize) == 0); 2557 } 2558 if (cmp) { 2559 cmp = PyObject_RichCompareBool(left->pattern, right->pattern, 2560 Py_EQ); 2561 if (cmp < 0) { 2562 return NULL; 2563 } 2564 } 2565 if (op == Py_NE) { 2566 cmp = !cmp; 2567 } 2568 return PyBool_FromLong(cmp); 2569 } 2570 2571 #include "clinic/_sre.c.h" 2572 2573 static PyMethodDef pattern_methods[] = { 2574 _SRE_SRE_PATTERN_MATCH_METHODDEF 2575 _SRE_SRE_PATTERN_FULLMATCH_METHODDEF 2576 _SRE_SRE_PATTERN_SEARCH_METHODDEF 2577 _SRE_SRE_PATTERN_SUB_METHODDEF 2578 _SRE_SRE_PATTERN_SUBN_METHODDEF 2579 _SRE_SRE_PATTERN_FINDALL_METHODDEF 2580 _SRE_SRE_PATTERN_SPLIT_METHODDEF 2581 _SRE_SRE_PATTERN_FINDITER_METHODDEF 2582 _SRE_SRE_PATTERN_SCANNER_METHODDEF 2583 _SRE_SRE_PATTERN___COPY___METHODDEF 2584 _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF 2585 {NULL, NULL} 2586 }; 2587 2588 static PyGetSetDef pattern_getset[] = { 2589 {"groupindex", (getter)pattern_groupindex, (setter)NULL, 2590 "A dictionary mapping group names to group numbers."}, 2591 {NULL} /* Sentinel */ 2592 }; 2593 2594 #define PAT_OFF(x) offsetof(PatternObject, x) 2595 static PyMemberDef pattern_members[] = { 2596 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY, 2597 "The pattern string from which the RE object was compiled."}, 2598 {"flags", T_INT, PAT_OFF(flags), READONLY, 2599 "The regex matching flags."}, 2600 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY, 2601 "The number of capturing groups in the pattern."}, 2602 {NULL} /* Sentinel */ 2603 }; 2604 2605 static PyTypeObject Pattern_Type = { 2606 PyVarObject_HEAD_INIT(NULL, 0) 2607 "re.Pattern", 2608 sizeof(PatternObject), sizeof(SRE_CODE), 2609 (destructor)pattern_dealloc, /* tp_dealloc */ 2610 0, /* tp_print */ 2611 0, /* tp_getattr */ 2612 0, /* tp_setattr */ 2613 0, /* tp_reserved */ 2614 (reprfunc)pattern_repr, /* tp_repr */ 2615 0, /* tp_as_number */ 2616 0, /* tp_as_sequence */ 2617 0, /* tp_as_mapping */ 2618 
(hashfunc)pattern_hash, /* tp_hash */ 2619 0, /* tp_call */ 2620 0, /* tp_str */ 2621 0, /* tp_getattro */ 2622 0, /* tp_setattro */ 2623 0, /* tp_as_buffer */ 2624 Py_TPFLAGS_DEFAULT, /* tp_flags */ 2625 pattern_doc, /* tp_doc */ 2626 0, /* tp_traverse */ 2627 0, /* tp_clear */ 2628 pattern_richcompare, /* tp_richcompare */ 2629 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */ 2630 0, /* tp_iter */ 2631 0, /* tp_iternext */ 2632 pattern_methods, /* tp_methods */ 2633 pattern_members, /* tp_members */ 2634 pattern_getset, /* tp_getset */ 2635 }; 2636 2637 /* Match objects do not support length or assignment, but do support 2638 __getitem__. */ 2639 static PyMappingMethods match_as_mapping = { 2640 NULL, 2641 (binaryfunc)match_getitem, 2642 NULL 2643 }; 2644 2645 static PyMethodDef match_methods[] = { 2646 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc}, 2647 _SRE_SRE_MATCH_START_METHODDEF 2648 _SRE_SRE_MATCH_END_METHODDEF 2649 _SRE_SRE_MATCH_SPAN_METHODDEF 2650 _SRE_SRE_MATCH_GROUPS_METHODDEF 2651 _SRE_SRE_MATCH_GROUPDICT_METHODDEF 2652 _SRE_SRE_MATCH_EXPAND_METHODDEF 2653 _SRE_SRE_MATCH___COPY___METHODDEF 2654 _SRE_SRE_MATCH___DEEPCOPY___METHODDEF 2655 {NULL, NULL} 2656 }; 2657 2658 static PyGetSetDef match_getset[] = { 2659 {"lastindex", (getter)match_lastindex_get, (setter)NULL, 2660 "The integer index of the last matched capturing group."}, 2661 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL, 2662 "The name of the last matched capturing group."}, 2663 {"regs", (getter)match_regs_get, (setter)NULL}, 2664 {NULL} 2665 }; 2666 2667 #define MATCH_OFF(x) offsetof(MatchObject, x) 2668 static PyMemberDef match_members[] = { 2669 {"string", T_OBJECT, MATCH_OFF(string), READONLY, 2670 "The string passed to match() or search()."}, 2671 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY, 2672 "The regular expression object."}, 2673 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY, 2674 "The index into the string at which the RE engine 
started looking for a match."}, 2675 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY, 2676 "The index into the string beyond which the RE engine will not go."}, 2677 {NULL} 2678 }; 2679 2680 /* FIXME: implement setattr("string", None) as a special case (to 2681 detach the associated string, if any */ 2682 2683 static PyTypeObject Match_Type = { 2684 PyVarObject_HEAD_INIT(NULL,0) 2685 "re.Match", 2686 sizeof(MatchObject), sizeof(Py_ssize_t), 2687 (destructor)match_dealloc, /* tp_dealloc */ 2688 0, /* tp_print */ 2689 0, /* tp_getattr */ 2690 0, /* tp_setattr */ 2691 0, /* tp_reserved */ 2692 (reprfunc)match_repr, /* tp_repr */ 2693 0, /* tp_as_number */ 2694 0, /* tp_as_sequence */ 2695 &match_as_mapping, /* tp_as_mapping */ 2696 0, /* tp_hash */ 2697 0, /* tp_call */ 2698 0, /* tp_str */ 2699 0, /* tp_getattro */ 2700 0, /* tp_setattro */ 2701 0, /* tp_as_buffer */ 2702 Py_TPFLAGS_DEFAULT, /* tp_flags */ 2703 match_doc, /* tp_doc */ 2704 0, /* tp_traverse */ 2705 0, /* tp_clear */ 2706 0, /* tp_richcompare */ 2707 0, /* tp_weaklistoffset */ 2708 0, /* tp_iter */ 2709 0, /* tp_iternext */ 2710 match_methods, /* tp_methods */ 2711 match_members, /* tp_members */ 2712 match_getset, /* tp_getset */ 2713 }; 2714 2715 static PyMethodDef scanner_methods[] = { 2716 _SRE_SRE_SCANNER_MATCH_METHODDEF 2717 _SRE_SRE_SCANNER_SEARCH_METHODDEF 2718 {NULL, NULL} 2719 }; 2720 2721 #define SCAN_OFF(x) offsetof(ScannerObject, x) 2722 static PyMemberDef scanner_members[] = { 2723 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY}, 2724 {NULL} /* Sentinel */ 2725 }; 2726 2727 static PyTypeObject Scanner_Type = { 2728 PyVarObject_HEAD_INIT(NULL, 0) 2729 "_" SRE_MODULE ".SRE_Scanner", 2730 sizeof(ScannerObject), 0, 2731 (destructor)scanner_dealloc,/* tp_dealloc */ 2732 0, /* tp_print */ 2733 0, /* tp_getattr */ 2734 0, /* tp_setattr */ 2735 0, /* tp_reserved */ 2736 0, /* tp_repr */ 2737 0, /* tp_as_number */ 2738 0, /* tp_as_sequence */ 2739 0, /* tp_as_mapping */ 2740 0, /* tp_hash 
*/ 2741 0, /* tp_call */ 2742 0, /* tp_str */ 2743 0, /* tp_getattro */ 2744 0, /* tp_setattro */ 2745 0, /* tp_as_buffer */ 2746 Py_TPFLAGS_DEFAULT, /* tp_flags */ 2747 0, /* tp_doc */ 2748 0, /* tp_traverse */ 2749 0, /* tp_clear */ 2750 0, /* tp_richcompare */ 2751 0, /* tp_weaklistoffset */ 2752 0, /* tp_iter */ 2753 0, /* tp_iternext */ 2754 scanner_methods, /* tp_methods */ 2755 scanner_members, /* tp_members */ 2756 0, /* tp_getset */ 2757 }; 2758 2759 static PyMethodDef _functions[] = { 2760 _SRE_COMPILE_METHODDEF 2761 _SRE_GETCODESIZE_METHODDEF 2762 _SRE_ASCII_ISCASED_METHODDEF 2763 _SRE_UNICODE_ISCASED_METHODDEF 2764 _SRE_ASCII_TOLOWER_METHODDEF 2765 _SRE_UNICODE_TOLOWER_METHODDEF 2766 {NULL, NULL} 2767 }; 2768 2769 static struct PyModuleDef sremodule = { 2770 PyModuleDef_HEAD_INIT, 2771 "_" SRE_MODULE, 2772 NULL, 2773 -1, 2774 _functions, 2775 NULL, 2776 NULL, 2777 NULL, 2778 NULL 2779 }; 2780 2781 PyMODINIT_FUNC PyInit__sre(void) 2782 { 2783 PyObject* m; 2784 PyObject* d; 2785 PyObject* x; 2786 2787 /* Patch object types */ 2788 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) || 2789 PyType_Ready(&Scanner_Type)) 2790 return NULL; 2791 2792 m = PyModule_Create(&sremodule); 2793 if (m == NULL) 2794 return NULL; 2795 d = PyModule_GetDict(m); 2796 2797 x = PyLong_FromLong(SRE_MAGIC); 2798 if (x) { 2799 PyDict_SetItemString(d, "MAGIC", x); 2800 Py_DECREF(x); 2801 } 2802 2803 x = PyLong_FromLong(sizeof(SRE_CODE)); 2804 if (x) { 2805 PyDict_SetItemString(d, "CODESIZE", x); 2806 Py_DECREF(x); 2807 } 2808 2809 x = PyLong_FromUnsignedLong(SRE_MAXREPEAT); 2810 if (x) { 2811 PyDict_SetItemString(d, "MAXREPEAT", x); 2812 Py_DECREF(x); 2813 } 2814 2815 x = PyLong_FromUnsignedLong(SRE_MAXGROUPS); 2816 if (x) { 2817 PyDict_SetItemString(d, "MAXGROUPS", x); 2818 Py_DECREF(x); 2819 } 2820 2821 x = PyUnicode_FromString(copyright); 2822 if (x) { 2823 PyDict_SetItemString(d, "copyright", x); 2824 Py_DECREF(x); 2825 } 2826 return m; 2827 } 2828 2829 /* 
vim:ts=4:sw=4:et 2830 */ 2831