1 /* 2 * Secret Labs' Regular Expression Engine 3 * 4 * regular expression matching engine 5 * 6 * partial history: 7 * 1999-10-24 fl created (based on existing template matcher code) 8 * 2000-03-06 fl first alpha, sort of 9 * 2000-08-01 fl fixes for 1.6b1 10 * 2000-08-07 fl use PyOS_CheckStack() if available 11 * 2000-09-20 fl added expand method 12 * 2001-03-20 fl lots of fixes for 2.1b2 13 * 2001-04-15 fl export copyright as Python attribute, not global 14 * 2001-04-28 fl added __copy__ methods (work in progress) 15 * 2001-05-14 fl fixes for 1.5.2 compatibility 16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis) 17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller) 18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1 19 * 2001-10-21 fl added sub/subn primitive 20 * 2001-10-24 fl added finditer primitive (for 2.2 only) 21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum) 22 * 2002-11-09 fl fixed empty sub/subn return type 23 * 2003-04-18 mvl fully support 4-byte codes 24 * 2003-10-17 gn implemented non recursive scheme 25 * 26 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. 27 * 28 * This version of the SRE library can be redistributed under CNRI's 29 * Python 1.6 license. For any other use, please contact Secret Labs 30 * AB (info (at) pythonware.com). 31 * 32 * Portions of this engine have been developed in cooperation with 33 * CNRI. Hewlett-Packard provided funding for 1.6 integration and 34 * other compatibility work. 35 */ 36 37 #ifndef SRE_RECURSIVE 38 39 static char copyright[] = 40 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB "; 41 42 #define PY_SSIZE_T_CLEAN 43 44 #include "Python.h" 45 #include "structmember.h" /* offsetof */ 46 47 #include "sre.h" 48 49 #include <ctype.h> 50 51 /* name of this module, minus the leading underscore */ 52 #if !defined(SRE_MODULE) 53 #define SRE_MODULE "sre" 54 #endif 55 56 #define SRE_PY_MODULE "re" 57 58 /* defining this one enables tracing */ 59 #undef VERBOSE 60 61 #if PY_VERSION_HEX >= 0x01060000 62 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE) 63 /* defining this enables unicode support (default under 1.6a1 and later) */ 64 #define HAVE_UNICODE 65 #endif 66 #endif 67 68 /* -------------------------------------------------------------------- */ 69 /* optional features */ 70 71 /* enables fast searching */ 72 #define USE_FAST_SEARCH 73 74 /* enables aggressive inlining (always on for Visual C) */ 75 #undef USE_INLINE 76 77 /* enables copy/deepcopy handling (work in progress) */ 78 #undef USE_BUILTIN_COPY 79 80 #if PY_VERSION_HEX < 0x01060000 81 #define PyObject_DEL(op) PyMem_DEL((op)) 82 #endif 83 84 /* -------------------------------------------------------------------- */ 85 86 #if defined(_MSC_VER) 87 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */ 88 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */ 89 /* fastest possible local call under MSVC */ 90 #define LOCAL(type) static __inline type __fastcall 91 #elif defined(USE_INLINE) 92 #define LOCAL(type) static inline type 93 #else 94 #define LOCAL(type) static type 95 #endif 96 97 /* error codes */ 98 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */ 99 #define SRE_ERROR_STATE -2 /* illegal state */ 100 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */ 101 #define SRE_ERROR_MEMORY -9 /* out of memory */ 102 #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */ 103 104 #if defined(VERBOSE) 105 #define TRACE(v) printf v 106 #else 107 #define TRACE(v) 108 #endif 109 110 /* -------------------------------------------------------------------- */ 111 /* search engine state */ 112 113 /* default character predicates (run sre_chars.py to regenerate tables) */ 114 115 #define SRE_DIGIT_MASK 1 116 #define SRE_SPACE_MASK 2 117 #define SRE_LINEBREAK_MASK 4 118 #define SRE_ALNUM_MASK 8 119 #define SRE_WORD_MASK 16 120 121 /* FIXME: this assumes ASCII. create tables in init_sre() instead */ 122 123 static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, 124 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 125 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25, 126 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 127 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 128 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 129 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 }; 130 131 static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 132 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 133 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 134 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 135 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 136 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 137 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 138 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 139 120, 121, 122, 123, 124, 125, 126, 127 }; 140 141 #define SRE_IS_DIGIT(ch)\ 142 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0) 143 #define SRE_IS_SPACE(ch)\ 144 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0) 145 #define SRE_IS_LINEBREAK(ch)\ 146 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0) 147 #define SRE_IS_ALNUM(ch)\ 148 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0) 149 #define SRE_IS_WORD(ch)\ 150 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0) 151 152 static unsigned int sre_lower(unsigned int ch) 153 { 154 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch); 155 } 156 157 /* locale-specific character predicates */ 158 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids 159 * warnings when c's type supports only numbers < N+1 */ 160 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0) 161 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0) 162 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n') 163 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0) 164 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_') 165 166 static unsigned int sre_lower_locale(unsigned int ch) 167 { 168 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch); 169 } 170 171 /* unicode-specific character predicates */ 172 173 #if defined(HAVE_UNICODE) 174 175 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch)) 176 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch)) 177 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch)) 178 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch)) 179 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_') 180 181 static unsigned int sre_lower_unicode(unsigned int ch) 182 { 183 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch)); 184 } 185 186 #endif 187 188 LOCAL(int) 189 sre_category(SRE_CODE category, unsigned int ch) 190 { 191 switch (category) { 192 193 case SRE_CATEGORY_DIGIT: 194 return SRE_IS_DIGIT(ch); 195 case SRE_CATEGORY_NOT_DIGIT: 196 return !SRE_IS_DIGIT(ch); 197 case SRE_CATEGORY_SPACE: 198 return SRE_IS_SPACE(ch); 199 case SRE_CATEGORY_NOT_SPACE: 200 return !SRE_IS_SPACE(ch); 201 case SRE_CATEGORY_WORD: 202 return SRE_IS_WORD(ch); 203 case SRE_CATEGORY_NOT_WORD: 204 return !SRE_IS_WORD(ch); 205 case SRE_CATEGORY_LINEBREAK: 206 return SRE_IS_LINEBREAK(ch); 207 case SRE_CATEGORY_NOT_LINEBREAK: 208 return !SRE_IS_LINEBREAK(ch); 209 210 case SRE_CATEGORY_LOC_WORD: 211 return SRE_LOC_IS_WORD(ch); 212 case SRE_CATEGORY_LOC_NOT_WORD: 213 return !SRE_LOC_IS_WORD(ch); 214 215 #if defined(HAVE_UNICODE) 216 case SRE_CATEGORY_UNI_DIGIT: 217 return SRE_UNI_IS_DIGIT(ch); 218 case SRE_CATEGORY_UNI_NOT_DIGIT: 219 return !SRE_UNI_IS_DIGIT(ch); 220 case SRE_CATEGORY_UNI_SPACE: 221 return SRE_UNI_IS_SPACE(ch); 222 case SRE_CATEGORY_UNI_NOT_SPACE: 223 return !SRE_UNI_IS_SPACE(ch); 224 case SRE_CATEGORY_UNI_WORD: 225 return SRE_UNI_IS_WORD(ch); 226 case SRE_CATEGORY_UNI_NOT_WORD: 227 return !SRE_UNI_IS_WORD(ch); 228 case SRE_CATEGORY_UNI_LINEBREAK: 229 return SRE_UNI_IS_LINEBREAK(ch); 230 case SRE_CATEGORY_UNI_NOT_LINEBREAK: 231 return !SRE_UNI_IS_LINEBREAK(ch); 232 #else 233 case SRE_CATEGORY_UNI_DIGIT: 234 return SRE_IS_DIGIT(ch); 235 case SRE_CATEGORY_UNI_NOT_DIGIT: 236 return !SRE_IS_DIGIT(ch); 237 case SRE_CATEGORY_UNI_SPACE: 238 return SRE_IS_SPACE(ch); 239 case SRE_CATEGORY_UNI_NOT_SPACE: 240 return !SRE_IS_SPACE(ch); 241 case SRE_CATEGORY_UNI_WORD: 242 return SRE_LOC_IS_WORD(ch); 243 case SRE_CATEGORY_UNI_NOT_WORD: 244 return !SRE_LOC_IS_WORD(ch); 245 case SRE_CATEGORY_UNI_LINEBREAK: 246 return SRE_IS_LINEBREAK(ch); 247 case SRE_CATEGORY_UNI_NOT_LINEBREAK: 248 return !SRE_IS_LINEBREAK(ch); 249 #endif 250 } 251 return 0; 252 } 253 254 /* helpers */ 255 256 static void 257 data_stack_dealloc(SRE_STATE* state) 258 { 259 if (state->data_stack) { 260 PyMem_FREE(state->data_stack); 261 state->data_stack = NULL; 262 } 263 state->data_stack_size = state->data_stack_base = 0; 264 } 265 266 static int 267 data_stack_grow(SRE_STATE* state, Py_ssize_t size) 268 { 269 Py_ssize_t minsize, cursize; 270 minsize = state->data_stack_base+size; 271 cursize = state->data_stack_size; 272 if (cursize < minsize) { 273 void* stack; 274 cursize = minsize+minsize/4+1024; 275 TRACE(("allocate/grow stack %" PY_FORMAT_SIZE_T "d\n", cursize)); 276 stack = PyMem_REALLOC(state->data_stack, cursize); 277 if (!stack) { 278 data_stack_dealloc(state); 279 return SRE_ERROR_MEMORY; 280 } 281 state->data_stack = (char *)stack; 282 state->data_stack_size = cursize; 283 } 284 return 0; 285 } 286 287 /* generate 8-bit version */ 288 289 #define SRE_CHAR unsigned char 290 #define SRE_AT sre_at 291 #define SRE_COUNT sre_count 292 #define SRE_CHARSET sre_charset 293 #define SRE_INFO sre_info 294 #define SRE_MATCH sre_match 295 #define SRE_MATCH_CONTEXT sre_match_context 296 #define SRE_SEARCH sre_search 297 #define SRE_LITERAL_TEMPLATE sre_literal_template 298 299 #if defined(HAVE_UNICODE) 300 301 #define SRE_RECURSIVE 302 #include "_sre.c" 303 #undef SRE_RECURSIVE 304 305 #undef SRE_LITERAL_TEMPLATE 306 #undef SRE_SEARCH 307 #undef SRE_MATCH 308 #undef SRE_MATCH_CONTEXT 309 #undef SRE_INFO 310 #undef SRE_CHARSET 311 #undef SRE_COUNT 312 #undef SRE_AT 313 #undef SRE_CHAR 314 315 /* generate 16-bit unicode version */ 316 317 #define SRE_CHAR Py_UNICODE 318 #define SRE_AT sre_uat 319 #define SRE_COUNT sre_ucount 320 #define SRE_CHARSET sre_ucharset 321 #define SRE_INFO sre_uinfo 322 #define SRE_MATCH sre_umatch 323 #define SRE_MATCH_CONTEXT sre_umatch_context 324 #define SRE_SEARCH sre_usearch 325 #define SRE_LITERAL_TEMPLATE sre_uliteral_template 326 #endif 327 328 #endif /* SRE_RECURSIVE */ 329 330 /* -------------------------------------------------------------------- */ 331 /* String matching engine */ 332 333 /* the following section is compiled twice, with different character 334 settings */ 335 336 LOCAL(int) 337 SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) 338 { 339 /* check if pointer is at given position */ 340 341 Py_ssize_t thisp, thatp; 342 343 switch (at) { 344 345 case SRE_AT_BEGINNING: 346 case SRE_AT_BEGINNING_STRING: 347 return ((void*) ptr == state->beginning); 348 349 case SRE_AT_BEGINNING_LINE: 350 return ((void*) ptr == state->beginning || 351 SRE_IS_LINEBREAK((int) ptr[-1])); 352 353 case SRE_AT_END: 354 return (((SRE_CHAR *)state->end - ptr == 1 && 355 SRE_IS_LINEBREAK((int) ptr[0])) || 356 ((void*) ptr == state->end)); 357 358 case SRE_AT_END_LINE: 359 return ((void*) ptr == state->end || 360 SRE_IS_LINEBREAK((int) ptr[0])); 361 362 case SRE_AT_END_STRING: 363 return ((void*) ptr == state->end); 364 365 case SRE_AT_BOUNDARY: 366 if (state->beginning == state->end) 367 return 0; 368 thatp = ((void*) ptr > state->beginning) ? 369 SRE_IS_WORD((int) ptr[-1]) : 0; 370 thisp = ((void*) ptr < state->end) ? 371 SRE_IS_WORD((int) ptr[0]) : 0; 372 return thisp != thatp; 373 374 case SRE_AT_NON_BOUNDARY: 375 if (state->beginning == state->end) 376 return 0; 377 thatp = ((void*) ptr > state->beginning) ? 378 SRE_IS_WORD((int) ptr[-1]) : 0; 379 thisp = ((void*) ptr < state->end) ? 380 SRE_IS_WORD((int) ptr[0]) : 0; 381 return thisp == thatp; 382 383 case SRE_AT_LOC_BOUNDARY: 384 if (state->beginning == state->end) 385 return 0; 386 thatp = ((void*) ptr > state->beginning) ? 387 SRE_LOC_IS_WORD((int) ptr[-1]) : 0; 388 thisp = ((void*) ptr < state->end) ? 389 SRE_LOC_IS_WORD((int) ptr[0]) : 0; 390 return thisp != thatp; 391 392 case SRE_AT_LOC_NON_BOUNDARY: 393 if (state->beginning == state->end) 394 return 0; 395 thatp = ((void*) ptr > state->beginning) ? 396 SRE_LOC_IS_WORD((int) ptr[-1]) : 0; 397 thisp = ((void*) ptr < state->end) ? 398 SRE_LOC_IS_WORD((int) ptr[0]) : 0; 399 return thisp == thatp; 400 401 #if defined(HAVE_UNICODE) 402 case SRE_AT_UNI_BOUNDARY: 403 if (state->beginning == state->end) 404 return 0; 405 thatp = ((void*) ptr > state->beginning) ? 406 SRE_UNI_IS_WORD((int) ptr[-1]) : 0; 407 thisp = ((void*) ptr < state->end) ? 408 SRE_UNI_IS_WORD((int) ptr[0]) : 0; 409 return thisp != thatp; 410 411 case SRE_AT_UNI_NON_BOUNDARY: 412 if (state->beginning == state->end) 413 return 0; 414 thatp = ((void*) ptr > state->beginning) ? 415 SRE_UNI_IS_WORD((int) ptr[-1]) : 0; 416 thisp = ((void*) ptr < state->end) ? 417 SRE_UNI_IS_WORD((int) ptr[0]) : 0; 418 return thisp == thatp; 419 #endif 420 421 } 422 423 return 0; 424 } 425 426 LOCAL(int) 427 SRE_CHARSET(SRE_CODE* set, SRE_CODE ch) 428 { 429 /* check if character is a member of the given set */ 430 431 int ok = 1; 432 433 for (;;) { 434 switch (*set++) { 435 436 case SRE_OP_FAILURE: 437 return !ok; 438 439 case SRE_OP_LITERAL: 440 /* <LITERAL> <code> */ 441 if (ch == set[0]) 442 return ok; 443 set++; 444 break; 445 446 case SRE_OP_CATEGORY: 447 /* <CATEGORY> <code> */ 448 if (sre_category(set[0], (int) ch)) 449 return ok; 450 set += 1; 451 break; 452 453 case SRE_OP_CHARSET: 454 if (sizeof(SRE_CODE) == 2) { 455 /* <CHARSET> <bitmap> (16 bits per code word) */ 456 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15)))) 457 return ok; 458 set += 16; 459 } 460 else { 461 /* <CHARSET> <bitmap> (32 bits per code word) */ 462 if (ch < 256 && (set[ch >> 5] & (1u << (ch & 31)))) 463 return ok; 464 set += 8; 465 } 466 break; 467 468 case SRE_OP_RANGE: 469 /* <RANGE> <lower> <upper> */ 470 if (set[0] <= ch && ch <= set[1]) 471 return ok; 472 set += 2; 473 break; 474 475 case SRE_OP_NEGATE: 476 ok = !ok; 477 break; 478 479 case SRE_OP_BIGCHARSET: 480 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */ 481 { 482 Py_ssize_t count, block; 483 count = *(set++); 484 485 if (sizeof(SRE_CODE) == 2) { 486 block = ((unsigned char*)set)[ch >> 8]; 487 set += 128; 488 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15))) 489 return ok; 490 set += count*16; 491 } 492 else { 493 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids 494 * warnings when c's type supports only numbers < N+1 */ 495 if (!(ch & ~65535)) 496 block = ((unsigned char*)set)[ch >> 8]; 497 else 498 block = -1; 499 set += 64; 500 if (block >=0 && 501 (set[block*8 + ((ch & 255)>>5)] & (1u << (ch & 31)))) 502 return ok; 503 set += count*8; 504 } 505 break; 506 } 507 508 default: 509 /* internal error -- there's not much we can do about it 510 here, so let's just pretend it didn't match... */ 511 return 0; 512 } 513 } 514 } 515 516 LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern); 517 518 LOCAL(Py_ssize_t) 519 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) 520 { 521 SRE_CODE chr; 522 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr; 523 SRE_CHAR* end = (SRE_CHAR *)state->end; 524 Py_ssize_t i; 525 526 /* adjust end */ 527 if (maxcount < end - ptr && maxcount != SRE_MAXREPEAT) 528 end = ptr + maxcount; 529 530 switch (pattern[0]) { 531 532 case SRE_OP_IN: 533 /* repeated set */ 534 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr)); 535 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr)) 536 ptr++; 537 break; 538 539 case SRE_OP_ANY: 540 /* repeated dot wildcard. */ 541 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr)); 542 while (ptr < end && !SRE_IS_LINEBREAK(*ptr)) 543 ptr++; 544 break; 545 546 case SRE_OP_ANY_ALL: 547 /* repeated dot wildcard. skip to the end of the target 548 string, and backtrack from there */ 549 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr)); 550 ptr = end; 551 break; 552 553 case SRE_OP_LITERAL: 554 /* repeated literal */ 555 chr = pattern[1]; 556 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr)); 557 while (ptr < end && (SRE_CODE) *ptr == chr) 558 ptr++; 559 break; 560 561 case SRE_OP_LITERAL_IGNORE: 562 /* repeated literal */ 563 chr = pattern[1]; 564 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr)); 565 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr) 566 ptr++; 567 break; 568 569 case SRE_OP_NOT_LITERAL: 570 /* repeated non-literal */ 571 chr = pattern[1]; 572 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr)); 573 while (ptr < end && (SRE_CODE) *ptr != chr) 574 ptr++; 575 break; 576 577 case SRE_OP_NOT_LITERAL_IGNORE: 578 /* repeated non-literal */ 579 chr = pattern[1]; 580 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr)); 581 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr) 582 ptr++; 583 break; 584 585 default: 586 /* repeated single character pattern */ 587 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr)); 588 while ((SRE_CHAR*) state->ptr < end) { 589 i = SRE_MATCH(state, pattern); 590 if (i < 0) 591 return i; 592 if (!i) 593 break; 594 } 595 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr, 596 (SRE_CHAR*) state->ptr - ptr)); 597 return (SRE_CHAR*) state->ptr - ptr; 598 } 599 600 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr, 601 ptr - (SRE_CHAR*) state->ptr)); 602 return ptr - (SRE_CHAR*) state->ptr; 603 } 604 605 #if 0 /* not used in this release */ 606 LOCAL(int) 607 SRE_INFO(SRE_STATE* state, SRE_CODE* pattern) 608 { 609 /* check if an SRE_OP_INFO block matches at the current position. 610 returns the number of SRE_CODE objects to skip if successful, 0 611 if no match */ 612 613 SRE_CHAR* end = state->end; 614 SRE_CHAR* ptr = state->ptr; 615 Py_ssize_t i; 616 617 /* check minimal length */ 618 if (pattern[3] && (end - ptr) < pattern[3]) 619 return 0; 620 621 /* check known prefix */ 622 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) { 623 /* <length> <skip> <prefix data> <overlap data> */ 624 for (i = 0; i < pattern[5]; i++) 625 if ((SRE_CODE) ptr[i] != pattern[7 + i]) 626 return 0; 627 return pattern[0] + 2 * pattern[6]; 628 } 629 return pattern[0]; 630 } 631 #endif 632 633 /* The macros below should be used to protect recursive SRE_MATCH() 634 * calls that *failed* and do *not* return immediately (IOW, those 635 * that will backtrack). Explaining: 636 * 637 * - Recursive SRE_MATCH() returned true: that's usually a success 638 * (besides atypical cases like ASSERT_NOT), therefore there's no 639 * reason to restore lastmark; 640 * 641 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH() 642 * is returning to the caller: If the current SRE_MATCH() is the 643 * top function of the recursion, returning false will be a matching 644 * failure, and it doesn't matter where lastmark is pointing to. 645 * If it's *not* the top function, it will be a recursive SRE_MATCH() 646 * failure by itself, and the calling SRE_MATCH() will have to deal 647 * with the failure by the same rules explained here (it will restore 648 * lastmark by itself if necessary); 649 * 650 * - Recursive SRE_MATCH() returned false, and will continue the 651 * outside 'for' loop: must be protected when breaking, since the next 652 * OP could potentially depend on lastmark; 653 * 654 * - Recursive SRE_MATCH() returned false, and will be called again 655 * inside a local for/while loop: must be protected between each 656 * loop iteration, since the recursive SRE_MATCH() could do anything, 657 * and could potentially depend on lastmark. 658 * 659 * For more information, check the discussion at SF patch #712900. 660 */ 661 #define LASTMARK_SAVE() \ 662 do { \ 663 ctx->lastmark = state->lastmark; \ 664 ctx->lastindex = state->lastindex; \ 665 } while (0) 666 #define LASTMARK_RESTORE() \ 667 do { \ 668 state->lastmark = ctx->lastmark; \ 669 state->lastindex = ctx->lastindex; \ 670 } while (0) 671 672 #define RETURN_ERROR(i) do { return i; } while(0) 673 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0) 674 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0) 675 676 #define RETURN_ON_ERROR(i) \ 677 do { if (i < 0) RETURN_ERROR(i); } while (0) 678 #define RETURN_ON_SUCCESS(i) \ 679 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0) 680 #define RETURN_ON_FAILURE(i) \ 681 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0) 682 683 #define SFY(x) #x 684 685 #define DATA_STACK_ALLOC(state, type, ptr) \ 686 do { \ 687 alloc_pos = state->data_stack_base; \ 688 TRACE(("allocating %s in %" PY_FORMAT_SIZE_T "d " \ 689 "(%" PY_FORMAT_SIZE_T "d)\n", \ 690 SFY(type), alloc_pos, sizeof(type))); \ 691 if (sizeof(type) > state->data_stack_size - alloc_pos) { \ 692 int j = data_stack_grow(state, sizeof(type)); \ 693 if (j < 0) return j; \ 694 if (ctx_pos != -1) \ 695 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \ 696 } \ 697 ptr = (type*)(state->data_stack+alloc_pos); \ 698 state->data_stack_base += sizeof(type); \ 699 } while (0) 700 701 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \ 702 do { \ 703 TRACE(("looking up %s at %" PY_FORMAT_SIZE_T "d\n", SFY(type), pos)); \ 704 ptr = (type*)(state->data_stack+pos); \ 705 } while (0) 706 707 #define DATA_STACK_PUSH(state, data, size) \ 708 do { \ 709 TRACE(("copy data in %p to %" PY_FORMAT_SIZE_T "d " \ 710 "(%" PY_FORMAT_SIZE_T "d)\n", \ 711 data, state->data_stack_base, size)); \ 712 if (size > state->data_stack_size - state->data_stack_base) { \ 713 int j = data_stack_grow(state, size); \ 714 if (j < 0) return j; \ 715 if (ctx_pos != -1) \ 716 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \ 717 } \ 718 memcpy(state->data_stack+state->data_stack_base, data, size); \ 719 state->data_stack_base += size; \ 720 } while (0) 721 722 #define DATA_STACK_POP(state, data, size, discard) \ 723 do { \ 724 TRACE(("copy data to %p from %" PY_FORMAT_SIZE_T "d " \ 725 "(%" PY_FORMAT_SIZE_T "d)\n", \ 726 data, state->data_stack_base-size, size)); \ 727 memcpy(data, state->data_stack+state->data_stack_base-size, size); \ 728 if (discard) \ 729 state->data_stack_base -= size; \ 730 } while (0) 731 732 #define DATA_STACK_POP_DISCARD(state, size) \ 733 do { \ 734 TRACE(("discard data from %" PY_FORMAT_SIZE_T "d " \ 735 "(%" PY_FORMAT_SIZE_T "d)\n", \ 736 state->data_stack_base-size, size)); \ 737 state->data_stack_base -= size; \ 738 } while(0) 739 740 #define DATA_PUSH(x) \ 741 DATA_STACK_PUSH(state, (x), sizeof(*(x))) 742 #define DATA_POP(x) \ 743 DATA_STACK_POP(state, (x), sizeof(*(x)), 1) 744 #define DATA_POP_DISCARD(x) \ 745 DATA_STACK_POP_DISCARD(state, sizeof(*(x))) 746 #define DATA_ALLOC(t,p) \ 747 DATA_STACK_ALLOC(state, t, p) 748 #define DATA_LOOKUP_AT(t,p,pos) \ 749 DATA_STACK_LOOKUP_AT(state,t,p,pos) 750 751 #define MARK_PUSH(lastmark) \ 752 do if (lastmark > 0) { \ 753 i = lastmark; /* ctx->lastmark may change if reallocated */ \ 754 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \ 755 } while (0) 756 #define MARK_POP(lastmark) \ 757 do if (lastmark > 0) { \ 758 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \ 759 } while (0) 760 #define MARK_POP_KEEP(lastmark) \ 761 do if (lastmark > 0) { \ 762 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \ 763 } while (0) 764 #define MARK_POP_DISCARD(lastmark) \ 765 do if (lastmark > 0) { \ 766 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \ 767 } while (0) 768 769 #define JUMP_NONE 0 770 #define JUMP_MAX_UNTIL_1 1 771 #define JUMP_MAX_UNTIL_2 2 772 #define JUMP_MAX_UNTIL_3 3 773 #define JUMP_MIN_UNTIL_1 4 774 #define JUMP_MIN_UNTIL_2 5 775 #define JUMP_MIN_UNTIL_3 6 776 #define JUMP_REPEAT 7 777 #define JUMP_REPEAT_ONE_1 8 778 #define JUMP_REPEAT_ONE_2 9 779 #define JUMP_MIN_REPEAT_ONE 10 780 #define JUMP_BRANCH 11 781 #define JUMP_ASSERT 12 782 #define JUMP_ASSERT_NOT 13 783 784 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \ 785 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \ 786 nextctx->last_ctx_pos = ctx_pos; \ 787 nextctx->jump = jumpvalue; \ 788 nextctx->pattern = nextpattern; \ 789 ctx_pos = alloc_pos; \ 790 ctx = nextctx; \ 791 goto entrance; \ 792 jumplabel: \ 793 while (0) /* gcc doesn't like labels at end of scopes */ \ 794 795 typedef struct { 796 Py_ssize_t last_ctx_pos; 797 Py_ssize_t jump; 798 SRE_CHAR* ptr; 799 SRE_CODE* pattern; 800 Py_ssize_t count; 801 Py_ssize_t lastmark; 802 Py_ssize_t lastindex; 803 union { 804 SRE_CODE chr; 805 SRE_REPEAT* rep; 806 } u; 807 } SRE_MATCH_CONTEXT; 808 809 /* check if string matches the given pattern. returns <0 for 810 error, 0 for failure, and 1 for success */ 811 LOCAL(Py_ssize_t) 812 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) 813 { 814 SRE_CHAR* end = (SRE_CHAR *)state->end; 815 Py_ssize_t alloc_pos, ctx_pos = -1; 816 Py_ssize_t i, ret = 0; 817 Py_ssize_t jump; 818 unsigned int sigcount=0; 819 820 SRE_MATCH_CONTEXT* ctx; 821 SRE_MATCH_CONTEXT* nextctx; 822 823 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr)); 824 825 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx); 826 ctx->last_ctx_pos = -1; 827 ctx->jump = JUMP_NONE; 828 ctx->pattern = pattern; 829 ctx_pos = alloc_pos; 830 831 entrance: 832 833 ctx->ptr = (SRE_CHAR *)state->ptr; 834 835 if (ctx->pattern[0] == SRE_OP_INFO) { 836 /* optimization info block */ 837 /* <INFO> <1=skip> <2=flags> <3=min> ... */ 838 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) { 839 TRACE(("reject (got %" PY_FORMAT_SIZE_T "d chars, " 840 "need %" PY_FORMAT_SIZE_T "d)\n", 841 (end - ctx->ptr), (Py_ssize_t) ctx->pattern[3])); 842 RETURN_FAILURE; 843 } 844 ctx->pattern += ctx->pattern[1] + 1; 845 } 846 847 for (;;) { 848 ++sigcount; 849 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals()) 850 RETURN_ERROR(SRE_ERROR_INTERRUPTED); 851 852 switch (*ctx->pattern++) { 853 854 case SRE_OP_MARK: 855 /* set mark */ 856 /* <MARK> <gid> */ 857 TRACE(("|%p|%p|MARK %d\n", ctx->pattern, 858 ctx->ptr, ctx->pattern[0])); 859 i = ctx->pattern[0]; 860 if (i & 1) 861 state->lastindex = i/2 + 1; 862 if (i > state->lastmark) { 863 /* state->lastmark is the highest valid index in the 864 state->mark array. If it is increased by more than 1, 865 the intervening marks must be set to NULL to signal 866 that these marks have not been encountered. */ 867 Py_ssize_t j = state->lastmark + 1; 868 while (j < i) 869 state->mark[j++] = NULL; 870 state->lastmark = i; 871 } 872 state->mark[i] = ctx->ptr; 873 ctx->pattern++; 874 break; 875 876 case SRE_OP_LITERAL: 877 /* match literal string */ 878 /* <LITERAL> <code> */ 879 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern, 880 ctx->ptr, *ctx->pattern)); 881 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0]) 882 RETURN_FAILURE; 883 ctx->pattern++; 884 ctx->ptr++; 885 break; 886 887 case SRE_OP_NOT_LITERAL: 888 /* match anything that is not literal character */ 889 /* <NOT_LITERAL> <code> */ 890 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern, 891 ctx->ptr, *ctx->pattern)); 892 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0]) 893 RETURN_FAILURE; 894 ctx->pattern++; 895 ctx->ptr++; 896 break; 897 898 case SRE_OP_SUCCESS: 899 /* end of pattern */ 900 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr)); 901 state->ptr = ctx->ptr; 902 RETURN_SUCCESS; 903 904 case SRE_OP_AT: 905 /* match at given position */ 906 /* <AT> <code> */ 907 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern)); 908 if (!SRE_AT(state, ctx->ptr, *ctx->pattern)) 909 RETURN_FAILURE; 910 ctx->pattern++; 911 break; 912 913 case SRE_OP_CATEGORY: 914 /* match at given category */ 915 /* <CATEGORY> <code> */ 916 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern, 917 ctx->ptr, *ctx->pattern)); 918 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0])) 919 RETURN_FAILURE; 920 ctx->pattern++; 921 ctx->ptr++; 922 break; 923 924 case SRE_OP_ANY: 925 /* match anything (except a newline) */ 926 /* <ANY> */ 927 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr)); 928 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0])) 929 RETURN_FAILURE; 930 ctx->ptr++; 931 break; 932 933 case SRE_OP_ANY_ALL: 934 /* match anything */ 935 /* <ANY_ALL> */ 936 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr)); 937 if (ctx->ptr >= end) 938 RETURN_FAILURE; 939 ctx->ptr++; 940 break; 941 942 case SRE_OP_IN: 943 /* match set member (or non_member) */ 944 /* <IN> <skip> <set> */ 945 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr)); 946 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr)) 947 RETURN_FAILURE; 948 ctx->pattern += ctx->pattern[0]; 949 ctx->ptr++; 950 break; 951 952 case SRE_OP_LITERAL_IGNORE: 953 TRACE(("|%p|%p|LITERAL_IGNORE %d\n", 954 ctx->pattern, ctx->ptr, ctx->pattern[0])); 955 if (ctx->ptr >= end || 956 state->lower(*ctx->ptr) != state->lower(*ctx->pattern)) 957 RETURN_FAILURE; 958 ctx->pattern++; 959 ctx->ptr++; 960 break; 961 962 case SRE_OP_NOT_LITERAL_IGNORE: 963 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", 964 ctx->pattern, ctx->ptr, *ctx->pattern)); 965 if (ctx->ptr >= end || 966 state->lower(*ctx->ptr) == state->lower(*ctx->pattern)) 967 RETURN_FAILURE; 968 ctx->pattern++; 969 ctx->ptr++; 970 break; 971 972 case SRE_OP_IN_IGNORE: 973 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr)); 974 if (ctx->ptr >= end 975 || !SRE_CHARSET(ctx->pattern+1, 976 (SRE_CODE)state->lower(*ctx->ptr))) 977 RETURN_FAILURE; 978 ctx->pattern += ctx->pattern[0]; 979 ctx->ptr++; 980 break; 981 982 case SRE_OP_JUMP: 983 case SRE_OP_INFO: 984 /* jump forward */ 985 /* <JUMP> <offset> */ 986 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern, 987 ctx->ptr, ctx->pattern[0])); 988 ctx->pattern += ctx->pattern[0]; 989 break; 990 991 case SRE_OP_BRANCH: 992 /* alternation */ 993 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */ 994 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr)); 995 LASTMARK_SAVE(); 996 ctx->u.rep = state->repeat; 997 if (ctx->u.rep) 998 MARK_PUSH(ctx->lastmark); 999 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) { 1000 if (ctx->pattern[1] == SRE_OP_LITERAL && 1001 (ctx->ptr >= end || 1002 (SRE_CODE) *ctx->ptr != ctx->pattern[2])) 1003 continue; 1004 if (ctx->pattern[1] == SRE_OP_IN && 1005 (ctx->ptr >= end || 1006 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr))) 1007 continue; 1008 state->ptr = ctx->ptr; 1009 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1); 1010 if (ret) { 1011 if (ctx->u.rep) 1012 MARK_POP_DISCARD(ctx->lastmark); 1013 RETURN_ON_ERROR(ret); 1014 RETURN_SUCCESS; 1015 } 1016 if (ctx->u.rep) 1017 MARK_POP_KEEP(ctx->lastmark); 1018 LASTMARK_RESTORE(); 1019 } 1020 if (ctx->u.rep) 1021 MARK_POP_DISCARD(ctx->lastmark); 1022 RETURN_FAILURE; 1023 1024 case SRE_OP_REPEAT_ONE: 1025 /* match repeated sequence (maximizing regexp) */ 1026 1027 /* this operator only works if the repeated item is 1028 exactly one character wide, and we're not already 1029 collecting backtracking points. for other cases, 1030 use the MAX_REPEAT operator */ 1031 1032 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */ 1033 1034 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr, 1035 ctx->pattern[1], ctx->pattern[2])); 1036 1037 if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr) 1038 RETURN_FAILURE; /* cannot match */ 1039 1040 state->ptr = ctx->ptr; 1041 1042 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]); 1043 RETURN_ON_ERROR(ret); 1044 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos); 1045 ctx->count = ret; 1046 ctx->ptr += ctx->count; 1047 1048 /* when we arrive here, count contains the number of 1049 matches, and ctx->ptr points to the tail of the target 1050 string. check if the rest of the pattern matches, 1051 and backtrack if not. */ 1052 1053 if (ctx->count < (Py_ssize_t) ctx->pattern[1]) 1054 RETURN_FAILURE; 1055 1056 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) { 1057 /* tail is empty. we're finished */ 1058 state->ptr = ctx->ptr; 1059 RETURN_SUCCESS; 1060 } 1061 1062 LASTMARK_SAVE(); 1063 1064 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) { 1065 /* tail starts with a literal. skip positions where 1066 the rest of the pattern cannot possibly match */ 1067 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1]; 1068 for (;;) { 1069 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] && 1070 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) { 1071 ctx->ptr--; 1072 ctx->count--; 1073 } 1074 if (ctx->count < (Py_ssize_t) ctx->pattern[1]) 1075 break; 1076 state->ptr = ctx->ptr; 1077 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1, 1078 ctx->pattern+ctx->pattern[0]); 1079 if (ret) { 1080 RETURN_ON_ERROR(ret); 1081 RETURN_SUCCESS; 1082 } 1083 1084 LASTMARK_RESTORE(); 1085 1086 ctx->ptr--; 1087 ctx->count--; 1088 } 1089 1090 } else { 1091 /* general case */ 1092 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) { 1093 state->ptr = ctx->ptr; 1094 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2, 1095 ctx->pattern+ctx->pattern[0]); 1096 if (ret) { 1097 RETURN_ON_ERROR(ret); 1098 RETURN_SUCCESS; 1099 } 1100 ctx->ptr--; 1101 ctx->count--; 1102 LASTMARK_RESTORE(); 1103 } 1104 } 1105 RETURN_FAILURE; 1106 1107 case SRE_OP_MIN_REPEAT_ONE: 1108 /* match repeated sequence (minimizing regexp) */ 1109 1110 /* this operator only works if the repeated item is 1111 exactly one character wide, and we're not already 1112 collecting backtracking points. for other cases, 1113 use the MIN_REPEAT operator */ 1114 1115 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */ 1116 1117 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr, 1118 ctx->pattern[1], ctx->pattern[2])); 1119 1120 if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr) 1121 RETURN_FAILURE; /* cannot match */ 1122 1123 state->ptr = ctx->ptr; 1124 1125 if (ctx->pattern[1] == 0) 1126 ctx->count = 0; 1127 else { 1128 /* count using pattern min as the maximum */ 1129 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]); 1130 RETURN_ON_ERROR(ret); 1131 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos); 1132 if (ret < (Py_ssize_t) ctx->pattern[1]) 1133 /* didn't match minimum number of times */ 1134 RETURN_FAILURE; 1135 /* advance past minimum matches of repeat */ 1136 ctx->count = ret; 1137 ctx->ptr += ctx->count; 1138 } 1139 1140 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) { 1141 /* tail is empty. we're finished */ 1142 state->ptr = ctx->ptr; 1143 RETURN_SUCCESS; 1144 1145 } else { 1146 /* general case */ 1147 LASTMARK_SAVE(); 1148 while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT 1149 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) { 1150 state->ptr = ctx->ptr; 1151 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one, 1152 ctx->pattern+ctx->pattern[0]); 1153 if (ret) { 1154 RETURN_ON_ERROR(ret); 1155 RETURN_SUCCESS; 1156 } 1157 state->ptr = ctx->ptr; 1158 ret = SRE_COUNT(state, ctx->pattern+3, 1); 1159 RETURN_ON_ERROR(ret); 1160 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos); 1161 if (ret == 0) 1162 break; 1163 assert(ret == 1); 1164 ctx->ptr++; 1165 ctx->count++; 1166 LASTMARK_RESTORE(); 1167 } 1168 } 1169 RETURN_FAILURE; 1170 1171 case SRE_OP_REPEAT: 1172 /* create repeat context. all the hard work is done 1173 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ 1174 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */ 1175 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr, 1176 ctx->pattern[1], ctx->pattern[2])); 1177 1178 /* install new repeat context */ 1179 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep)); 1180 if (!ctx->u.rep) { 1181 PyErr_NoMemory(); 1182 RETURN_FAILURE; 1183 } 1184 ctx->u.rep->count = -1; 1185 ctx->u.rep->pattern = ctx->pattern; 1186 ctx->u.rep->prev = state->repeat; 1187 ctx->u.rep->last_ptr = NULL; 1188 state->repeat = ctx->u.rep; 1189 1190 state->ptr = ctx->ptr; 1191 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]); 1192 state->repeat = ctx->u.rep->prev; 1193 PyObject_FREE(ctx->u.rep); 1194 1195 if (ret) { 1196 RETURN_ON_ERROR(ret); 1197 RETURN_SUCCESS; 1198 } 1199 RETURN_FAILURE; 1200 1201 case SRE_OP_MAX_UNTIL: 1202 /* maximizing repeat */ 1203 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */ 1204 1205 /* FIXME: we probably need to deal with zero-width 1206 matches in here... */ 1207 1208 ctx->u.rep = state->repeat; 1209 if (!ctx->u.rep) 1210 RETURN_ERROR(SRE_ERROR_STATE); 1211 1212 state->ptr = ctx->ptr; 1213 1214 ctx->count = ctx->u.rep->count+1; 1215 1216 TRACE(("|%p|%p|MAX_UNTIL %" PY_FORMAT_SIZE_T "d\n", ctx->pattern, 1217 ctx->ptr, ctx->count)); 1218 1219 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) { 1220 /* not enough matches */ 1221 ctx->u.rep->count = ctx->count; 1222 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1, 1223 ctx->u.rep->pattern+3); 1224 if (ret) { 1225 RETURN_ON_ERROR(ret); 1226 RETURN_SUCCESS; 1227 } 1228 ctx->u.rep->count = ctx->count-1; 1229 state->ptr = ctx->ptr; 1230 RETURN_FAILURE; 1231 } 1232 1233 if ((ctx->count < (Py_ssize_t) ctx->u.rep->pattern[2] || 1234 ctx->u.rep->pattern[2] == SRE_MAXREPEAT) && 1235 state->ptr != ctx->u.rep->last_ptr) { 1236 /* we may have enough matches, but if we can 1237 match another item, do so */ 1238 ctx->u.rep->count = ctx->count; 1239 LASTMARK_SAVE(); 1240 MARK_PUSH(ctx->lastmark); 1241 /* zero-width match protection */ 1242 DATA_PUSH(&ctx->u.rep->last_ptr); 1243 ctx->u.rep->last_ptr = state->ptr; 1244 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2, 1245 ctx->u.rep->pattern+3); 1246 DATA_POP(&ctx->u.rep->last_ptr); 1247 if (ret) { 1248 MARK_POP_DISCARD(ctx->lastmark); 1249 RETURN_ON_ERROR(ret); 1250 RETURN_SUCCESS; 1251 } 1252 MARK_POP(ctx->lastmark); 1253 LASTMARK_RESTORE(); 1254 ctx->u.rep->count = ctx->count-1; 1255 state->ptr = ctx->ptr; 1256 } 1257 1258 /* cannot match more repeated items here. make sure the 1259 tail matches */ 1260 state->repeat = ctx->u.rep->prev; 1261 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern); 1262 RETURN_ON_SUCCESS(ret); 1263 state->repeat = ctx->u.rep; 1264 state->ptr = ctx->ptr; 1265 RETURN_FAILURE; 1266 1267 case SRE_OP_MIN_UNTIL: 1268 /* minimizing repeat */ 1269 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */ 1270 1271 ctx->u.rep = state->repeat; 1272 if (!ctx->u.rep) 1273 RETURN_ERROR(SRE_ERROR_STATE); 1274 1275 state->ptr = ctx->ptr; 1276 1277 ctx->count = ctx->u.rep->count+1; 1278 1279 TRACE(("|%p|%p|MIN_UNTIL %" PY_FORMAT_SIZE_T "d %p\n", ctx->pattern, 1280 ctx->ptr, ctx->count, ctx->u.rep->pattern)); 1281 1282 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) { 1283 /* not enough matches */ 1284 ctx->u.rep->count = ctx->count; 1285 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1, 1286 ctx->u.rep->pattern+3); 1287 if (ret) { 1288 RETURN_ON_ERROR(ret); 1289 RETURN_SUCCESS; 1290 } 1291 ctx->u.rep->count = ctx->count-1; 1292 state->ptr = ctx->ptr; 1293 RETURN_FAILURE; 1294 } 1295 1296 LASTMARK_SAVE(); 1297 1298 /* see if the tail matches */ 1299 state->repeat = ctx->u.rep->prev; 1300 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern); 1301 if (ret) { 1302 RETURN_ON_ERROR(ret); 1303 RETURN_SUCCESS; 1304 } 1305 1306 state->repeat = ctx->u.rep; 1307 state->ptr = ctx->ptr; 1308 1309 LASTMARK_RESTORE(); 1310 1311 if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2] 1312 && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) || 1313 state->ptr == ctx->u.rep->last_ptr) 1314 RETURN_FAILURE; 1315 1316 ctx->u.rep->count = ctx->count; 1317 /* zero-width match protection */ 1318 DATA_PUSH(&ctx->u.rep->last_ptr); 1319 ctx->u.rep->last_ptr = state->ptr; 1320 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3, 1321 ctx->u.rep->pattern+3); 1322 DATA_POP(&ctx->u.rep->last_ptr); 1323 if (ret) { 1324 RETURN_ON_ERROR(ret); 1325 RETURN_SUCCESS; 1326 } 1327 ctx->u.rep->count = ctx->count-1; 1328 state->ptr = ctx->ptr; 1329 RETURN_FAILURE; 1330 1331 case SRE_OP_GROUPREF: 1332 /* match backreference */ 1333 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern, 1334 ctx->ptr, ctx->pattern[0])); 1335 i = ctx->pattern[0]; 1336 { 1337 Py_ssize_t groupref = i+i; 1338 if (groupref >= state->lastmark) { 1339 RETURN_FAILURE; 1340 } else { 1341 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; 1342 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; 1343 if (!p || !e || e < p) 1344 RETURN_FAILURE; 1345 while (p < e) { 1346 if (ctx->ptr >= end || *ctx->ptr != *p) 1347 RETURN_FAILURE; 1348 p++; ctx->ptr++; 1349 } 1350 } 1351 } 1352 ctx->pattern++; 1353 break; 1354 1355 case SRE_OP_GROUPREF_IGNORE: 1356 /* match backreference */ 1357 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern, 1358 ctx->ptr, ctx->pattern[0])); 1359 i = ctx->pattern[0]; 1360 { 1361 Py_ssize_t groupref = i+i; 1362 if (groupref >= state->lastmark) { 1363 RETURN_FAILURE; 1364 } else { 1365 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; 1366 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; 1367 if (!p || !e || e < p) 1368 RETURN_FAILURE; 1369 while (p < e) { 1370 if (ctx->ptr >= end || 1371 state->lower(*ctx->ptr) != state->lower(*p)) 1372 RETURN_FAILURE; 1373 p++; ctx->ptr++; 1374 } 1375 } 1376 } 1377 ctx->pattern++; 1378 break; 1379 1380 case SRE_OP_GROUPREF_EXISTS: 1381 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern, 1382 ctx->ptr, ctx->pattern[0])); 1383 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */ 1384 i = ctx->pattern[0]; 1385 { 1386 Py_ssize_t groupref = i+i; 1387 if (groupref >= state->lastmark) { 1388 ctx->pattern += ctx->pattern[1]; 1389 break; 1390 } else { 1391 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; 1392 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; 1393 if (!p || !e || e < p) { 1394 ctx->pattern += ctx->pattern[1]; 1395 break; 1396 } 1397 } 1398 } 1399 ctx->pattern += 2; 1400 break; 1401 1402 case SRE_OP_ASSERT: 1403 /* assert subpattern */ 1404 /* <ASSERT> <skip> <back> <pattern> */ 1405 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern, 1406 ctx->ptr, ctx->pattern[1])); 1407 if (ctx->ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)ctx->pattern[1]) 1408 RETURN_FAILURE; 1409 state->ptr = ctx->ptr - ctx->pattern[1]; 1410 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2); 1411 RETURN_ON_FAILURE(ret); 1412 ctx->pattern += ctx->pattern[0]; 1413 break; 1414 1415 case SRE_OP_ASSERT_NOT: 1416 /* assert not subpattern */ 1417 /* <ASSERT_NOT> <skip> <back> <pattern> */ 1418 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern, 1419 ctx->ptr, ctx->pattern[1])); 1420 if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) { 1421 state->ptr = ctx->ptr - ctx->pattern[1]; 1422 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2); 1423 if (ret) { 1424 RETURN_ON_ERROR(ret); 1425 RETURN_FAILURE; 1426 } 1427 } 1428 ctx->pattern += ctx->pattern[0]; 1429 break; 1430 1431 case SRE_OP_FAILURE: 1432 /* immediate failure */ 1433 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr)); 1434 RETURN_FAILURE; 1435 1436 default: 1437 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr, 1438 ctx->pattern[-1])); 1439 RETURN_ERROR(SRE_ERROR_ILLEGAL); 1440 } 1441 } 1442 1443 exit: 1444 ctx_pos = ctx->last_ctx_pos; 1445 jump = ctx->jump; 1446 DATA_POP_DISCARD(ctx); 1447 if (ctx_pos == -1) 1448 return ret; 1449 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos); 1450 1451 switch (jump) { 1452 case JUMP_MAX_UNTIL_2: 1453 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr)); 1454 goto jump_max_until_2; 1455 case JUMP_MAX_UNTIL_3: 1456 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr)); 1457 goto jump_max_until_3; 1458 case JUMP_MIN_UNTIL_2: 1459 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr)); 1460 goto jump_min_until_2; 1461 case JUMP_MIN_UNTIL_3: 1462 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr)); 1463 goto jump_min_until_3; 1464 case JUMP_BRANCH: 1465 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr)); 1466 goto jump_branch; 1467 case JUMP_MAX_UNTIL_1: 1468 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr)); 1469 goto jump_max_until_1; 1470 case JUMP_MIN_UNTIL_1: 1471 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr)); 1472 goto jump_min_until_1; 1473 case JUMP_REPEAT: 1474 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr)); 1475 goto jump_repeat; 1476 case JUMP_REPEAT_ONE_1: 1477 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr)); 1478 goto jump_repeat_one_1; 1479 case JUMP_REPEAT_ONE_2: 1480 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr)); 1481 goto jump_repeat_one_2; 1482 case JUMP_MIN_REPEAT_ONE: 1483 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr)); 1484 goto jump_min_repeat_one; 1485 case JUMP_ASSERT: 1486 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr)); 1487 goto jump_assert; 1488 case JUMP_ASSERT_NOT: 1489 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr)); 1490 goto jump_assert_not; 1491 case JUMP_NONE: 1492 TRACE(("|%p|%p|RETURN %" PY_FORMAT_SIZE_T "d\n", ctx->pattern, 1493 ctx->ptr, ret)); 1494 break; 1495 } 1496 1497 return ret; /* should never get here */ 1498 } 1499 1500 LOCAL(Py_ssize_t) 1501 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) 1502 { 1503 SRE_CHAR* ptr = (SRE_CHAR *)state->start; 1504 SRE_CHAR* end = (SRE_CHAR *)state->end; 1505 Py_ssize_t status = 0; 1506 Py_ssize_t prefix_len = 0; 1507 Py_ssize_t prefix_skip = 0; 1508 SRE_CODE* prefix = NULL; 1509 SRE_CODE* charset = NULL; 1510 SRE_CODE* overlap = NULL; 1511 int flags = 0; 1512 1513 if (ptr > end) 1514 return 0; 1515 1516 if (pattern[0] == SRE_OP_INFO) { 1517 /* optimization info block */ 1518 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */ 1519 1520 flags = pattern[2]; 1521 1522 if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) { 1523 TRACE(("reject (got %u chars, need %u)\n", 1524 (unsigned int)(end - ptr), pattern[3])); 1525 return 0; 1526 } 1527 if (pattern[3] > 1) { 1528 /* adjust end point (but make sure we leave at least one 1529 character in there, so literal search will work) */ 1530 end -= pattern[3]-1; 1531 if (end <= ptr) 1532 end = ptr+1; 1533 } 1534 1535 if (flags & SRE_INFO_PREFIX) { 1536 /* pattern starts with a known prefix */ 1537 /* <length> <skip> <prefix data> <overlap data> */ 1538 prefix_len = pattern[5]; 1539 prefix_skip = pattern[6]; 1540 prefix = pattern + 7; 1541 overlap = prefix + prefix_len - 1; 1542 } else if (flags & SRE_INFO_CHARSET) 1543 /* pattern starts with a character from a known set */ 1544 /* <charset> */ 1545 charset = pattern + 5; 1546 1547 pattern += 1 + pattern[1]; 1548 } 1549 1550 TRACE(("prefix = %p %" PY_FORMAT_SIZE_T "d %" PY_FORMAT_SIZE_T "d\n", 1551 prefix, prefix_len, prefix_skip)); 1552 TRACE(("charset = %p\n", charset)); 1553 1554 #if defined(USE_FAST_SEARCH) 1555 if (prefix_len > 1) { 1556 /* pattern starts with a known prefix. use the overlap 1557 table to skip forward as fast as we possibly can */ 1558 Py_ssize_t i = 0; 1559 end = (SRE_CHAR *)state->end; 1560 while (ptr < end) { 1561 for (;;) { 1562 if ((SRE_CODE) ptr[0] != prefix[i]) { 1563 if (!i) 1564 break; 1565 else 1566 i = overlap[i]; 1567 } else { 1568 if (++i == prefix_len) { 1569 /* found a potential match */ 1570 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr)); 1571 state->start = ptr + 1 - prefix_len; 1572 state->ptr = ptr + 1 - prefix_len + prefix_skip; 1573 if (flags & SRE_INFO_LITERAL) 1574 return 1; /* we got all of it */ 1575 status = SRE_MATCH(state, pattern + 2*prefix_skip); 1576 if (status != 0) 1577 return status; 1578 /* close but no cigar -- try again */ 1579 i = overlap[i]; 1580 } 1581 break; 1582 } 1583 } 1584 ptr++; 1585 } 1586 return 0; 1587 } 1588 #endif 1589 1590 if (pattern[0] == SRE_OP_LITERAL) { 1591 /* pattern starts with a literal character. this is used 1592 for short prefixes, and if fast search is disabled */ 1593 SRE_CODE chr = pattern[1]; 1594 end = (SRE_CHAR *)state->end; 1595 for (;;) { 1596 while (ptr < end && (SRE_CODE) ptr[0] != chr) 1597 ptr++; 1598 if (ptr >= end) 1599 return 0; 1600 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr)); 1601 state->start = ptr; 1602 state->ptr = ++ptr; 1603 if (flags & SRE_INFO_LITERAL) 1604 return 1; /* we got all of it */ 1605 status = SRE_MATCH(state, pattern + 2); 1606 if (status != 0) 1607 break; 1608 } 1609 } else if (charset) { 1610 /* pattern starts with a character from a known set */ 1611 end = (SRE_CHAR *)state->end; 1612 for (;;) { 1613 while (ptr < end && !SRE_CHARSET(charset, ptr[0])) 1614 ptr++; 1615 if (ptr >= end) 1616 return 0; 1617 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr)); 1618 state->start = ptr; 1619 state->ptr = ptr; 1620 status = SRE_MATCH(state, pattern); 1621 if (status != 0) 1622 break; 1623 ptr++; 1624 } 1625 } else { 1626 /* general case */ 1627 assert(ptr <= end); 1628 while (1) { 1629 TRACE(("|%p|%p|SEARCH\n", pattern, ptr)); 1630 state->start = state->ptr = ptr; 1631 status = SRE_MATCH(state, pattern); 1632 if (status != 0 || ptr >= end) 1633 break; 1634 ptr++; 1635 } 1636 } 1637 1638 return status; 1639 } 1640 1641 LOCAL(int) 1642 SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len) 1643 { 1644 /* check if given string is a literal template (i.e. no escapes) */ 1645 while (len-- > 0) 1646 if (*ptr++ == '\\') 1647 return 0; 1648 return 1; 1649 } 1650 1651 #if !defined(SRE_RECURSIVE) 1652 1653 /* -------------------------------------------------------------------- */ 1654 /* factories and destructors */ 1655 1656 /* see sre.h for object declarations */ 1657 static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int); 1658 static PyObject*pattern_scanner(PatternObject*, PyObject*); 1659 1660 static PyObject * 1661 sre_codesize(PyObject* self, PyObject *unused) 1662 { 1663 return PyInt_FromSize_t(sizeof(SRE_CODE)); 1664 } 1665 1666 static PyObject * 1667 sre_getlower(PyObject* self, PyObject* args) 1668 { 1669 int character, flags; 1670 if (!PyArg_ParseTuple(args, "ii", &character, &flags)) 1671 return NULL; 1672 if (flags & SRE_FLAG_LOCALE) 1673 return Py_BuildValue("i", sre_lower_locale(character)); 1674 if (flags & SRE_FLAG_UNICODE) 1675 #if defined(HAVE_UNICODE) 1676 return Py_BuildValue("i", sre_lower_unicode(character)); 1677 #else 1678 return Py_BuildValue("i", sre_lower_locale(character)); 1679 #endif 1680 return Py_BuildValue("i", sre_lower(character)); 1681 } 1682 1683 LOCAL(void) 1684 state_reset(SRE_STATE* state) 1685 { 1686 /* FIXME: dynamic! */ 1687 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/ 1688 1689 state->lastmark = -1; 1690 state->lastindex = -1; 1691 1692 state->repeat = NULL; 1693 1694 data_stack_dealloc(state); 1695 } 1696 1697 static void* 1698 getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize) 1699 { 1700 /* given a python object, return a data pointer, a length (in 1701 characters), and a character size. return NULL if the object 1702 is not a string (or not compatible) */ 1703 1704 PyBufferProcs *buffer; 1705 Py_ssize_t size, bytes; 1706 int charsize; 1707 void* ptr; 1708 1709 #if defined(HAVE_UNICODE) 1710 if (PyUnicode_Check(string)) { 1711 /* unicode strings doesn't always support the buffer interface */ 1712 ptr = (void*) PyUnicode_AS_DATA(string); 1713 /* bytes = PyUnicode_GET_DATA_SIZE(string); */ 1714 size = PyUnicode_GET_SIZE(string); 1715 charsize = sizeof(Py_UNICODE); 1716 1717 } else { 1718 #endif 1719 1720 /* get pointer to string buffer */ 1721 buffer = Py_TYPE(string)->tp_as_buffer; 1722 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount || 1723 buffer->bf_getsegcount(string, NULL) != 1) { 1724 PyErr_SetString(PyExc_TypeError, "expected string or buffer"); 1725 return NULL; 1726 } 1727 1728 /* determine buffer size */ 1729 bytes = buffer->bf_getreadbuffer(string, 0, &ptr); 1730 if (bytes < 0) { 1731 PyErr_SetString(PyExc_TypeError, "buffer has negative size"); 1732 return NULL; 1733 } 1734 1735 /* determine character size */ 1736 #if PY_VERSION_HEX >= 0x01060000 1737 size = PyObject_Size(string); 1738 #else 1739 size = PyObject_Length(string); 1740 #endif 1741 1742 if (PyString_Check(string) || bytes == size) 1743 charsize = 1; 1744 #if defined(HAVE_UNICODE) 1745 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE))) 1746 charsize = sizeof(Py_UNICODE); 1747 #endif 1748 else { 1749 PyErr_SetString(PyExc_TypeError, "buffer size mismatch"); 1750 return NULL; 1751 } 1752 1753 #if defined(HAVE_UNICODE) 1754 } 1755 #endif 1756 1757 *p_length = size; 1758 *p_charsize = charsize; 1759 1760 return ptr; 1761 } 1762 1763 LOCAL(PyObject*) 1764 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, 1765 Py_ssize_t start, Py_ssize_t end) 1766 { 1767 /* prepare state object */ 1768 1769 Py_ssize_t length; 1770 int charsize; 1771 void* ptr; 1772 1773 memset(state, 0, sizeof(SRE_STATE)); 1774 1775 state->lastmark = -1; 1776 state->lastindex = -1; 1777 1778 ptr = getstring(string, &length, &charsize); 1779 if (!ptr) 1780 return NULL; 1781 1782 /* adjust boundaries */ 1783 if (start < 0) 1784 start = 0; 1785 else if (start > length) 1786 start = length; 1787 1788 if (end < 0) 1789 end = 0; 1790 else if (end > length) 1791 end = length; 1792 1793 state->charsize = charsize; 1794 1795 state->beginning = ptr; 1796 1797 state->start = (void*) ((char*) ptr + start * state->charsize); 1798 state->end = (void*) ((char*) ptr + end * state->charsize); 1799 1800 Py_INCREF(string); 1801 state->string = string; 1802 state->pos = start; 1803 state->endpos = end; 1804 1805 if (pattern->flags & SRE_FLAG_LOCALE) 1806 state->lower = sre_lower_locale; 1807 else if (pattern->flags & SRE_FLAG_UNICODE) 1808 #if defined(HAVE_UNICODE) 1809 state->lower = sre_lower_unicode; 1810 #else 1811 state->lower = sre_lower_locale; 1812 #endif 1813 else 1814 state->lower = sre_lower; 1815 1816 return string; 1817 } 1818 1819 LOCAL(void) 1820 state_fini(SRE_STATE* state) 1821 { 1822 Py_XDECREF(state->string); 1823 data_stack_dealloc(state); 1824 } 1825 1826 /* calculate offset from start of string */ 1827 #define STATE_OFFSET(state, member)\ 1828 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize) 1829 1830 LOCAL(PyObject*) 1831 state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty) 1832 { 1833 Py_ssize_t i, j; 1834 1835 index = (index - 1) * 2; 1836 1837 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) { 1838 if (empty) 1839 /* want empty string */ 1840 i = j = 0; 1841 else { 1842 Py_INCREF(Py_None); 1843 return Py_None; 1844 } 1845 } else { 1846 i = STATE_OFFSET(state, state->mark[index]); 1847 j = STATE_OFFSET(state, state->mark[index+1]); 1848 } 1849 1850 return PySequence_GetSlice(string, i, j); 1851 } 1852 1853 static void 1854 pattern_error(int status) 1855 { 1856 switch (status) { 1857 case SRE_ERROR_RECURSION_LIMIT: 1858 PyErr_SetString( 1859 PyExc_RuntimeError, 1860 "maximum recursion limit exceeded" 1861 ); 1862 break; 1863 case SRE_ERROR_MEMORY: 1864 PyErr_NoMemory(); 1865 break; 1866 case SRE_ERROR_INTERRUPTED: 1867 /* An exception has already been raised, so let it fly */ 1868 break; 1869 default: 1870 /* other error codes indicate compiler/engine bugs */ 1871 PyErr_SetString( 1872 PyExc_RuntimeError, 1873 "internal error in regular expression engine" 1874 ); 1875 } 1876 } 1877 1878 static void 1879 pattern_dealloc(PatternObject* self) 1880 { 1881 if (self->weakreflist != NULL) 1882 PyObject_ClearWeakRefs((PyObject *) self); 1883 Py_XDECREF(self->pattern); 1884 Py_XDECREF(self->groupindex); 1885 Py_XDECREF(self->indexgroup); 1886 PyObject_DEL(self); 1887 } 1888 1889 static int 1890 check_args_size(const char *name, PyObject* args, PyObject* kw, int n) 1891 { 1892 Py_ssize_t m = PyTuple_GET_SIZE(args) + (kw ? PyDict_Size(kw) : 0); 1893 if (m <= n) 1894 return 1; 1895 PyErr_Format(PyExc_TypeError, 1896 "%s() takes at most %d positional arguments (%zd given)", 1897 name, n, m); 1898 return 0; 1899 } 1900 1901 static PyObject* 1902 fix_string_param(PyObject *string, PyObject *string2, const char *oldname) 1903 { 1904 if (string2 != NULL) { 1905 char buf[100]; 1906 if (string != NULL) { 1907 PyErr_Format(PyExc_TypeError, 1908 "Argument given by name ('%s') and position (1)", 1909 oldname); 1910 return NULL; 1911 } 1912 sprintf(buf, "The '%s' keyword parameter name is deprecated. " 1913 "Use 'string' instead.", oldname); 1914 if (PyErr_Warn(PyExc_DeprecationWarning, buf) < 0) 1915 return NULL; 1916 return string2; 1917 } 1918 if (string == NULL) { 1919 PyErr_SetString(PyExc_TypeError, 1920 "Required argument 'string' (pos 1) not found"); 1921 return NULL; 1922 } 1923 return string; 1924 } 1925 1926 static PyObject* 1927 pattern_match(PatternObject* self, PyObject* args, PyObject* kw) 1928 { 1929 SRE_STATE state; 1930 int status; 1931 1932 PyObject *string = NULL, *string2 = NULL; 1933 Py_ssize_t start = 0; 1934 Py_ssize_t end = PY_SSIZE_T_MAX; 1935 static char* kwlist[] = { "string", "pos", "endpos", "pattern", NULL }; 1936 if (!check_args_size("match", args, kw, 3)) 1937 return NULL; 1938 1939 if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnnO:match", kwlist, 1940 &string, &start, &end, &string2)) 1941 return NULL; 1942 1943 string = fix_string_param(string, string2, "pattern"); 1944 if (!string) 1945 return NULL; 1946 1947 string = state_init(&state, self, string, start, end); 1948 if (!string) 1949 return NULL; 1950 1951 state.ptr = state.start; 1952 1953 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr)); 1954 1955 if (state.charsize == 1) { 1956 status = sre_match(&state, PatternObject_GetCode(self)); 1957 } else { 1958 #if defined(HAVE_UNICODE) 1959 status = sre_umatch(&state, PatternObject_GetCode(self)); 1960 #endif 1961 } 1962 1963 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); 1964 if (PyErr_Occurred()) 1965 return NULL; 1966 1967 state_fini(&state); 1968 1969 return pattern_new_match(self, &state, status); 1970 } 1971 1972 static PyObject* 1973 pattern_search(PatternObject* self, PyObject* args, PyObject* kw) 1974 { 1975 SRE_STATE state; 1976 int status; 1977 1978 PyObject *string = NULL, *string2 = NULL; 1979 Py_ssize_t start = 0; 1980 Py_ssize_t end = PY_SSIZE_T_MAX; 1981 static char* kwlist[] = { "string", "pos", "endpos", "pattern", NULL }; 1982 if (!check_args_size("search", args, kw, 3)) 1983 return NULL; 1984 1985 if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnnO:search", kwlist, 1986 &string, &start, &end, &string2)) 1987 return NULL; 1988 1989 string = fix_string_param(string, string2, "pattern"); 1990 if (!string) 1991 return NULL; 1992 1993 string = state_init(&state, self, string, start, end); 1994 if (!string) 1995 return NULL; 1996 1997 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr)); 1998 1999 if (state.charsize == 1) { 2000 status = sre_search(&state, PatternObject_GetCode(self)); 2001 } else { 2002 #if defined(HAVE_UNICODE) 2003 status = sre_usearch(&state, PatternObject_GetCode(self)); 2004 #endif 2005 } 2006 2007 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); 2008 2009 state_fini(&state); 2010 2011 if (PyErr_Occurred()) 2012 return NULL; 2013 2014 return pattern_new_match(self, &state, status); 2015 } 2016 2017 static PyObject* 2018 call(char* module, char* function, PyObject* args) 2019 { 2020 PyObject* name; 2021 PyObject* mod; 2022 PyObject* func; 2023 PyObject* result; 2024 2025 if (!args) 2026 return NULL; 2027 name = PyString_FromString(module); 2028 if (!name) 2029 return NULL; 2030 mod = PyImport_Import(name); 2031 Py_DECREF(name); 2032 if (!mod) 2033 return NULL; 2034 func = PyObject_GetAttrString(mod, function); 2035 Py_DECREF(mod); 2036 if (!func) 2037 return NULL; 2038 result = PyObject_CallObject(func, args); 2039 Py_DECREF(func); 2040 Py_DECREF(args); 2041 return result; 2042 } 2043 2044 #ifdef USE_BUILTIN_COPY 2045 static int 2046 deepcopy(PyObject** object, PyObject* memo) 2047 { 2048 PyObject* copy; 2049 2050 copy = call( 2051 "copy", "deepcopy", 2052 PyTuple_Pack(2, *object, memo) 2053 ); 2054 if (!copy) 2055 return 0; 2056 2057 Py_SETREF(*object, copy); 2058 2059 return 1; /* success */ 2060 } 2061 #endif 2062 2063 static PyObject* 2064 join_list(PyObject* list, PyObject* string) 2065 { 2066 /* join list elements */ 2067 2068 PyObject* joiner; 2069 #if PY_VERSION_HEX >= 0x01060000 2070 PyObject* function; 2071 PyObject* args; 2072 #endif 2073 PyObject* result; 2074 2075 joiner = PySequence_GetSlice(string, 0, 0); 2076 if (!joiner) 2077 return NULL; 2078 2079 if (PyList_GET_SIZE(list) == 0) { 2080 Py_DECREF(list); 2081 return joiner; 2082 } 2083 2084 #if PY_VERSION_HEX >= 0x01060000 2085 function = PyObject_GetAttrString(joiner, "join"); 2086 if (!function) { 2087 Py_DECREF(joiner); 2088 return NULL; 2089 } 2090 args = PyTuple_New(1); 2091 if (!args) { 2092 Py_DECREF(function); 2093 Py_DECREF(joiner); 2094 return NULL; 2095 } 2096 PyTuple_SET_ITEM(args, 0, list); 2097 result = PyObject_CallObject(function, args); 2098 Py_DECREF(args); /* also removes list */ 2099 Py_DECREF(function); 2100 #else 2101 result = call( 2102 "string", "join", 2103 PyTuple_Pack(2, list, joiner) 2104 ); 2105 #endif 2106 Py_DECREF(joiner); 2107 2108 return result; 2109 } 2110 2111 static PyObject* 2112 pattern_findall(PatternObject* self, PyObject* args, PyObject* kw) 2113 { 2114 SRE_STATE state; 2115 PyObject* list; 2116 int status; 2117 Py_ssize_t i, b, e; 2118 2119 PyObject *string = NULL, *string2 = NULL; 2120 Py_ssize_t start = 0; 2121 Py_ssize_t end = PY_SSIZE_T_MAX; 2122 static char* kwlist[] = { "string", "pos", "endpos", "source", NULL }; 2123 if (!check_args_size("findall", args, kw, 3)) 2124 return NULL; 2125 2126 if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnnO:findall", kwlist, 2127 &string, &start, &end, &string2)) 2128 return NULL; 2129 2130 string = fix_string_param(string, string2, "source"); 2131 if (!string) 2132 return NULL; 2133 2134 string = state_init(&state, self, string, start, end); 2135 if (!string) 2136 return NULL; 2137 2138 list = PyList_New(0); 2139 if (!list) { 2140 state_fini(&state); 2141 return NULL; 2142 } 2143 2144 while (state.start <= state.end) { 2145 2146 PyObject* item; 2147 2148 state_reset(&state); 2149 2150 state.ptr = state.start; 2151 2152 if (state.charsize == 1) { 2153 status = sre_search(&state, PatternObject_GetCode(self)); 2154 } else { 2155 #if defined(HAVE_UNICODE) 2156 status = sre_usearch(&state, PatternObject_GetCode(self)); 2157 #endif 2158 } 2159 2160 if (PyErr_Occurred()) 2161 goto error; 2162 2163 if (status <= 0) { 2164 if (status == 0) 2165 break; 2166 pattern_error(status); 2167 goto error; 2168 } 2169 2170 /* don't bother to build a match object */ 2171 switch (self->groups) { 2172 case 0: 2173 b = STATE_OFFSET(&state, state.start); 2174 e = STATE_OFFSET(&state, state.ptr); 2175 item = PySequence_GetSlice(string, b, e); 2176 if (!item) 2177 goto error; 2178 break; 2179 case 1: 2180 item = state_getslice(&state, 1, string, 1); 2181 if (!item) 2182 goto error; 2183 break; 2184 default: 2185 item = PyTuple_New(self->groups); 2186 if (!item) 2187 goto error; 2188 for (i = 0; i < self->groups; i++) { 2189 PyObject* o = state_getslice(&state, i+1, string, 1); 2190 if (!o) { 2191 Py_DECREF(item); 2192 goto error; 2193 } 2194 PyTuple_SET_ITEM(item, i, o); 2195 } 2196 break; 2197 } 2198 2199 status = PyList_Append(list, item); 2200 Py_DECREF(item); 2201 if (status < 0) 2202 goto error; 2203 2204 if (state.ptr == state.start) 2205 state.start = (void*) ((char*) state.ptr + state.charsize); 2206 else 2207 state.start = state.ptr; 2208 2209 } 2210 2211 state_fini(&state); 2212 return list; 2213 2214 error: 2215 Py_DECREF(list); 2216 state_fini(&state); 2217 return NULL; 2218 2219 } 2220 2221 #if PY_VERSION_HEX >= 0x02020000 2222 static PyObject* 2223 pattern_finditer(PatternObject* pattern, PyObject* args) 2224 { 2225 PyObject* scanner; 2226 PyObject* search; 2227 PyObject* iterator; 2228 2229 scanner = pattern_scanner(pattern, args); 2230 if (!scanner) 2231 return NULL; 2232 2233 search = PyObject_GetAttrString(scanner, "search"); 2234 Py_DECREF(scanner); 2235 if (!search) 2236 return NULL; 2237 2238 iterator = PyCallIter_New(search, Py_None); 2239 Py_DECREF(search); 2240 2241 return iterator; 2242 } 2243 #endif 2244 2245 static PyObject* 2246 pattern_split(PatternObject* self, PyObject* args, PyObject* kw) 2247 { 2248 SRE_STATE state; 2249 PyObject* list; 2250 PyObject* item; 2251 int status; 2252 Py_ssize_t n; 2253 Py_ssize_t i; 2254 void* last; 2255 2256 PyObject *string = NULL, *string2 = NULL; 2257 Py_ssize_t maxsplit = 0; 2258 static char* kwlist[] = { "string", "maxsplit", "source", NULL }; 2259 if (!check_args_size("split", args, kw, 2)) 2260 return NULL; 2261 2262 if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnO:split", kwlist, 2263 &string, &maxsplit, &string2)) 2264 return NULL; 2265 2266 string = fix_string_param(string, string2, "source"); 2267 if (!string) 2268 return NULL; 2269 2270 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX); 2271 if (!string) 2272 return NULL; 2273 2274 list = PyList_New(0); 2275 if (!list) { 2276 state_fini(&state); 2277 return NULL; 2278 } 2279 2280 n = 0; 2281 last = state.start; 2282 2283 while (!maxsplit || n < maxsplit) { 2284 2285 state_reset(&state); 2286 2287 state.ptr = state.start; 2288 2289 if (state.charsize == 1) { 2290 status = sre_search(&state, PatternObject_GetCode(self)); 2291 } else { 2292 #if defined(HAVE_UNICODE) 2293 status = sre_usearch(&state, PatternObject_GetCode(self)); 2294 #endif 2295 } 2296 2297 if (PyErr_Occurred()) 2298 goto error; 2299 2300 if (status <= 0) { 2301 if (status == 0) 2302 break; 2303 pattern_error(status); 2304 goto error; 2305 } 2306 2307 if (state.start == state.ptr) { 2308 if (last == state.end || state.ptr == state.end) 2309 break; 2310 /* skip one character */ 2311 state.start = (void*) ((char*) state.ptr + state.charsize); 2312 continue; 2313 } 2314 2315 /* get segment before this match */ 2316 item = PySequence_GetSlice( 2317 string, STATE_OFFSET(&state, last), 2318 STATE_OFFSET(&state, state.start) 2319 ); 2320 if (!item) 2321 goto error; 2322 status = PyList_Append(list, item); 2323 Py_DECREF(item); 2324 if (status < 0) 2325 goto error; 2326 2327 /* add groups (if any) */ 2328 for (i = 0; i < self->groups; i++) { 2329 item = state_getslice(&state, i+1, string, 0); 2330 if (!item) 2331 goto error; 2332 status = PyList_Append(list, item); 2333 Py_DECREF(item); 2334 if (status < 0) 2335 goto error; 2336 } 2337 2338 n = n + 1; 2339 2340 last = state.start = state.ptr; 2341 2342 } 2343 2344 /* get segment following last match (even if empty) */ 2345 item = PySequence_GetSlice( 2346 string, STATE_OFFSET(&state, last), state.endpos 2347 ); 2348 if (!item) 2349 goto error; 2350 status = PyList_Append(list, item); 2351 Py_DECREF(item); 2352 if (status < 0) 2353 goto error; 2354 2355 state_fini(&state); 2356 return list; 2357 2358 error: 2359 Py_DECREF(list); 2360 state_fini(&state); 2361 return NULL; 2362 2363 } 2364 2365 static PyObject* 2366 pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string, 2367 Py_ssize_t count, Py_ssize_t subn) 2368 { 2369 SRE_STATE state; 2370 PyObject* list; 2371 PyObject* item; 2372 PyObject* filter; 2373 PyObject* args; 2374 PyObject* match; 2375 void* ptr; 2376 int status; 2377 Py_ssize_t n; 2378 Py_ssize_t i, b, e; 2379 int bint; 2380 int filter_is_callable; 2381 2382 if (PyCallable_Check(ptemplate)) { 2383 /* sub/subn takes either a function or a template */ 2384 filter = ptemplate; 2385 Py_INCREF(filter); 2386 filter_is_callable = 1; 2387 } else { 2388 /* if not callable, check if it's a literal string */ 2389 int literal; 2390 ptr = getstring(ptemplate, &n, &bint); 2391 b = bint; 2392 if (ptr) { 2393 if (b == 1) { 2394 literal = sre_literal_template((unsigned char *)ptr, n); 2395 } else { 2396 #if defined(HAVE_UNICODE) 2397 literal = sre_uliteral_template((Py_UNICODE *)ptr, n); 2398 #endif 2399 } 2400 } else { 2401 PyErr_Clear(); 2402 literal = 0; 2403 } 2404 if (literal) { 2405 filter = ptemplate; 2406 Py_INCREF(filter); 2407 filter_is_callable = 0; 2408 } else { 2409 /* not a literal; hand it over to the template compiler */ 2410 filter = call( 2411 SRE_PY_MODULE, "_subx", 2412 PyTuple_Pack(2, self, ptemplate) 2413 ); 2414 if (!filter) 2415 return NULL; 2416 filter_is_callable = PyCallable_Check(filter); 2417 } 2418 } 2419 2420 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX); 2421 if (!string) { 2422 Py_DECREF(filter); 2423 return NULL; 2424 } 2425 2426 list = PyList_New(0); 2427 if (!list) { 2428 Py_DECREF(filter); 2429 state_fini(&state); 2430 return NULL; 2431 } 2432 2433 n = i = 0; 2434 2435 while (!count || n < count) { 2436 2437 state_reset(&state); 2438 2439 state.ptr = state.start; 2440 2441 if (state.charsize == 1) { 2442 status = sre_search(&state, PatternObject_GetCode(self)); 2443 } else { 2444 #if defined(HAVE_UNICODE) 2445 status = sre_usearch(&state, PatternObject_GetCode(self)); 2446 #endif 2447 } 2448 2449 if (PyErr_Occurred()) 2450 goto error; 2451 2452 if (status <= 0) { 2453 if (status == 0) 2454 break; 2455 pattern_error(status); 2456 goto error; 2457 } 2458 2459 b = STATE_OFFSET(&state, state.start); 2460 e = STATE_OFFSET(&state, state.ptr); 2461 2462 if (i < b) { 2463 /* get segment before this match */ 2464 item = PySequence_GetSlice(string, i, b); 2465 if (!item) 2466 goto error; 2467 status = PyList_Append(list, item); 2468 Py_DECREF(item); 2469 if (status < 0) 2470 goto error; 2471 2472 } else if (i == b && i == e && n > 0) 2473 /* ignore empty match on latest position */ 2474 goto next; 2475 2476 if (filter_is_callable) { 2477 /* pass match object through filter */ 2478 match = pattern_new_match(self, &state, 1); 2479 if (!match) 2480 goto error; 2481 args = PyTuple_Pack(1, match); 2482 if (!args) { 2483 Py_DECREF(match); 2484 goto error; 2485 } 2486 item = PyObject_CallObject(filter, args); 2487 Py_DECREF(args); 2488 Py_DECREF(match); 2489 if (!item) 2490 goto error; 2491 } else { 2492 /* filter is literal string */ 2493 item = filter; 2494 Py_INCREF(item); 2495 } 2496 2497 /* add to list */ 2498 if (item != Py_None) { 2499 status = PyList_Append(list, item); 2500 Py_DECREF(item); 2501 if (status < 0) 2502 goto error; 2503 } 2504 2505 i = e; 2506 n = n + 1; 2507 2508 next: 2509 /* move on */ 2510 if (state.ptr == state.end) 2511 break; 2512 if (state.ptr == state.start) 2513 state.start = (void*) ((char*) state.ptr + state.charsize); 2514 else 2515 state.start = state.ptr; 2516 2517 } 2518 2519 /* get segment following last match */ 2520 if (i < state.endpos) { 2521 item = PySequence_GetSlice(string, i, state.endpos); 2522 if (!item) 2523 goto error; 2524 status = PyList_Append(list, item); 2525 Py_DECREF(item); 2526 if (status < 0) 2527 goto error; 2528 } 2529 2530 state_fini(&state); 2531 2532 Py_DECREF(filter); 2533 2534 /* convert list to single string (also removes list) */ 2535 item = join_list(list, string); 2536 2537 if (!item) 2538 return NULL; 2539 2540 if (subn) 2541 return Py_BuildValue("Nn", item, n); 2542 2543 return item; 2544 2545 error: 2546 Py_DECREF(list); 2547 state_fini(&state); 2548 Py_DECREF(filter); 2549 return NULL; 2550 2551 } 2552 2553 static PyObject* 2554 pattern_sub(PatternObject* self, PyObject* args, PyObject* kw) 2555 { 2556 PyObject* ptemplate; 2557 PyObject* string; 2558 Py_ssize_t count = 0; 2559 static char* kwlist[] = { "repl", "string", "count", NULL }; 2560 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist, 2561 &ptemplate, &string, &count)) 2562 return NULL; 2563 2564 return pattern_subx(self, ptemplate, string, count, 0); 2565 } 2566 2567 static PyObject* 2568 pattern_subn(PatternObject* self, PyObject* args, PyObject* kw) 2569 { 2570 PyObject* ptemplate; 2571 PyObject* string; 2572 Py_ssize_t count = 0; 2573 static char* kwlist[] = { "repl", "string", "count", NULL }; 2574 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist, 2575 &ptemplate, &string, &count)) 2576 return NULL; 2577 2578 return pattern_subx(self, ptemplate, string, count, 1); 2579 } 2580 2581 static PyObject* 2582 pattern_copy(PatternObject* self, PyObject *unused) 2583 { 2584 #ifdef USE_BUILTIN_COPY 2585 PatternObject* copy; 2586 int offset; 2587 2588 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize); 2589 if (!copy) 2590 return NULL; 2591 2592 offset = offsetof(PatternObject, groups); 2593 2594 Py_XINCREF(self->groupindex); 2595 Py_XINCREF(self->indexgroup); 2596 Py_XINCREF(self->pattern); 2597 2598 memcpy((char*) copy + offset, (char*) self + offset, 2599 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset); 2600 copy->weakreflist = NULL; 2601 2602 return (PyObject*) copy; 2603 #else 2604 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object"); 2605 return NULL; 2606 #endif 2607 } 2608 2609 static PyObject* 2610 pattern_deepcopy(PatternObject* self, PyObject* memo) 2611 { 2612 #ifdef USE_BUILTIN_COPY 2613 PatternObject* copy; 2614 2615 copy = (PatternObject*) pattern_copy(self); 2616 if (!copy) 2617 return NULL; 2618 2619 if (!deepcopy(©->groupindex, memo) || 2620 !deepcopy(©->indexgroup, memo) || 2621 !deepcopy(©->pattern, memo)) { 2622 Py_DECREF(copy); 2623 return NULL; 2624 } 2625 2626 #else 2627 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object"); 2628 return NULL; 2629 #endif 2630 } 2631 2632 PyDoc_STRVAR(pattern_match_doc, 2633 "match(string[, pos[, endpos]]) --> match object or None.\n\ 2634 Matches zero or more characters at the beginning of the string"); 2635 2636 PyDoc_STRVAR(pattern_search_doc, 2637 "search(string[, pos[, endpos]]) --> match object or None.\n\ 2638 Scan through string looking for a match, and return a corresponding\n\ 2639 match object instance. Return None if no position in the string matches."); 2640 2641 PyDoc_STRVAR(pattern_split_doc, 2642 "split(string[, maxsplit = 0]) --> list.\n\ 2643 Split string by the occurrences of pattern."); 2644 2645 PyDoc_STRVAR(pattern_findall_doc, 2646 "findall(string[, pos[, endpos]]) --> list.\n\ 2647 Return a list of all non-overlapping matches of pattern in string."); 2648 2649 PyDoc_STRVAR(pattern_finditer_doc, 2650 "finditer(string[, pos[, endpos]]) --> iterator.\n\ 2651 Return an iterator over all non-overlapping matches for the \n\ 2652 RE pattern in string. For each match, the iterator returns a\n\ 2653 match object."); 2654 2655 PyDoc_STRVAR(pattern_sub_doc, 2656 "sub(repl, string[, count = 0]) --> newstring\n\ 2657 Return the string obtained by replacing the leftmost non-overlapping\n\ 2658 occurrences of pattern in string by the replacement repl."); 2659 2660 PyDoc_STRVAR(pattern_subn_doc, 2661 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\ 2662 Return the tuple (new_string, number_of_subs_made) found by replacing\n\ 2663 the leftmost non-overlapping occurrences of pattern with the\n\ 2664 replacement repl."); 2665 2666 PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects"); 2667 2668 static PyMethodDef pattern_methods[] = { 2669 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS, 2670 pattern_match_doc}, 2671 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS, 2672 pattern_search_doc}, 2673 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS, 2674 pattern_sub_doc}, 2675 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS, 2676 pattern_subn_doc}, 2677 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS, 2678 pattern_split_doc}, 2679 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS, 2680 pattern_findall_doc}, 2681 #if PY_VERSION_HEX >= 0x02020000 2682 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS, 2683 pattern_finditer_doc}, 2684 #endif 2685 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS}, 2686 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS}, 2687 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O}, 2688 {NULL, NULL} 2689 }; 2690 2691 #define PAT_OFF(x) offsetof(PatternObject, x) 2692 static PyMemberDef pattern_members[] = { 2693 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY}, 2694 {"flags", T_INT, PAT_OFF(flags), READONLY}, 2695 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY}, 2696 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY}, 2697 {NULL} /* Sentinel */ 2698 }; 2699 2700 statichere PyTypeObject Pattern_Type = { 2701 PyObject_HEAD_INIT(NULL) 2702 0, "_" SRE_MODULE ".SRE_Pattern", 2703 sizeof(PatternObject), sizeof(SRE_CODE), 2704 (destructor)pattern_dealloc, /*tp_dealloc*/ 2705 0, /* tp_print */ 2706 0, /* tp_getattrn */ 2707 0, /* tp_setattr */ 2708 0, /* tp_compare */ 2709 0, /* tp_repr */ 2710 0, /* tp_as_number */ 2711 0, /* tp_as_sequence */ 2712 0, /* tp_as_mapping */ 2713 0, /* tp_hash */ 2714 0, /* tp_call */ 2715 0, /* tp_str */ 2716 0, /* tp_getattro */ 2717 0, /* tp_setattro */ 2718 0, /* tp_as_buffer */ 2719 Py_TPFLAGS_DEFAULT, /* tp_flags */ 2720 pattern_doc, /* tp_doc */ 2721 0, /* tp_traverse */ 2722 0, /* tp_clear */ 2723 0, /* tp_richcompare */ 2724 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */ 2725 0, /* tp_iter */ 2726 0, /* tp_iternext */ 2727 pattern_methods, /* tp_methods */ 2728 pattern_members, /* tp_members */ 2729 }; 2730 2731 static int _validate(PatternObject *self); /* Forward */ 2732 2733 static PyObject * 2734 _compile(PyObject* self_, PyObject* args) 2735 { 2736 /* "compile" pattern descriptor to pattern object */ 2737 2738 PatternObject* self; 2739 Py_ssize_t i, n; 2740 2741 PyObject* pattern; 2742 int flags = 0; 2743 PyObject* code; 2744 Py_ssize_t groups = 0; 2745 PyObject* groupindex = NULL; 2746 PyObject* indexgroup = NULL; 2747 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags, 2748 &PyList_Type, &code, &groups, 2749 &groupindex, &indexgroup)) 2750 return NULL; 2751 2752 n = PyList_GET_SIZE(code); 2753 /* coverity[ampersand_in_size] */ 2754 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n); 2755 if (!self) 2756 return NULL; 2757 self->weakreflist = NULL; 2758 self->pattern = NULL; 2759 self->groupindex = NULL; 2760 self->indexgroup = NULL; 2761 2762 self->codesize = n; 2763 2764 for (i = 0; i < n; i++) { 2765 PyObject *o = PyList_GET_ITEM(code, i); 2766 unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o) 2767 : PyLong_AsUnsignedLong(o); 2768 if (value == (unsigned long)-1 && PyErr_Occurred()) { 2769 if (PyErr_ExceptionMatches(PyExc_OverflowError)) { 2770 PyErr_SetString(PyExc_OverflowError, 2771 "regular expression code size limit exceeded"); 2772 } 2773 break; 2774 } 2775 self->code[i] = (SRE_CODE) value; 2776 if ((unsigned long) self->code[i] != value) { 2777 PyErr_SetString(PyExc_OverflowError, 2778 "regular expression code size limit exceeded"); 2779 break; 2780 } 2781 } 2782 2783 if (PyErr_Occurred()) { 2784 Py_DECREF(self); 2785 return NULL; 2786 } 2787 2788 Py_INCREF(pattern); 2789 self->pattern = pattern; 2790 2791 self->flags = flags; 2792 2793 self->groups = groups; 2794 2795 Py_XINCREF(groupindex); 2796 self->groupindex = groupindex; 2797 2798 Py_XINCREF(indexgroup); 2799 self->indexgroup = indexgroup; 2800 2801 self->weakreflist = NULL; 2802 2803 if (!_validate(self)) { 2804 Py_DECREF(self); 2805 return NULL; 2806 } 2807 2808 return (PyObject*) self; 2809 } 2810 2811 /* -------------------------------------------------------------------- */ 2812 /* Code validation */ 2813 2814 /* To learn more about this code, have a look at the _compile() function in 2815 Lib/sre_compile.py. The validation functions below checks the code array 2816 for conformance with the code patterns generated there. 2817 2818 The nice thing about the generated code is that it is position-independent: 2819 all jumps are relative jumps forward. Also, jumps don't cross each other: 2820 the target of a later jump is always earlier than the target of an earlier 2821 jump. IOW, this is okay: 2822 2823 J---------J-------T--------T 2824 \ \_____/ / 2825 \______________________/ 2826 2827 but this is not: 2828 2829 J---------J-------T--------T 2830 \_________\_____/ / 2831 \____________/ 2832 2833 It also helps that SRE_CODE is always an unsigned type. 2834 */ 2835 2836 /* Defining this one enables tracing of the validator */ 2837 #undef VVERBOSE 2838 2839 /* Trace macro for the validator */ 2840 #if defined(VVERBOSE) 2841 #define VTRACE(v) printf v 2842 #else 2843 #define VTRACE(v) do {} while(0) /* do nothing */ 2844 #endif 2845 2846 /* Report failure */ 2847 #define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0) 2848 2849 /* Extract opcode, argument, or skip count from code array */ 2850 #define GET_OP \ 2851 do { \ 2852 VTRACE(("%p: ", code)); \ 2853 if (code >= end) FAIL; \ 2854 op = *code++; \ 2855 VTRACE(("%lu (op)\n", (unsigned long)op)); \ 2856 } while (0) 2857 #define GET_ARG \ 2858 do { \ 2859 VTRACE(("%p= ", code)); \ 2860 if (code >= end) FAIL; \ 2861 arg = *code++; \ 2862 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \ 2863 } while (0) 2864 #define GET_SKIP_ADJ(adj) \ 2865 do { \ 2866 VTRACE(("%p= ", code)); \ 2867 if (code >= end) FAIL; \ 2868 skip = *code; \ 2869 VTRACE(("%lu (skip to %p)\n", \ 2870 (unsigned long)skip, code+skip)); \ 2871 if (skip-adj > end-code) \ 2872 FAIL; \ 2873 code++; \ 2874 } while (0) 2875 #define GET_SKIP GET_SKIP_ADJ(0) 2876 2877 static int 2878 _validate_charset(SRE_CODE *code, SRE_CODE *end) 2879 { 2880 /* Some variables are manipulated by the macros above */ 2881 SRE_CODE op; 2882 SRE_CODE arg; 2883 SRE_CODE offset; 2884 int i; 2885 2886 while (code < end) { 2887 GET_OP; 2888 switch (op) { 2889 2890 case SRE_OP_NEGATE: 2891 break; 2892 2893 case SRE_OP_LITERAL: 2894 GET_ARG; 2895 break; 2896 2897 case SRE_OP_RANGE: 2898 GET_ARG; 2899 GET_ARG; 2900 break; 2901 2902 case SRE_OP_CHARSET: 2903 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */ 2904 if (offset > end-code) 2905 FAIL; 2906 code += offset; 2907 break; 2908 2909 case SRE_OP_BIGCHARSET: 2910 GET_ARG; /* Number of blocks */ 2911 offset = 256/sizeof(SRE_CODE); /* 256-byte table */ 2912 if (offset > end-code) 2913 FAIL; 2914 /* Make sure that each byte points to a valid block */ 2915 for (i = 0; i < 256; i++) { 2916 if (((unsigned char *)code)[i] >= arg) 2917 FAIL; 2918 } 2919 code += offset; 2920 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */ 2921 if (offset > end-code) 2922 FAIL; 2923 code += offset; 2924 break; 2925 2926 case SRE_OP_CATEGORY: 2927 GET_ARG; 2928 switch (arg) { 2929 case SRE_CATEGORY_DIGIT: 2930 case SRE_CATEGORY_NOT_DIGIT: 2931 case SRE_CATEGORY_SPACE: 2932 case SRE_CATEGORY_NOT_SPACE: 2933 case SRE_CATEGORY_WORD: 2934 case SRE_CATEGORY_NOT_WORD: 2935 case SRE_CATEGORY_LINEBREAK: 2936 case SRE_CATEGORY_NOT_LINEBREAK: 2937 case SRE_CATEGORY_LOC_WORD: 2938 case SRE_CATEGORY_LOC_NOT_WORD: 2939 case SRE_CATEGORY_UNI_DIGIT: 2940 case SRE_CATEGORY_UNI_NOT_DIGIT: 2941 case SRE_CATEGORY_UNI_SPACE: 2942 case SRE_CATEGORY_UNI_NOT_SPACE: 2943 case SRE_CATEGORY_UNI_WORD: 2944 case SRE_CATEGORY_UNI_NOT_WORD: 2945 case SRE_CATEGORY_UNI_LINEBREAK: 2946 case SRE_CATEGORY_UNI_NOT_LINEBREAK: 2947 break; 2948 default: 2949 FAIL; 2950 } 2951 break; 2952 2953 default: 2954 FAIL; 2955 2956 } 2957 } 2958 2959 return 1; 2960 } 2961 2962 static int 2963 _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) 2964 { 2965 /* Some variables are manipulated by the macros above */ 2966 SRE_CODE op; 2967 SRE_CODE arg; 2968 SRE_CODE skip; 2969 2970 VTRACE(("code=%p, end=%p\n", code, end)); 2971 2972 if (code > end) 2973 FAIL; 2974 2975 while (code < end) { 2976 GET_OP; 2977 switch (op) { 2978 2979 case SRE_OP_MARK: 2980 /* We don't check whether marks are properly nested; the 2981 sre_match() code is robust even if they don't, and the worst 2982 you can get is nonsensical match results. */ 2983 GET_ARG; 2984 if (arg > 2*groups+1) { 2985 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups)); 2986 FAIL; 2987 } 2988 break; 2989 2990 case SRE_OP_LITERAL: 2991 case SRE_OP_NOT_LITERAL: 2992 case SRE_OP_LITERAL_IGNORE: 2993 case SRE_OP_NOT_LITERAL_IGNORE: 2994 GET_ARG; 2995 /* The arg is just a character, nothing to check */ 2996 break; 2997 2998 case SRE_OP_SUCCESS: 2999 case SRE_OP_FAILURE: 3000 /* Nothing to check; these normally end the matching process */ 3001 break; 3002 3003 case SRE_OP_AT: 3004 GET_ARG; 3005 switch (arg) { 3006 case SRE_AT_BEGINNING: 3007 case SRE_AT_BEGINNING_STRING: 3008 case SRE_AT_BEGINNING_LINE: 3009 case SRE_AT_END: 3010 case SRE_AT_END_LINE: 3011 case SRE_AT_END_STRING: 3012 case SRE_AT_BOUNDARY: 3013 case SRE_AT_NON_BOUNDARY: 3014 case SRE_AT_LOC_BOUNDARY: 3015 case SRE_AT_LOC_NON_BOUNDARY: 3016 case SRE_AT_UNI_BOUNDARY: 3017 case SRE_AT_UNI_NON_BOUNDARY: 3018 break; 3019 default: 3020 FAIL; 3021 } 3022 break; 3023 3024 case SRE_OP_ANY: 3025 case SRE_OP_ANY_ALL: 3026 /* These have no operands */ 3027 break; 3028 3029 case SRE_OP_IN: 3030 case SRE_OP_IN_IGNORE: 3031 GET_SKIP; 3032 /* Stop 1 before the end; we check the FAILURE below */ 3033 if (!_validate_charset(code, code+skip-2)) 3034 FAIL; 3035 if (code[skip-2] != SRE_OP_FAILURE) 3036 FAIL; 3037 code += skip-1; 3038 break; 3039 3040 case SRE_OP_INFO: 3041 { 3042 /* A minimal info field is 3043 <INFO> <1=skip> <2=flags> <3=min> <4=max>; 3044 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags, 3045 more follows. */ 3046 SRE_CODE flags, i; 3047 SRE_CODE *newcode; 3048 GET_SKIP; 3049 newcode = code+skip-1; 3050 GET_ARG; flags = arg; 3051 GET_ARG; /* min */ 3052 GET_ARG; /* max */ 3053 /* Check that only valid flags are present */ 3054 if ((flags & ~(SRE_INFO_PREFIX | 3055 SRE_INFO_LITERAL | 3056 SRE_INFO_CHARSET)) != 0) 3057 FAIL; 3058 /* PREFIX and CHARSET are mutually exclusive */ 3059 if ((flags & SRE_INFO_PREFIX) && 3060 (flags & SRE_INFO_CHARSET)) 3061 FAIL; 3062 /* LITERAL implies PREFIX */ 3063 if ((flags & SRE_INFO_LITERAL) && 3064 !(flags & SRE_INFO_PREFIX)) 3065 FAIL; 3066 /* Validate the prefix */ 3067 if (flags & SRE_INFO_PREFIX) { 3068 SRE_CODE prefix_len; 3069 GET_ARG; prefix_len = arg; 3070 GET_ARG; /* prefix skip */ 3071 /* Here comes the prefix string */ 3072 if (prefix_len > newcode-code) 3073 FAIL; 3074 code += prefix_len; 3075 /* And here comes the overlap table */ 3076 if (prefix_len > newcode-code) 3077 FAIL; 3078 /* Each overlap value should be < prefix_len */ 3079 for (i = 0; i < prefix_len; i++) { 3080 if (code[i] >= prefix_len) 3081 FAIL; 3082 } 3083 code += prefix_len; 3084 } 3085 /* Validate the charset */ 3086 if (flags & SRE_INFO_CHARSET) { 3087 if (!_validate_charset(code, newcode-1)) 3088 FAIL; 3089 if (newcode[-1] != SRE_OP_FAILURE) 3090 FAIL; 3091 code = newcode; 3092 } 3093 else if (code != newcode) { 3094 VTRACE(("code=%p, newcode=%p\n", code, newcode)); 3095 FAIL; 3096 } 3097 } 3098 break; 3099 3100 case SRE_OP_BRANCH: 3101 { 3102 SRE_CODE *target = NULL; 3103 for (;;) { 3104 GET_SKIP; 3105 if (skip == 0) 3106 break; 3107 /* Stop 2 before the end; we check the JUMP below */ 3108 if (!_validate_inner(code, code+skip-3, groups)) 3109 FAIL; 3110 code += skip-3; 3111 /* Check that it ends with a JUMP, and that each JUMP 3112 has the same target */ 3113 GET_OP; 3114 if (op != SRE_OP_JUMP) 3115 FAIL; 3116 GET_SKIP; 3117 if (target == NULL) 3118 target = code+skip-1; 3119 else if (code+skip-1 != target) 3120 FAIL; 3121 } 3122 } 3123 break; 3124 3125 case SRE_OP_REPEAT_ONE: 3126 case SRE_OP_MIN_REPEAT_ONE: 3127 { 3128 SRE_CODE min, max; 3129 GET_SKIP; 3130 GET_ARG; min = arg; 3131 GET_ARG; max = arg; 3132 if (min > max) 3133 FAIL; 3134 if (max > SRE_MAXREPEAT) 3135 FAIL; 3136 if (!_validate_inner(code, code+skip-4, groups)) 3137 FAIL; 3138 code += skip-4; 3139 GET_OP; 3140 if (op != SRE_OP_SUCCESS) 3141 FAIL; 3142 } 3143 break; 3144 3145 case SRE_OP_REPEAT: 3146 { 3147 SRE_CODE min, max; 3148 GET_SKIP; 3149 GET_ARG; min = arg; 3150 GET_ARG; max = arg; 3151 if (min > max) 3152 FAIL; 3153 if (max > SRE_MAXREPEAT) 3154 FAIL; 3155 if (!_validate_inner(code, code+skip-3, groups)) 3156 FAIL; 3157 code += skip-3; 3158 GET_OP; 3159 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL) 3160 FAIL; 3161 } 3162 break; 3163 3164 case SRE_OP_GROUPREF: 3165 case SRE_OP_GROUPREF_IGNORE: 3166 GET_ARG; 3167 if (arg >= groups) 3168 FAIL; 3169 break; 3170 3171 case SRE_OP_GROUPREF_EXISTS: 3172 /* The regex syntax for this is: '(?(group)then|else)', where 3173 'group' is either an integer group number or a group name, 3174 'then' and 'else' are sub-regexes, and 'else' is optional. */ 3175 GET_ARG; 3176 if (arg >= groups) 3177 FAIL; 3178 GET_SKIP_ADJ(1); 3179 code--; /* The skip is relative to the first arg! */ 3180 /* There are two possibilities here: if there is both a 'then' 3181 part and an 'else' part, the generated code looks like: 3182 3183 GROUPREF_EXISTS 3184 <group> 3185 <skipyes> 3186 ...then part... 3187 JUMP 3188 <skipno> 3189 (<skipyes> jumps here) 3190 ...else part... 3191 (<skipno> jumps here) 3192 3193 If there is only a 'then' part, it looks like: 3194 3195 GROUPREF_EXISTS 3196 <group> 3197 <skip> 3198 ...then part... 3199 (<skip> jumps here) 3200 3201 There is no direct way to decide which it is, and we don't want 3202 to allow arbitrary jumps anywhere in the code; so we just look 3203 for a JUMP opcode preceding our skip target. 3204 */ 3205 if (skip >= 3 && skip-3 < end-code && 3206 code[skip-3] == SRE_OP_JUMP) 3207 { 3208 VTRACE(("both then and else parts present\n")); 3209 if (!_validate_inner(code+1, code+skip-3, groups)) 3210 FAIL; 3211 code += skip-2; /* Position after JUMP, at <skipno> */ 3212 GET_SKIP; 3213 if (!_validate_inner(code, code+skip-1, groups)) 3214 FAIL; 3215 code += skip-1; 3216 } 3217 else { 3218 VTRACE(("only a then part present\n")); 3219 if (!_validate_inner(code+1, code+skip-1, groups)) 3220 FAIL; 3221 code += skip-1; 3222 } 3223 break; 3224 3225 case SRE_OP_ASSERT: 3226 case SRE_OP_ASSERT_NOT: 3227 GET_SKIP; 3228 GET_ARG; /* 0 for lookahead, width for lookbehind */ 3229 code--; /* Back up over arg to simplify math below */ 3230 if (arg & 0x80000000) 3231 FAIL; /* Width too large */ 3232 /* Stop 1 before the end; we check the SUCCESS below */ 3233 if (!_validate_inner(code+1, code+skip-2, groups)) 3234 FAIL; 3235 code += skip-2; 3236 GET_OP; 3237 if (op != SRE_OP_SUCCESS) 3238 FAIL; 3239 break; 3240 3241 default: 3242 FAIL; 3243 3244 } 3245 } 3246 3247 VTRACE(("okay\n")); 3248 return 1; 3249 } 3250 3251 static int 3252 _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) 3253 { 3254 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS) 3255 FAIL; 3256 if (groups == 0) /* fix for simplejson */ 3257 groups = 100; /* 100 groups should always be safe */ 3258 return _validate_inner(code, end-1, groups); 3259 } 3260 3261 static int 3262 _validate(PatternObject *self) 3263 { 3264 if (!_validate_outer(self->code, self->code+self->codesize, self->groups)) 3265 { 3266 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code"); 3267 return 0; 3268 } 3269 else 3270 VTRACE(("Success!\n")); 3271 return 1; 3272 } 3273 3274 /* -------------------------------------------------------------------- */ 3275 /* match methods */ 3276 3277 static void 3278 match_dealloc(MatchObject* self) 3279 { 3280 Py_XDECREF(self->regs); 3281 Py_XDECREF(self->string); 3282 Py_DECREF(self->pattern); 3283 PyObject_DEL(self); 3284 } 3285 3286 static PyObject* 3287 match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def) 3288 { 3289 if (index < 0 || index >= self->groups) { 3290 /* raise IndexError if we were given a bad group number */ 3291 PyErr_SetString( 3292 PyExc_IndexError, 3293 "no such group" 3294 ); 3295 return NULL; 3296 } 3297 3298 index *= 2; 3299 3300 if (self->string == Py_None || self->mark[index] < 0) { 3301 /* return default value if the string or group is undefined */ 3302 Py_INCREF(def); 3303 return def; 3304 } 3305 3306 return PySequence_GetSlice( 3307 self->string, self->mark[index], self->mark[index+1] 3308 ); 3309 } 3310 3311 static Py_ssize_t 3312 match_getindex(MatchObject* self, PyObject* index) 3313 { 3314 Py_ssize_t i; 3315 3316 if (PyInt_Check(index) || PyLong_Check(index)) 3317 return PyInt_AsSsize_t(index); 3318 3319 i = -1; 3320 3321 if (self->pattern->groupindex) { 3322 index = PyObject_GetItem(self->pattern->groupindex, index); 3323 if (index) { 3324 if (PyInt_Check(index) || PyLong_Check(index)) 3325 i = PyInt_AsSsize_t(index); 3326 Py_DECREF(index); 3327 } else 3328 PyErr_Clear(); 3329 } 3330 3331 return i; 3332 } 3333 3334 static PyObject* 3335 match_getslice(MatchObject* self, PyObject* index, PyObject* def) 3336 { 3337 return match_getslice_by_index(self, match_getindex(self, index), def); 3338 } 3339 3340 static PyObject* 3341 match_expand(MatchObject* self, PyObject* ptemplate) 3342 { 3343 /* delegate to Python code */ 3344 return call( 3345 SRE_PY_MODULE, "_expand", 3346 PyTuple_Pack(3, self->pattern, self, ptemplate) 3347 ); 3348 } 3349 3350 static PyObject* 3351 match_group(MatchObject* self, PyObject* args) 3352 { 3353 PyObject* result; 3354 Py_ssize_t i, size; 3355 3356 size = PyTuple_GET_SIZE(args); 3357 3358 switch (size) { 3359 case 0: 3360 result = match_getslice(self, Py_False, Py_None); 3361 break; 3362 case 1: 3363 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None); 3364 break; 3365 default: 3366 /* fetch multiple items */ 3367 result = PyTuple_New(size); 3368 if (!result) 3369 return NULL; 3370 for (i = 0; i < size; i++) { 3371 PyObject* item = match_getslice( 3372 self, PyTuple_GET_ITEM(args, i), Py_None 3373 ); 3374 if (!item) { 3375 Py_DECREF(result); 3376 return NULL; 3377 } 3378 PyTuple_SET_ITEM(result, i, item); 3379 } 3380 break; 3381 } 3382 return result; 3383 } 3384 3385 static PyObject* 3386 match_groups(MatchObject* self, PyObject* args, PyObject* kw) 3387 { 3388 PyObject* result; 3389 Py_ssize_t index; 3390 3391 PyObject* def = Py_None; 3392 static char* kwlist[] = { "default", NULL }; 3393 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def)) 3394 return NULL; 3395 3396 result = PyTuple_New(self->groups-1); 3397 if (!result) 3398 return NULL; 3399 3400 for (index = 1; index < self->groups; index++) { 3401 PyObject* item; 3402 item = match_getslice_by_index(self, index, def); 3403 if (!item) { 3404 Py_DECREF(result); 3405 return NULL; 3406 } 3407 PyTuple_SET_ITEM(result, index-1, item); 3408 } 3409 3410 return result; 3411 } 3412 3413 static PyObject* 3414 match_groupdict(MatchObject* self, PyObject* args, PyObject* kw) 3415 { 3416 PyObject* result; 3417 PyObject* keys; 3418 Py_ssize_t index; 3419 3420 PyObject* def = Py_None; 3421 static char* kwlist[] = { "default", NULL }; 3422 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def)) 3423 return NULL; 3424 3425 result = PyDict_New(); 3426 if (!result || !self->pattern->groupindex) 3427 return result; 3428 3429 keys = PyMapping_Keys(self->pattern->groupindex); 3430 if (!keys) 3431 goto failed; 3432 3433 for (index = 0; index < PyList_GET_SIZE(keys); index++) { 3434 int status; 3435 PyObject* key; 3436 PyObject* value; 3437 key = PyList_GET_ITEM(keys, index); 3438 if (!key) 3439 goto failed; 3440 value = match_getslice(self, key, def); 3441 if (!value) 3442 goto failed; 3443 status = PyDict_SetItem(result, key, value); 3444 Py_DECREF(value); 3445 if (status < 0) 3446 goto failed; 3447 } 3448 3449 Py_DECREF(keys); 3450 3451 return result; 3452 3453 failed: 3454 Py_XDECREF(keys); 3455 Py_DECREF(result); 3456 return NULL; 3457 } 3458 3459 static PyObject* 3460 match_start(MatchObject* self, PyObject* args) 3461 { 3462 Py_ssize_t index; 3463 3464 PyObject* index_ = Py_False; /* zero */ 3465 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_)) 3466 return NULL; 3467 3468 index = match_getindex(self, index_); 3469 3470 if (index < 0 || index >= self->groups) { 3471 PyErr_SetString( 3472 PyExc_IndexError, 3473 "no such group" 3474 ); 3475 return NULL; 3476 } 3477 3478 /* mark is -1 if group is undefined */ 3479 return PyInt_FromSsize_t(self->mark[index*2]); 3480 } 3481 3482 static PyObject* 3483 match_end(MatchObject* self, PyObject* args) 3484 { 3485 Py_ssize_t index; 3486 3487 PyObject* index_ = Py_False; /* zero */ 3488 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_)) 3489 return NULL; 3490 3491 index = match_getindex(self, index_); 3492 3493 if (index < 0 || index >= self->groups) { 3494 PyErr_SetString( 3495 PyExc_IndexError, 3496 "no such group" 3497 ); 3498 return NULL; 3499 } 3500 3501 /* mark is -1 if group is undefined */ 3502 return PyInt_FromSsize_t(self->mark[index*2+1]); 3503 } 3504 3505 LOCAL(PyObject*) 3506 _pair(Py_ssize_t i1, Py_ssize_t i2) 3507 { 3508 PyObject* pair; 3509 PyObject* item; 3510 3511 pair = PyTuple_New(2); 3512 if (!pair) 3513 return NULL; 3514 3515 item = PyInt_FromSsize_t(i1); 3516 if (!item) 3517 goto error; 3518 PyTuple_SET_ITEM(pair, 0, item); 3519 3520 item = PyInt_FromSsize_t(i2); 3521 if (!item) 3522 goto error; 3523 PyTuple_SET_ITEM(pair, 1, item); 3524 3525 return pair; 3526 3527 error: 3528 Py_DECREF(pair); 3529 return NULL; 3530 } 3531 3532 static PyObject* 3533 match_span(MatchObject* self, PyObject* args) 3534 { 3535 Py_ssize_t index; 3536 3537 PyObject* index_ = Py_False; /* zero */ 3538 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_)) 3539 return NULL; 3540 3541 index = match_getindex(self, index_); 3542 3543 if (index < 0 || index >= self->groups) { 3544 PyErr_SetString( 3545 PyExc_IndexError, 3546 "no such group" 3547 ); 3548 return NULL; 3549 } 3550 3551 /* marks are -1 if group is undefined */ 3552 return _pair(self->mark[index*2], self->mark[index*2+1]); 3553 } 3554 3555 static PyObject* 3556 match_regs(MatchObject* self) 3557 { 3558 PyObject* regs; 3559 PyObject* item; 3560 Py_ssize_t index; 3561 3562 regs = PyTuple_New(self->groups); 3563 if (!regs) 3564 return NULL; 3565 3566 for (index = 0; index < self->groups; index++) { 3567 item = _pair(self->mark[index*2], self->mark[index*2+1]); 3568 if (!item) { 3569 Py_DECREF(regs); 3570 return NULL; 3571 } 3572 PyTuple_SET_ITEM(regs, index, item); 3573 } 3574 3575 Py_INCREF(regs); 3576 self->regs = regs; 3577 3578 return regs; 3579 } 3580 3581 static PyObject* 3582 match_copy(MatchObject* self, PyObject *unused) 3583 { 3584 #ifdef USE_BUILTIN_COPY 3585 MatchObject* copy; 3586 Py_ssize_t slots, offset; 3587 3588 slots = 2 * (self->pattern->groups+1); 3589 3590 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots); 3591 if (!copy) 3592 return NULL; 3593 3594 /* this value a constant, but any compiler should be able to 3595 figure that out all by itself */ 3596 offset = offsetof(MatchObject, string); 3597 3598 Py_XINCREF(self->pattern); 3599 Py_XINCREF(self->string); 3600 Py_XINCREF(self->regs); 3601 3602 memcpy((char*) copy + offset, (char*) self + offset, 3603 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset); 3604 3605 return (PyObject*) copy; 3606 #else 3607 PyErr_SetString(PyExc_TypeError, "cannot copy this match object"); 3608 return NULL; 3609 #endif 3610 } 3611 3612 static PyObject* 3613 match_deepcopy(MatchObject* self, PyObject* memo) 3614 { 3615 #ifdef USE_BUILTIN_COPY 3616 MatchObject* copy; 3617 3618 copy = (MatchObject*) match_copy(self); 3619 if (!copy) 3620 return NULL; 3621 3622 if (!deepcopy((PyObject**) ©->pattern, memo) || 3623 !deepcopy(©->string, memo) || 3624 !deepcopy(©->regs, memo)) { 3625 Py_DECREF(copy); 3626 return NULL; 3627 } 3628 3629 #else 3630 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object"); 3631 return NULL; 3632 #endif 3633 } 3634 3635 PyDoc_STRVAR(match_doc, 3636 "The result of re.match() and re.search().\n\ 3637 Match objects always have a boolean value of True."); 3638 3639 PyDoc_STRVAR(match_group_doc, 3640 "group([group1, ...]) -> str or tuple.\n\ 3641 Return subgroup(s) of the match by indices or names.\n\ 3642 For 0 returns the entire match."); 3643 3644 PyDoc_STRVAR(match_start_doc, 3645 "start([group=0]) -> int.\n\ 3646 Return index of the start of the substring matched by group."); 3647 3648 PyDoc_STRVAR(match_end_doc, 3649 "end([group=0]) -> int.\n\ 3650 Return index of the end of the substring matched by group."); 3651 3652 PyDoc_STRVAR(match_span_doc, 3653 "span([group]) -> tuple.\n\ 3654 For MatchObject m, return the 2-tuple (m.start(group), m.end(group))."); 3655 3656 PyDoc_STRVAR(match_groups_doc, 3657 "groups([default=None]) -> tuple.\n\ 3658 Return a tuple containing all the subgroups of the match, from 1.\n\ 3659 The default argument is used for groups\n\ 3660 that did not participate in the match"); 3661 3662 PyDoc_STRVAR(match_groupdict_doc, 3663 "groupdict([default=None]) -> dict.\n\ 3664 Return a dictionary containing all the named subgroups of the match,\n\ 3665 keyed by the subgroup name. The default argument is used for groups\n\ 3666 that did not participate in the match"); 3667 3668 PyDoc_STRVAR(match_expand_doc, 3669 "expand(template) -> str.\n\ 3670 Return the string obtained by doing backslash substitution\n\ 3671 on the string template, as done by the sub() method."); 3672 3673 static PyMethodDef match_methods[] = { 3674 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc}, 3675 {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc}, 3676 {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc}, 3677 {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc}, 3678 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS, 3679 match_groups_doc}, 3680 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS, 3681 match_groupdict_doc}, 3682 {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc}, 3683 {"__copy__", (PyCFunction) match_copy, METH_NOARGS}, 3684 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O}, 3685 {NULL, NULL} 3686 }; 3687 3688 static PyObject * 3689 match_lastindex_get(MatchObject *self) 3690 { 3691 if (self->lastindex >= 0) 3692 return PyInt_FromSsize_t(self->lastindex); 3693 Py_INCREF(Py_None); 3694 return Py_None; 3695 } 3696 3697 static PyObject * 3698 match_lastgroup_get(MatchObject *self) 3699 { 3700 if (self->pattern->indexgroup && self->lastindex >= 0) { 3701 PyObject* result = PySequence_GetItem( 3702 self->pattern->indexgroup, self->lastindex 3703 ); 3704 if (result) 3705 return result; 3706 PyErr_Clear(); 3707 } 3708 Py_INCREF(Py_None); 3709 return Py_None; 3710 } 3711 3712 static PyObject * 3713 match_regs_get(MatchObject *self) 3714 { 3715 if (self->regs) { 3716 Py_INCREF(self->regs); 3717 return self->regs; 3718 } else 3719 return match_regs(self); 3720 } 3721 3722 static PyGetSetDef match_getset[] = { 3723 {"lastindex", (getter)match_lastindex_get, (setter)NULL}, 3724 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL}, 3725 {"regs", (getter)match_regs_get, (setter)NULL}, 3726 {NULL} 3727 }; 3728 3729 #define MATCH_OFF(x) offsetof(MatchObject, x) 3730 static PyMemberDef match_members[] = { 3731 {"string", T_OBJECT, MATCH_OFF(string), READONLY}, 3732 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY}, 3733 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY}, 3734 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY}, 3735 {NULL} 3736 }; 3737 3738 3739 /* FIXME: implement setattr("string", None) as a special case (to 3740 detach the associated string, if any */ 3741 3742 static PyTypeObject Match_Type = { 3743 PyVarObject_HEAD_INIT(NULL, 0) 3744 "_" SRE_MODULE ".SRE_Match", 3745 sizeof(MatchObject), sizeof(Py_ssize_t), 3746 (destructor)match_dealloc, /* tp_dealloc */ 3747 0, /* tp_print */ 3748 0, /* tp_getattr */ 3749 0, /* tp_setattr */ 3750 0, /* tp_compare */ 3751 0, /* tp_repr */ 3752 0, /* tp_as_number */ 3753 0, /* tp_as_sequence */ 3754 0, /* tp_as_mapping */ 3755 0, /* tp_hash */ 3756 0, /* tp_call */ 3757 0, /* tp_str */ 3758 0, /* tp_getattro */ 3759 0, /* tp_setattro */ 3760 0, /* tp_as_buffer */ 3761 Py_TPFLAGS_DEFAULT, 3762 match_doc, /* tp_doc */ 3763 0, /* tp_traverse */ 3764 0, /* tp_clear */ 3765 0, /* tp_richcompare */ 3766 0, /* tp_weaklistoffset */ 3767 0, /* tp_iter */ 3768 0, /* tp_iternext */ 3769 match_methods, /* tp_methods */ 3770 match_members, /* tp_members */ 3771 match_getset, /* tp_getset */ 3772 }; 3773 3774 static PyObject* 3775 pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status) 3776 { 3777 /* create match object (from state object) */ 3778 3779 MatchObject* match; 3780 Py_ssize_t i, j; 3781 char* base; 3782 int n; 3783 3784 if (status > 0) { 3785 3786 /* create match object (with room for extra group marks) */ 3787 /* coverity[ampersand_in_size] */ 3788 match = PyObject_NEW_VAR(MatchObject, &Match_Type, 3789 2*(pattern->groups+1)); 3790 if (!match) 3791 return NULL; 3792 3793 Py_INCREF(pattern); 3794 match->pattern = pattern; 3795 3796 Py_INCREF(state->string); 3797 match->string = state->string; 3798 3799 match->regs = NULL; 3800 match->groups = pattern->groups+1; 3801 3802 /* fill in group slices */ 3803 3804 base = (char*) state->beginning; 3805 n = state->charsize; 3806 3807 match->mark[0] = ((char*) state->start - base) / n; 3808 match->mark[1] = ((char*) state->ptr - base) / n; 3809 3810 for (i = j = 0; i < pattern->groups; i++, j+=2) 3811 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) { 3812 match->mark[j+2] = ((char*) state->mark[j] - base) / n; 3813 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n; 3814 } else 3815 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */ 3816 3817 match->pos = state->pos; 3818 match->endpos = state->endpos; 3819 3820 match->lastindex = state->lastindex; 3821 3822 return (PyObject*) match; 3823 3824 } else if (status == 0) { 3825 3826 /* no match */ 3827 Py_INCREF(Py_None); 3828 return Py_None; 3829 3830 } 3831 3832 /* internal error */ 3833 pattern_error(status); 3834 return NULL; 3835 } 3836 3837 3838 /* -------------------------------------------------------------------- */ 3839 /* scanner methods (experimental) */ 3840 3841 static void 3842 scanner_dealloc(ScannerObject* self) 3843 { 3844 state_fini(&self->state); 3845 Py_XDECREF(self->pattern); 3846 PyObject_DEL(self); 3847 } 3848 3849 static PyObject* 3850 scanner_match(ScannerObject* self, PyObject *unused) 3851 { 3852 SRE_STATE* state = &self->state; 3853 PyObject* match; 3854 int status; 3855 3856 if (state->start == NULL) 3857 Py_RETURN_NONE; 3858 3859 state_reset(state); 3860 3861 state->ptr = state->start; 3862 3863 if (state->charsize == 1) { 3864 status = sre_match(state, PatternObject_GetCode(self->pattern)); 3865 } else { 3866 #if defined(HAVE_UNICODE) 3867 status = sre_umatch(state, PatternObject_GetCode(self->pattern)); 3868 #endif 3869 } 3870 if (PyErr_Occurred()) 3871 return NULL; 3872 3873 match = pattern_new_match((PatternObject*) self->pattern, 3874 state, status); 3875 3876 if (status == 0) 3877 state->start = NULL; 3878 else if (state->ptr != state->start) 3879 state->start = state->ptr; 3880 else if (state->ptr != state->end) 3881 state->start = (void*) ((char*) state->ptr + state->charsize); 3882 else 3883 state->start = NULL; 3884 3885 return match; 3886 } 3887 3888 3889 static PyObject* 3890 scanner_search(ScannerObject* self, PyObject *unused) 3891 { 3892 SRE_STATE* state = &self->state; 3893 PyObject* match; 3894 int status; 3895 3896 if (state->start == NULL) 3897 Py_RETURN_NONE; 3898 3899 state_reset(state); 3900 3901 state->ptr = state->start; 3902 3903 if (state->charsize == 1) { 3904 status = sre_search(state, PatternObject_GetCode(self->pattern)); 3905 } else { 3906 #if defined(HAVE_UNICODE) 3907 status = sre_usearch(state, PatternObject_GetCode(self->pattern)); 3908 #endif 3909 } 3910 if (PyErr_Occurred()) 3911 return NULL; 3912 3913 match = pattern_new_match((PatternObject*) self->pattern, 3914 state, status); 3915 3916 if (status == 0) 3917 state->start = NULL; 3918 else if (state->ptr != state->start) 3919 state->start = state->ptr; 3920 else if (state->ptr != state->end) 3921 state->start = (void*) ((char*) state->ptr + state->charsize); 3922 else 3923 state->start = NULL; 3924 3925 return match; 3926 } 3927 3928 static PyMethodDef scanner_methods[] = { 3929 {"match", (PyCFunction) scanner_match, METH_NOARGS}, 3930 {"search", (PyCFunction) scanner_search, METH_NOARGS}, 3931 {NULL, NULL} 3932 }; 3933 3934 #define SCAN_OFF(x) offsetof(ScannerObject, x) 3935 static PyMemberDef scanner_members[] = { 3936 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY}, 3937 {NULL} /* Sentinel */ 3938 }; 3939 3940 statichere PyTypeObject Scanner_Type = { 3941 PyObject_HEAD_INIT(NULL) 3942 0, "_" SRE_MODULE ".SRE_Scanner", 3943 sizeof(ScannerObject), 0, 3944 (destructor)scanner_dealloc, /*tp_dealloc*/ 3945 0, /* tp_print */ 3946 0, /* tp_getattr */ 3947 0, /* tp_setattr */ 3948 0, /* tp_reserved */ 3949 0, /* tp_repr */ 3950 0, /* tp_as_number */ 3951 0, /* tp_as_sequence */ 3952 0, /* tp_as_mapping */ 3953 0, /* tp_hash */ 3954 0, /* tp_call */ 3955 0, /* tp_str */ 3956 0, /* tp_getattro */ 3957 0, /* tp_setattro */ 3958 0, /* tp_as_buffer */ 3959 Py_TPFLAGS_DEFAULT, /* tp_flags */ 3960 0, /* tp_doc */ 3961 0, /* tp_traverse */ 3962 0, /* tp_clear */ 3963 0, /* tp_richcompare */ 3964 0, /* tp_weaklistoffset */ 3965 0, /* tp_iter */ 3966 0, /* tp_iternext */ 3967 scanner_methods, /* tp_methods */ 3968 scanner_members, /* tp_members */ 3969 0, /* tp_getset */ 3970 }; 3971 3972 static PyObject* 3973 pattern_scanner(PatternObject* pattern, PyObject* args) 3974 { 3975 /* create search state object */ 3976 3977 ScannerObject* self; 3978 3979 PyObject* string; 3980 Py_ssize_t start = 0; 3981 Py_ssize_t end = PY_SSIZE_T_MAX; 3982 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end)) 3983 return NULL; 3984 3985 /* create scanner object */ 3986 self = PyObject_NEW(ScannerObject, &Scanner_Type); 3987 if (!self) 3988 return NULL; 3989 self->pattern = NULL; 3990 3991 string = state_init(&self->state, pattern, string, start, end); 3992 if (!string) { 3993 Py_DECREF(self); 3994 return NULL; 3995 } 3996 3997 Py_INCREF(pattern); 3998 self->pattern = (PyObject*) pattern; 3999 4000 return (PyObject*) self; 4001 } 4002 4003 static PyMethodDef _functions[] = { 4004 {"compile", _compile, METH_VARARGS}, 4005 {"getcodesize", sre_codesize, METH_NOARGS}, 4006 {"getlower", sre_getlower, METH_VARARGS}, 4007 {NULL, NULL} 4008 }; 4009 4010 #if PY_VERSION_HEX < 0x02030000 4011 DL_EXPORT(void) init_sre(void) 4012 #else 4013 PyMODINIT_FUNC init_sre(void) 4014 #endif 4015 { 4016 PyObject* m; 4017 PyObject* d; 4018 PyObject* x; 4019 4020 /* Patch object types */ 4021 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) || 4022 PyType_Ready(&Scanner_Type)) 4023 return; 4024 4025 m = Py_InitModule("_" SRE_MODULE, _functions); 4026 if (m == NULL) 4027 return; 4028 d = PyModule_GetDict(m); 4029 4030 x = PyInt_FromLong(SRE_MAGIC); 4031 if (x) { 4032 PyDict_SetItemString(d, "MAGIC", x); 4033 Py_DECREF(x); 4034 } 4035 4036 x = PyInt_FromLong(sizeof(SRE_CODE)); 4037 if (x) { 4038 PyDict_SetItemString(d, "CODESIZE", x); 4039 Py_DECREF(x); 4040 } 4041 4042 x = PyLong_FromUnsignedLong(SRE_MAXREPEAT); 4043 if (x) { 4044 PyDict_SetItemString(d, "MAXREPEAT", x); 4045 Py_DECREF(x); 4046 } 4047 4048 x = PyString_FromString(copyright); 4049 if (x) { 4050 PyDict_SetItemString(d, "copyright", x); 4051 Py_DECREF(x); 4052 } 4053 } 4054 4055 #endif /* !defined(SRE_RECURSIVE) */ 4056 4057 /* vim:ts=4:sw=4:et 4058 */ 4059