1 /* 2 * Secret Labs' Regular Expression Engine 3 * 4 * regular expression matching engine 5 6 Copyright (c) 2011, Intel Corporation. All rights reserved.<BR> 7 This program and the accompanying materials are licensed and made available under 8 the terms and conditions of the BSD License that accompanies this distribution. 9 The full text of the license may be found at 10 http://opensource.org/licenses/bsd-license. 11 12 THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS, 13 WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED. 14 * 15 * partial history: 16 * 1999-10-24 fl created (based on existing template matcher code) 17 * 2000-03-06 fl first alpha, sort of 18 * 2000-08-01 fl fixes for 1.6b1 19 * 2000-08-07 fl use PyOS_CheckStack() if available 20 * 2000-09-20 fl added expand method 21 * 2001-03-20 fl lots of fixes for 2.1b2 22 * 2001-04-15 fl export copyright as Python attribute, not global 23 * 2001-04-28 fl added __copy__ methods (work in progress) 24 * 2001-05-14 fl fixes for 1.5.2 compatibility 25 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis) 26 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller) 27 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1 28 * 2001-10-21 fl added sub/subn primitive 29 * 2001-10-24 fl added finditer primitive (for 2.2 only) 30 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum) 31 * 2002-11-09 fl fixed empty sub/subn return type 32 * 2003-04-18 mvl fully support 4-byte codes 33 * 2003-10-17 gn implemented non recursive scheme 34 * 35 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. 36 * 37 * This version of the SRE library can be redistributed under CNRI's 38 * Python 1.6 license. For any other use, please contact Secret Labs 39 * AB (info (at) pythonware.com). 40 * 41 * Portions of this engine have been developed in cooperation with 42 * CNRI. Hewlett-Packard provided funding for 1.6 integration and 43 * other compatibility work. 44 */ 45 46 /* Get rid of these macros to prevent collisions between EFI and Python in this file. */ 47 #undef RETURN_ERROR 48 #undef RETURN_SUCCESS 49 50 #ifndef SRE_RECURSIVE 51 52 static char copyright[] = 53 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB "; 54 55 #define PY_SSIZE_T_CLEAN 56 57 #include "Python.h" 58 #include "structmember.h" /* offsetof */ 59 60 #include "sre.h" 61 62 #include <ctype.h> 63 64 /* name of this module, minus the leading underscore */ 65 #if !defined(SRE_MODULE) 66 #define SRE_MODULE "sre" 67 #endif 68 69 #define SRE_PY_MODULE "re" 70 71 /* defining this one enables tracing */ 72 #undef VERBOSE 73 74 #if PY_VERSION_HEX >= 0x01060000 75 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE) 76 /* defining this enables unicode support (default under 1.6a1 and later) */ 77 #define HAVE_UNICODE 78 #endif 79 #endif 80 81 /* -------------------------------------------------------------------- */ 82 /* optional features */ 83 84 /* enables fast searching */ 85 #define USE_FAST_SEARCH 86 87 /* enables aggressive inlining (always on for Visual C) */ 88 #undef USE_INLINE 89 90 /* enables copy/deepcopy handling (work in progress) */ 91 #undef USE_BUILTIN_COPY 92 93 #if PY_VERSION_HEX < 0x01060000 94 #define PyObject_DEL(op) PyMem_DEL((op)) 95 #endif 96 97 /* -------------------------------------------------------------------- */ 98 99 #if defined(_MSC_VER) 100 #pragma optimize("gt", on) /* doesn't seem to make much difference... */ 101 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */ 102 /* fastest possible local call under MSVC */ 103 #define LOCAL(type) static __inline type __fastcall 104 #elif defined(USE_INLINE) 105 #define LOCAL(type) static inline type 106 #else 107 #define LOCAL(type) static type 108 #endif 109 110 /* error codes */ 111 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */ 112 #define SRE_ERROR_STATE -2 /* illegal state */ 113 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */ 114 #define SRE_ERROR_MEMORY -9 /* out of memory */ 115 #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */ 116 117 #if defined(VERBOSE) 118 #define TRACE(v) printf v 119 #else 120 #define TRACE(v) 121 #endif 122 123 /* -------------------------------------------------------------------- */ 124 /* search engine state */ 125 126 /* default character predicates (run sre_chars.py to regenerate tables) */ 127 128 #define SRE_DIGIT_MASK 1 129 #define SRE_SPACE_MASK 2 130 #define SRE_LINEBREAK_MASK 4 131 #define SRE_ALNUM_MASK 8 132 #define SRE_WORD_MASK 16 133 134 /* FIXME: this assumes ASCII. create tables in init_sre() instead */ 135 136 static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, 137 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 138 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25, 139 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 140 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 141 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 142 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 }; 143 144 static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 145 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 146 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 147 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 148 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 149 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 150 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 151 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 152 120, 121, 122, 123, 124, 125, 126, 127 }; 153 154 #define SRE_IS_DIGIT(ch)\ 155 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0) 156 #define SRE_IS_SPACE(ch)\ 157 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0) 158 #define SRE_IS_LINEBREAK(ch)\ 159 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0) 160 #define SRE_IS_ALNUM(ch)\ 161 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0) 162 #define SRE_IS_WORD(ch)\ 163 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0) 164 165 static unsigned int sre_lower(unsigned int ch) 166 { 167 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch); 168 } 169 170 /* locale-specific character predicates */ 171 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids 172 * warnings when c's type supports only numbers < N+1 */ 173 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0) 174 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0) 175 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n') 176 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0) 177 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_') 178 179 static unsigned int sre_lower_locale(unsigned int ch) 180 { 181 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch); 182 } 183 184 /* unicode-specific character predicates */ 185 186 #if defined(HAVE_UNICODE) 187 188 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch)) 189 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch)) 190 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch)) 191 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch)) 192 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_') 193 194 static unsigned int sre_lower_unicode(unsigned int ch) 195 { 196 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch)); 197 } 198 199 #endif 200 201 LOCAL(int) 202 sre_category(SRE_CODE category, unsigned int ch) 203 { 204 switch (category) { 205 206 case SRE_CATEGORY_DIGIT: 207 return SRE_IS_DIGIT(ch); 208 case SRE_CATEGORY_NOT_DIGIT: 209 return !SRE_IS_DIGIT(ch); 210 case SRE_CATEGORY_SPACE: 211 return SRE_IS_SPACE(ch); 212 case SRE_CATEGORY_NOT_SPACE: 213 return !SRE_IS_SPACE(ch); 214 case SRE_CATEGORY_WORD: 215 return SRE_IS_WORD(ch); 216 case SRE_CATEGORY_NOT_WORD: 217 return !SRE_IS_WORD(ch); 218 case SRE_CATEGORY_LINEBREAK: 219 return SRE_IS_LINEBREAK(ch); 220 case SRE_CATEGORY_NOT_LINEBREAK: 221 return !SRE_IS_LINEBREAK(ch); 222 223 case SRE_CATEGORY_LOC_WORD: 224 return SRE_LOC_IS_WORD(ch); 225 case SRE_CATEGORY_LOC_NOT_WORD: 226 return !SRE_LOC_IS_WORD(ch); 227 228 #if defined(HAVE_UNICODE) 229 case SRE_CATEGORY_UNI_DIGIT: 230 return SRE_UNI_IS_DIGIT(ch); 231 case SRE_CATEGORY_UNI_NOT_DIGIT: 232 return !SRE_UNI_IS_DIGIT(ch); 233 case SRE_CATEGORY_UNI_SPACE: 234 return SRE_UNI_IS_SPACE(ch); 235 case SRE_CATEGORY_UNI_NOT_SPACE: 236 return !SRE_UNI_IS_SPACE(ch); 237 case SRE_CATEGORY_UNI_WORD: 238 return SRE_UNI_IS_WORD(ch); 239 case SRE_CATEGORY_UNI_NOT_WORD: 240 return !SRE_UNI_IS_WORD(ch); 241 case SRE_CATEGORY_UNI_LINEBREAK: 242 return SRE_UNI_IS_LINEBREAK(ch); 243 case SRE_CATEGORY_UNI_NOT_LINEBREAK: 244 return !SRE_UNI_IS_LINEBREAK(ch); 245 #else 246 case SRE_CATEGORY_UNI_DIGIT: 247 return SRE_IS_DIGIT(ch); 248 case SRE_CATEGORY_UNI_NOT_DIGIT: 249 return !SRE_IS_DIGIT(ch); 250 case SRE_CATEGORY_UNI_SPACE: 251 return SRE_IS_SPACE(ch); 252 case SRE_CATEGORY_UNI_NOT_SPACE: 253 return !SRE_IS_SPACE(ch); 254 case SRE_CATEGORY_UNI_WORD: 255 return SRE_LOC_IS_WORD(ch); 256 case SRE_CATEGORY_UNI_NOT_WORD: 257 return !SRE_LOC_IS_WORD(ch); 258 case SRE_CATEGORY_UNI_LINEBREAK: 259 return SRE_IS_LINEBREAK(ch); 260 case SRE_CATEGORY_UNI_NOT_LINEBREAK: 261 return !SRE_IS_LINEBREAK(ch); 262 #endif 263 } 264 return 0; 265 } 266 267 /* helpers */ 268 269 static void 270 data_stack_dealloc(SRE_STATE* state) 271 { 272 if (state->data_stack) { 273 PyMem_FREE(state->data_stack); 274 state->data_stack = NULL; 275 } 276 state->data_stack_size = state->data_stack_base = 0; 277 } 278 279 static int 280 data_stack_grow(SRE_STATE* state, Py_ssize_t size) 281 { 282 Py_ssize_t minsize, cursize; 283 minsize = state->data_stack_base+size; 284 cursize = state->data_stack_size; 285 if (cursize < minsize) { 286 void* stack; 287 cursize = minsize+minsize/4+1024; 288 TRACE(("allocate/grow stack %d\n", cursize)); 289 stack = PyMem_REALLOC(state->data_stack, cursize); 290 if (!stack) { 291 data_stack_dealloc(state); 292 return SRE_ERROR_MEMORY; 293 } 294 state->data_stack = (char *)stack; 295 state->data_stack_size = cursize; 296 } 297 return 0; 298 } 299 300 /* generate 8-bit version */ 301 302 #define SRE_CHAR unsigned char 303 #define SRE_AT sre_at 304 #define SRE_COUNT sre_count 305 #define SRE_CHARSET sre_charset 306 #define SRE_INFO sre_info 307 #define SRE_MATCH sre_match 308 #define SRE_MATCH_CONTEXT sre_match_context 309 #define SRE_SEARCH sre_search 310 #define SRE_LITERAL_TEMPLATE sre_literal_template 311 312 #if defined(HAVE_UNICODE) 313 314 #define SRE_RECURSIVE 315 #include "_sre.c" 316 #undef SRE_RECURSIVE 317 318 #undef SRE_LITERAL_TEMPLATE 319 #undef SRE_SEARCH 320 #undef SRE_MATCH 321 #undef SRE_MATCH_CONTEXT 322 #undef SRE_INFO 323 #undef SRE_CHARSET 324 #undef SRE_COUNT 325 #undef SRE_AT 326 #undef SRE_CHAR 327 328 /* generate 16-bit unicode version */ 329 330 #define SRE_CHAR Py_UNICODE 331 #define SRE_AT sre_uat 332 #define SRE_COUNT sre_ucount 333 #define SRE_CHARSET sre_ucharset 334 #define SRE_INFO sre_uinfo 335 #define SRE_MATCH sre_umatch 336 #define SRE_MATCH_CONTEXT sre_umatch_context 337 #define SRE_SEARCH sre_usearch 338 #define SRE_LITERAL_TEMPLATE sre_uliteral_template 339 #endif 340 341 #endif /* SRE_RECURSIVE */ 342 343 /* -------------------------------------------------------------------- */ 344 /* String matching engine */ 345 346 /* the following section is compiled twice, with different character 347 settings */ 348 349 LOCAL(int) 350 SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) 351 { 352 /* check if pointer is at given position */ 353 354 Py_ssize_t thisp, thatp; 355 356 switch (at) { 357 358 case SRE_AT_BEGINNING: 359 case SRE_AT_BEGINNING_STRING: 360 return ((void*) ptr == state->beginning); 361 362 case SRE_AT_BEGINNING_LINE: 363 return ((void*) ptr == state->beginning || 364 SRE_IS_LINEBREAK((int) ptr[-1])); 365 366 case SRE_AT_END: 367 return (((void*) (ptr+1) == state->end && 368 SRE_IS_LINEBREAK((int) ptr[0])) || 369 ((void*) ptr == state->end)); 370 371 case SRE_AT_END_LINE: 372 return ((void*) ptr == state->end || 373 SRE_IS_LINEBREAK((int) ptr[0])); 374 375 case SRE_AT_END_STRING: 376 return ((void*) ptr == state->end); 377 378 case SRE_AT_BOUNDARY: 379 if (state->beginning == state->end) 380 return 0; 381 thatp = ((void*) ptr > state->beginning) ? 382 SRE_IS_WORD((int) ptr[-1]) : 0; 383 thisp = ((void*) ptr < state->end) ? 384 SRE_IS_WORD((int) ptr[0]) : 0; 385 return thisp != thatp; 386 387 case SRE_AT_NON_BOUNDARY: 388 if (state->beginning == state->end) 389 return 0; 390 thatp = ((void*) ptr > state->beginning) ? 391 SRE_IS_WORD((int) ptr[-1]) : 0; 392 thisp = ((void*) ptr < state->end) ? 393 SRE_IS_WORD((int) ptr[0]) : 0; 394 return thisp == thatp; 395 396 case SRE_AT_LOC_BOUNDARY: 397 if (state->beginning == state->end) 398 return 0; 399 thatp = ((void*) ptr > state->beginning) ? 400 SRE_LOC_IS_WORD((int) ptr[-1]) : 0; 401 thisp = ((void*) ptr < state->end) ? 402 SRE_LOC_IS_WORD((int) ptr[0]) : 0; 403 return thisp != thatp; 404 405 case SRE_AT_LOC_NON_BOUNDARY: 406 if (state->beginning == state->end) 407 return 0; 408 thatp = ((void*) ptr > state->beginning) ? 409 SRE_LOC_IS_WORD((int) ptr[-1]) : 0; 410 thisp = ((void*) ptr < state->end) ? 411 SRE_LOC_IS_WORD((int) ptr[0]) : 0; 412 return thisp == thatp; 413 414 #if defined(HAVE_UNICODE) 415 case SRE_AT_UNI_BOUNDARY: 416 if (state->beginning == state->end) 417 return 0; 418 thatp = ((void*) ptr > state->beginning) ? 419 SRE_UNI_IS_WORD((int) ptr[-1]) : 0; 420 thisp = ((void*) ptr < state->end) ? 421 SRE_UNI_IS_WORD((int) ptr[0]) : 0; 422 return thisp != thatp; 423 424 case SRE_AT_UNI_NON_BOUNDARY: 425 if (state->beginning == state->end) 426 return 0; 427 thatp = ((void*) ptr > state->beginning) ? 428 SRE_UNI_IS_WORD((int) ptr[-1]) : 0; 429 thisp = ((void*) ptr < state->end) ? 430 SRE_UNI_IS_WORD((int) ptr[0]) : 0; 431 return thisp == thatp; 432 #endif 433 434 } 435 436 return 0; 437 } 438 439 LOCAL(int) 440 SRE_CHARSET(SRE_CODE* set, SRE_CODE ch) 441 { 442 /* check if character is a member of the given set */ 443 444 int ok = 1; 445 446 for (;;) { 447 switch (*set++) { 448 449 case SRE_OP_FAILURE: 450 return !ok; 451 452 case SRE_OP_LITERAL: 453 /* <LITERAL> <code> */ 454 if (ch == set[0]) 455 return ok; 456 set++; 457 break; 458 459 case SRE_OP_CATEGORY: 460 /* <CATEGORY> <code> */ 461 if (sre_category(set[0], (int) ch)) 462 return ok; 463 set += 1; 464 break; 465 466 case SRE_OP_CHARSET: 467 if (sizeof(SRE_CODE) == 2) { 468 /* <CHARSET> <bitmap> (16 bits per code word) */ 469 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15)))) 470 return ok; 471 set += 16; 472 } 473 else { 474 /* <CHARSET> <bitmap> (32 bits per code word) */ 475 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31)))) 476 return ok; 477 set += 8; 478 } 479 break; 480 481 case SRE_OP_RANGE: 482 /* <RANGE> <lower> <upper> */ 483 if (set[0] <= ch && ch <= set[1]) 484 return ok; 485 set += 2; 486 break; 487 488 case SRE_OP_NEGATE: 489 ok = !ok; 490 break; 491 492 case SRE_OP_BIGCHARSET: 493 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */ 494 { 495 Py_ssize_t count, block; 496 count = *(set++); 497 498 if (sizeof(SRE_CODE) == 2) { 499 block = ((unsigned char*)set)[ch >> 8]; 500 set += 128; 501 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15))) 502 return ok; 503 set += count*16; 504 } 505 else { 506 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids 507 * warnings when c's type supports only numbers < N+1 */ 508 if (!(ch & ~65535)) 509 block = ((unsigned char*)set)[ch >> 8]; 510 else 511 block = -1; 512 set += 64; 513 if (block >=0 && 514 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31)))) 515 return ok; 516 set += count*8; 517 } 518 break; 519 } 520 521 default: 522 /* internal error -- there's not much we can do about it 523 here, so let's just pretend it didn't match... */ 524 return 0; 525 } 526 } 527 } 528 529 LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern); 530 531 LOCAL(Py_ssize_t) 532 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) 533 { 534 SRE_CODE chr; 535 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr; 536 SRE_CHAR* end = (SRE_CHAR *)state->end; 537 Py_ssize_t i; 538 539 /* adjust end */ 540 if (maxcount < end - ptr && maxcount != 65535) 541 end = ptr + maxcount; 542 543 switch (pattern[0]) { 544 545 case SRE_OP_IN: 546 /* repeated set */ 547 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr)); 548 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr)) 549 ptr++; 550 break; 551 552 case SRE_OP_ANY: 553 /* repeated dot wildcard. */ 554 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr)); 555 while (ptr < end && !SRE_IS_LINEBREAK(*ptr)) 556 ptr++; 557 break; 558 559 case SRE_OP_ANY_ALL: 560 /* repeated dot wildcard. skip to the end of the target 561 string, and backtrack from there */ 562 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr)); 563 ptr = end; 564 break; 565 566 case SRE_OP_LITERAL: 567 /* repeated literal */ 568 chr = pattern[1]; 569 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr)); 570 while (ptr < end && (SRE_CODE) *ptr == chr) 571 ptr++; 572 break; 573 574 case SRE_OP_LITERAL_IGNORE: 575 /* repeated literal */ 576 chr = pattern[1]; 577 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr)); 578 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr) 579 ptr++; 580 break; 581 582 case SRE_OP_NOT_LITERAL: 583 /* repeated non-literal */ 584 chr = pattern[1]; 585 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr)); 586 while (ptr < end && (SRE_CODE) *ptr != chr) 587 ptr++; 588 break; 589 590 case SRE_OP_NOT_LITERAL_IGNORE: 591 /* repeated non-literal */ 592 chr = pattern[1]; 593 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr)); 594 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr) 595 ptr++; 596 break; 597 598 default: 599 /* repeated single character pattern */ 600 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr)); 601 while ((SRE_CHAR*) state->ptr < end) { 602 i = SRE_MATCH(state, pattern); 603 if (i < 0) 604 return i; 605 if (!i) 606 break; 607 } 608 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, 609 (SRE_CHAR*) state->ptr - ptr)); 610 return (SRE_CHAR*) state->ptr - ptr; 611 } 612 613 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr)); 614 return ptr - (SRE_CHAR*) state->ptr; 615 } 616 617 #if 0 /* not used in this release */ 618 LOCAL(int) 619 SRE_INFO(SRE_STATE* state, SRE_CODE* pattern) 620 { 621 /* check if an SRE_OP_INFO block matches at the current position. 622 returns the number of SRE_CODE objects to skip if successful, 0 623 if no match */ 624 625 SRE_CHAR* end = state->end; 626 SRE_CHAR* ptr = state->ptr; 627 Py_ssize_t i; 628 629 /* check minimal length */ 630 if (pattern[3] && (end - ptr) < pattern[3]) 631 return 0; 632 633 /* check known prefix */ 634 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) { 635 /* <length> <skip> <prefix data> <overlap data> */ 636 for (i = 0; i < pattern[5]; i++) 637 if ((SRE_CODE) ptr[i] != pattern[7 + i]) 638 return 0; 639 return pattern[0] + 2 * pattern[6]; 640 } 641 return pattern[0]; 642 } 643 #endif 644 645 /* The macros below should be used to protect recursive SRE_MATCH() 646 * calls that *failed* and do *not* return immediately (IOW, those 647 * that will backtrack). Explaining: 648 * 649 * - Recursive SRE_MATCH() returned true: that's usually a success 650 * (besides atypical cases like ASSERT_NOT), therefore there's no 651 * reason to restore lastmark; 652 * 653 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH() 654 * is returning to the caller: If the current SRE_MATCH() is the 655 * top function of the recursion, returning false will be a matching 656 * failure, and it doesn't matter where lastmark is pointing to. 657 * If it's *not* the top function, it will be a recursive SRE_MATCH() 658 * failure by itself, and the calling SRE_MATCH() will have to deal 659 * with the failure by the same rules explained here (it will restore 660 * lastmark by itself if necessary); 661 * 662 * - Recursive SRE_MATCH() returned false, and will continue the 663 * outside 'for' loop: must be protected when breaking, since the next 664 * OP could potentially depend on lastmark; 665 * 666 * - Recursive SRE_MATCH() returned false, and will be called again 667 * inside a local for/while loop: must be protected between each 668 * loop iteration, since the recursive SRE_MATCH() could do anything, 669 * and could potentially depend on lastmark. 670 * 671 * For more information, check the discussion at SF patch #712900. 672 */ 673 #define LASTMARK_SAVE() \ 674 do { \ 675 ctx->lastmark = state->lastmark; \ 676 ctx->lastindex = state->lastindex; \ 677 } while (0) 678 #define LASTMARK_RESTORE() \ 679 do { \ 680 state->lastmark = ctx->lastmark; \ 681 state->lastindex = ctx->lastindex; \ 682 } while (0) 683 684 #define RETURN_ERROR(i) do { return i; } while(0) 685 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0) 686 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0) 687 688 #define RETURN_ON_ERROR(i) \ 689 do { if (i < 0) RETURN_ERROR(i); } while (0) 690 #define RETURN_ON_SUCCESS(i) \ 691 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0) 692 #define RETURN_ON_FAILURE(i) \ 693 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0) 694 695 #define SFY(x) #x 696 697 #define DATA_STACK_ALLOC(state, type, ptr) \ 698 do { \ 699 alloc_pos = state->data_stack_base; \ 700 TRACE(("allocating %s in %d (%d)\n", \ 701 SFY(type), alloc_pos, sizeof(type))); \ 702 if (state->data_stack_size < alloc_pos+sizeof(type)) { \ 703 int j = data_stack_grow(state, sizeof(type)); \ 704 if (j < 0) return j; \ 705 if (ctx_pos != -1) \ 706 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \ 707 } \ 708 ptr = (type*)(state->data_stack+alloc_pos); \ 709 state->data_stack_base += sizeof(type); \ 710 } while (0) 711 712 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \ 713 do { \ 714 TRACE(("looking up %s at %d\n", SFY(type), pos)); \ 715 ptr = (type*)(state->data_stack+pos); \ 716 } while (0) 717 718 #define DATA_STACK_PUSH(state, data, size) \ 719 do { \ 720 TRACE(("copy data in %p to %d (%d)\n", \ 721 data, state->data_stack_base, size)); \ 722 if (state->data_stack_size < state->data_stack_base+size) { \ 723 int j = data_stack_grow(state, size); \ 724 if (j < 0) return j; \ 725 if (ctx_pos != -1) \ 726 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \ 727 } \ 728 memcpy(state->data_stack+state->data_stack_base, data, size); \ 729 state->data_stack_base += size; \ 730 } while (0) 731 732 #define DATA_STACK_POP(state, data, size, discard) \ 733 do { \ 734 TRACE(("copy data to %p from %d (%d)\n", \ 735 data, state->data_stack_base-size, size)); \ 736 memcpy(data, state->data_stack+state->data_stack_base-size, size); \ 737 if (discard) \ 738 state->data_stack_base -= size; \ 739 } while (0) 740 741 #define DATA_STACK_POP_DISCARD(state, size) \ 742 do { \ 743 TRACE(("discard data from %d (%d)\n", \ 744 state->data_stack_base-size, size)); \ 745 state->data_stack_base -= size; \ 746 } while(0) 747 748 #define DATA_PUSH(x) \ 749 DATA_STACK_PUSH(state, (x), sizeof(*(x))) 750 #define DATA_POP(x) \ 751 DATA_STACK_POP(state, (x), sizeof(*(x)), 1) 752 #define DATA_POP_DISCARD(x) \ 753 DATA_STACK_POP_DISCARD(state, sizeof(*(x))) 754 #define DATA_ALLOC(t,p) \ 755 DATA_STACK_ALLOC(state, t, p) 756 #define DATA_LOOKUP_AT(t,p,pos) \ 757 DATA_STACK_LOOKUP_AT(state,t,p,pos) 758 759 #define MARK_PUSH(lastmark) \ 760 do if (lastmark > 0) { \ 761 i = lastmark; /* ctx->lastmark may change if reallocated */ \ 762 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \ 763 } while (0) 764 #define MARK_POP(lastmark) \ 765 do if (lastmark > 0) { \ 766 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \ 767 } while (0) 768 #define MARK_POP_KEEP(lastmark) \ 769 do if (lastmark > 0) { \ 770 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \ 771 } while (0) 772 #define MARK_POP_DISCARD(lastmark) \ 773 do if (lastmark > 0) { \ 774 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \ 775 } while (0) 776 777 #define JUMP_NONE 0 778 #define JUMP_MAX_UNTIL_1 1 779 #define JUMP_MAX_UNTIL_2 2 780 #define JUMP_MAX_UNTIL_3 3 781 #define JUMP_MIN_UNTIL_1 4 782 #define JUMP_MIN_UNTIL_2 5 783 #define JUMP_MIN_UNTIL_3 6 784 #define JUMP_REPEAT 7 785 #define JUMP_REPEAT_ONE_1 8 786 #define JUMP_REPEAT_ONE_2 9 787 #define JUMP_MIN_REPEAT_ONE 10 788 #define JUMP_BRANCH 11 789 #define JUMP_ASSERT 12 790 #define JUMP_ASSERT_NOT 13 791 792 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \ 793 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \ 794 nextctx->last_ctx_pos = ctx_pos; \ 795 nextctx->jump = jumpvalue; \ 796 nextctx->pattern = nextpattern; \ 797 ctx_pos = alloc_pos; \ 798 ctx = nextctx; \ 799 goto entrance; \ 800 jumplabel: \ 801 while (0) /* gcc doesn't like labels at end of scopes */ \ 802 803 typedef struct { 804 Py_ssize_t last_ctx_pos; 805 Py_ssize_t jump; 806 SRE_CHAR* ptr; 807 SRE_CODE* pattern; 808 Py_ssize_t count; 809 Py_ssize_t lastmark; 810 Py_ssize_t lastindex; 811 union { 812 SRE_CODE chr; 813 SRE_REPEAT* rep; 814 } u; 815 } SRE_MATCH_CONTEXT; 816 817 /* check if string matches the given pattern. returns <0 for 818 error, 0 for failure, and 1 for success */ 819 LOCAL(Py_ssize_t) 820 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) 821 { 822 SRE_CHAR* end = (SRE_CHAR *)state->end; 823 Py_ssize_t alloc_pos, ctx_pos = -1; 824 Py_ssize_t i, ret = 0; 825 Py_ssize_t jump; 826 unsigned int sigcount=0; 827 828 SRE_MATCH_CONTEXT* ctx; 829 SRE_MATCH_CONTEXT* nextctx; 830 831 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr)); 832 833 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx); 834 ctx->last_ctx_pos = -1; 835 ctx->jump = JUMP_NONE; 836 ctx->pattern = pattern; 837 ctx_pos = alloc_pos; 838 839 entrance: 840 841 ctx->ptr = (SRE_CHAR *)state->ptr; 842 843 if (ctx->pattern[0] == SRE_OP_INFO) { 844 /* optimization info block */ 845 /* <INFO> <1=skip> <2=flags> <3=min> ... */ 846 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) { 847 TRACE(("reject (got %d chars, need %d)\n", 848 (end - ctx->ptr), ctx->pattern[3])); 849 RETURN_FAILURE; 850 } 851 ctx->pattern += ctx->pattern[1] + 1; 852 } 853 854 for (;;) { 855 ++sigcount; 856 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals()) 857 RETURN_ERROR(SRE_ERROR_INTERRUPTED); 858 859 switch (*ctx->pattern++) { 860 861 case SRE_OP_MARK: 862 /* set mark */ 863 /* <MARK> <gid> */ 864 TRACE(("|%p|%p|MARK %d\n", ctx->pattern, 865 ctx->ptr, ctx->pattern[0])); 866 i = ctx->pattern[0]; 867 if (i & 1) 868 state->lastindex = i/2 + 1; 869 if (i > state->lastmark) { 870 /* state->lastmark is the highest valid index in the 871 state->mark array. If it is increased by more than 1, 872 the intervening marks must be set to NULL to signal 873 that these marks have not been encountered. */ 874 Py_ssize_t j = state->lastmark + 1; 875 while (j < i) 876 state->mark[j++] = NULL; 877 state->lastmark = i; 878 } 879 state->mark[i] = ctx->ptr; 880 ctx->pattern++; 881 break; 882 883 case SRE_OP_LITERAL: 884 /* match literal string */ 885 /* <LITERAL> <code> */ 886 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern, 887 ctx->ptr, *ctx->pattern)); 888 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0]) 889 RETURN_FAILURE; 890 ctx->pattern++; 891 ctx->ptr++; 892 break; 893 894 case SRE_OP_NOT_LITERAL: 895 /* match anything that is not literal character */ 896 /* <NOT_LITERAL> <code> */ 897 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern, 898 ctx->ptr, *ctx->pattern)); 899 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0]) 900 RETURN_FAILURE; 901 ctx->pattern++; 902 ctx->ptr++; 903 break; 904 905 case SRE_OP_SUCCESS: 906 /* end of pattern */ 907 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr)); 908 state->ptr = ctx->ptr; 909 RETURN_SUCCESS; 910 911 case SRE_OP_AT: 912 /* match at given position */ 913 /* <AT> <code> */ 914 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern)); 915 if (!SRE_AT(state, ctx->ptr, *ctx->pattern)) 916 RETURN_FAILURE; 917 ctx->pattern++; 918 break; 919 920 case SRE_OP_CATEGORY: 921 /* match at given category */ 922 /* <CATEGORY> <code> */ 923 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern, 924 ctx->ptr, *ctx->pattern)); 925 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0])) 926 RETURN_FAILURE; 927 ctx->pattern++; 928 ctx->ptr++; 929 break; 930 931 case SRE_OP_ANY: 932 /* match anything (except a newline) */ 933 /* <ANY> */ 934 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr)); 935 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0])) 936 RETURN_FAILURE; 937 ctx->ptr++; 938 break; 939 940 case SRE_OP_ANY_ALL: 941 /* match anything */ 942 /* <ANY_ALL> */ 943 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr)); 944 if (ctx->ptr >= end) 945 RETURN_FAILURE; 946 ctx->ptr++; 947 break; 948 949 case SRE_OP_IN: 950 /* match set member (or non_member) */ 951 /* <IN> <skip> <set> */ 952 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr)); 953 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr)) 954 RETURN_FAILURE; 955 ctx->pattern += ctx->pattern[0]; 956 ctx->ptr++; 957 break; 958 959 case SRE_OP_LITERAL_IGNORE: 960 TRACE(("|%p|%p|LITERAL_IGNORE %d\n", 961 ctx->pattern, ctx->ptr, ctx->pattern[0])); 962 if (ctx->ptr >= end || 963 state->lower(*ctx->ptr) != state->lower(*ctx->pattern)) 964 RETURN_FAILURE; 965 ctx->pattern++; 966 ctx->ptr++; 967 break; 968 969 case SRE_OP_NOT_LITERAL_IGNORE: 970 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", 971 ctx->pattern, ctx->ptr, *ctx->pattern)); 972 if (ctx->ptr >= end || 973 state->lower(*ctx->ptr) == state->lower(*ctx->pattern)) 974 RETURN_FAILURE; 975 ctx->pattern++; 976 ctx->ptr++; 977 break; 978 979 case SRE_OP_IN_IGNORE: 980 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr)); 981 if (ctx->ptr >= end 982 || !SRE_CHARSET(ctx->pattern+1, 983 (SRE_CODE)state->lower(*ctx->ptr))) 984 RETURN_FAILURE; 985 ctx->pattern += ctx->pattern[0]; 986 ctx->ptr++; 987 break; 988 989 case SRE_OP_JUMP: 990 case SRE_OP_INFO: 991 /* jump forward */ 992 /* <JUMP> <offset> */ 993 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern, 994 ctx->ptr, ctx->pattern[0])); 995 ctx->pattern += ctx->pattern[0]; 996 break; 997 998 case SRE_OP_BRANCH: 999 /* alternation */ 1000 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */ 1001 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr)); 1002 LASTMARK_SAVE(); 1003 ctx->u.rep = state->repeat; 1004 if (ctx->u.rep) 1005 MARK_PUSH(ctx->lastmark); 1006 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) { 1007 if (ctx->pattern[1] == SRE_OP_LITERAL && 1008 (ctx->ptr >= end || 1009 (SRE_CODE) *ctx->ptr != ctx->pattern[2])) 1010 continue; 1011 if (ctx->pattern[1] == SRE_OP_IN && 1012 (ctx->ptr >= end || 1013 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr))) 1014 continue; 1015 state->ptr = ctx->ptr; 1016 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1); 1017 if (ret) { 1018 if (ctx->u.rep) 1019 MARK_POP_DISCARD(ctx->lastmark); 1020 RETURN_ON_ERROR(ret); 1021 RETURN_SUCCESS; 1022 } 1023 if (ctx->u.rep) 1024 MARK_POP_KEEP(ctx->lastmark); 1025 LASTMARK_RESTORE(); 1026 } 1027 if (ctx->u.rep) 1028 MARK_POP_DISCARD(ctx->lastmark); 1029 RETURN_FAILURE; 1030 1031 case SRE_OP_REPEAT_ONE: 1032 /* match repeated sequence (maximizing regexp) */ 1033 1034 /* this operator only works if the repeated item is 1035 exactly one character wide, and we're not already 1036 collecting backtracking points. for other cases, 1037 use the MAX_REPEAT operator */ 1038 1039 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */ 1040 1041 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr, 1042 ctx->pattern[1], ctx->pattern[2])); 1043 1044 if (ctx->ptr + ctx->pattern[1] > end) 1045 RETURN_FAILURE; /* cannot match */ 1046 1047 state->ptr = ctx->ptr; 1048 1049 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]); 1050 RETURN_ON_ERROR(ret); 1051 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos); 1052 ctx->count = ret; 1053 ctx->ptr += ctx->count; 1054 1055 /* when we arrive here, count contains the number of 1056 matches, and ctx->ptr points to the tail of the target 1057 string. check if the rest of the pattern matches, 1058 and backtrack if not. */ 1059 1060 if (ctx->count < (Py_ssize_t) ctx->pattern[1]) 1061 RETURN_FAILURE; 1062 1063 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) { 1064 /* tail is empty. we're finished */ 1065 state->ptr = ctx->ptr; 1066 RETURN_SUCCESS; 1067 } 1068 1069 LASTMARK_SAVE(); 1070 1071 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) { 1072 /* tail starts with a literal. skip positions where 1073 the rest of the pattern cannot possibly match */ 1074 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1]; 1075 for (;;) { 1076 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] && 1077 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) { 1078 ctx->ptr--; 1079 ctx->count--; 1080 } 1081 if (ctx->count < (Py_ssize_t) ctx->pattern[1]) 1082 break; 1083 state->ptr = ctx->ptr; 1084 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1, 1085 ctx->pattern+ctx->pattern[0]); 1086 if (ret) { 1087 RETURN_ON_ERROR(ret); 1088 RETURN_SUCCESS; 1089 } 1090 1091 LASTMARK_RESTORE(); 1092 1093 ctx->ptr--; 1094 ctx->count--; 1095 } 1096 1097 } else { 1098 /* general case */ 1099 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) { 1100 state->ptr = ctx->ptr; 1101 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2, 1102 ctx->pattern+ctx->pattern[0]); 1103 if (ret) { 1104 RETURN_ON_ERROR(ret); 1105 RETURN_SUCCESS; 1106 } 1107 ctx->ptr--; 1108 ctx->count--; 1109 LASTMARK_RESTORE(); 1110 } 1111 } 1112 RETURN_FAILURE; 1113 1114 case SRE_OP_MIN_REPEAT_ONE: 1115 /* match repeated sequence (minimizing regexp) */ 1116 1117 /* this operator only works if the repeated item is 1118 exactly one character wide, and we're not already 1119 collecting backtracking points. for other cases, 1120 use the MIN_REPEAT operator */ 1121 1122 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */ 1123 1124 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr, 1125 ctx->pattern[1], ctx->pattern[2])); 1126 1127 if (ctx->ptr + ctx->pattern[1] > end) 1128 RETURN_FAILURE; /* cannot match */ 1129 1130 state->ptr = ctx->ptr; 1131 1132 if (ctx->pattern[1] == 0) 1133 ctx->count = 0; 1134 else { 1135 /* count using pattern min as the maximum */ 1136 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]); 1137 RETURN_ON_ERROR(ret); 1138 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos); 1139 if (ret < (Py_ssize_t) ctx->pattern[1]) 1140 /* didn't match minimum number of times */ 1141 RETURN_FAILURE; 1142 /* advance past minimum matches of repeat */ 1143 ctx->count = ret; 1144 ctx->ptr += ctx->count; 1145 } 1146 1147 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) { 1148 /* tail is empty. we're finished */ 1149 state->ptr = ctx->ptr; 1150 RETURN_SUCCESS; 1151 1152 } else { 1153 /* general case */ 1154 LASTMARK_SAVE(); 1155 while ((Py_ssize_t)ctx->pattern[2] == 65535 1156 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) { 1157 state->ptr = ctx->ptr; 1158 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one, 1159 ctx->pattern+ctx->pattern[0]); 1160 if (ret) { 1161 RETURN_ON_ERROR(ret); 1162 RETURN_SUCCESS; 1163 } 1164 state->ptr = ctx->ptr; 1165 ret = SRE_COUNT(state, ctx->pattern+3, 1); 1166 RETURN_ON_ERROR(ret); 1167 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos); 1168 if (ret == 0) 1169 break; 1170 assert(ret == 1); 1171 ctx->ptr++; 1172 ctx->count++; 1173 LASTMARK_RESTORE(); 1174 } 1175 } 1176 RETURN_FAILURE; 1177 1178 case SRE_OP_REPEAT: 1179 /* create repeat context. all the hard work is done 1180 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ 1181 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */ 1182 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr, 1183 ctx->pattern[1], ctx->pattern[2])); 1184 1185 /* install new repeat context */ 1186 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep)); 1187 if (!ctx->u.rep) { 1188 PyErr_NoMemory(); 1189 RETURN_FAILURE; 1190 } 1191 ctx->u.rep->count = -1; 1192 ctx->u.rep->pattern = ctx->pattern; 1193 ctx->u.rep->prev = state->repeat; 1194 ctx->u.rep->last_ptr = NULL; 1195 state->repeat = ctx->u.rep; 1196 1197 state->ptr = ctx->ptr; 1198 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]); 1199 state->repeat = ctx->u.rep->prev; 1200 PyObject_FREE(ctx->u.rep); 1201 1202 if (ret) { 1203 RETURN_ON_ERROR(ret); 1204 RETURN_SUCCESS; 1205 } 1206 RETURN_FAILURE; 1207 1208 case SRE_OP_MAX_UNTIL: 1209 /* maximizing repeat */ 1210 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */ 1211 1212 /* FIXME: we probably need to deal with zero-width 1213 matches in here... */ 1214 1215 ctx->u.rep = state->repeat; 1216 if (!ctx->u.rep) 1217 RETURN_ERROR(SRE_ERROR_STATE); 1218 1219 state->ptr = ctx->ptr; 1220 1221 ctx->count = ctx->u.rep->count+1; 1222 1223 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern, 1224 ctx->ptr, ctx->count)); 1225 1226 if (ctx->count < ctx->u.rep->pattern[1]) { 1227 /* not enough matches */ 1228 ctx->u.rep->count = ctx->count; 1229 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1, 1230 ctx->u.rep->pattern+3); 1231 if (ret) { 1232 RETURN_ON_ERROR(ret); 1233 RETURN_SUCCESS; 1234 } 1235 ctx->u.rep->count = ctx->count-1; 1236 state->ptr = ctx->ptr; 1237 RETURN_FAILURE; 1238 } 1239 1240 if ((ctx->count < ctx->u.rep->pattern[2] || 1241 ctx->u.rep->pattern[2] == 65535) && 1242 state->ptr != ctx->u.rep->last_ptr) { 1243 /* we may have enough matches, but if we can 1244 match another item, do so */ 1245 ctx->u.rep->count = ctx->count; 1246 LASTMARK_SAVE(); 1247 MARK_PUSH(ctx->lastmark); 1248 /* zero-width match protection */ 1249 DATA_PUSH(&ctx->u.rep->last_ptr); 1250 ctx->u.rep->last_ptr = state->ptr; 1251 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2, 1252 ctx->u.rep->pattern+3); 1253 DATA_POP(&ctx->u.rep->last_ptr); 1254 if (ret) { 1255 MARK_POP_DISCARD(ctx->lastmark); 1256 RETURN_ON_ERROR(ret); 1257 RETURN_SUCCESS; 1258 } 1259 MARK_POP(ctx->lastmark); 1260 LASTMARK_RESTORE(); 1261 ctx->u.rep->count = ctx->count-1; 1262 state->ptr = ctx->ptr; 1263 } 1264 1265 /* cannot match more repeated items here. make sure the 1266 tail matches */ 1267 state->repeat = ctx->u.rep->prev; 1268 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern); 1269 RETURN_ON_SUCCESS(ret); 1270 state->repeat = ctx->u.rep; 1271 state->ptr = ctx->ptr; 1272 RETURN_FAILURE; 1273 1274 case SRE_OP_MIN_UNTIL: 1275 /* minimizing repeat */ 1276 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */ 1277 1278 ctx->u.rep = state->repeat; 1279 if (!ctx->u.rep) 1280 RETURN_ERROR(SRE_ERROR_STATE); 1281 1282 state->ptr = ctx->ptr; 1283 1284 ctx->count = ctx->u.rep->count+1; 1285 1286 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern, 1287 ctx->ptr, ctx->count, ctx->u.rep->pattern)); 1288 1289 if (ctx->count < ctx->u.rep->pattern[1]) { 1290 /* not enough matches */ 1291 ctx->u.rep->count = ctx->count; 1292 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1, 1293 ctx->u.rep->pattern+3); 1294 if (ret) { 1295 RETURN_ON_ERROR(ret); 1296 RETURN_SUCCESS; 1297 } 1298 ctx->u.rep->count = ctx->count-1; 1299 state->ptr = ctx->ptr; 1300 RETURN_FAILURE; 1301 } 1302 1303 LASTMARK_SAVE(); 1304 1305 /* see if the tail matches */ 1306 state->repeat = ctx->u.rep->prev; 1307 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern); 1308 if (ret) { 1309 RETURN_ON_ERROR(ret); 1310 RETURN_SUCCESS; 1311 } 1312 1313 state->repeat = ctx->u.rep; 1314 state->ptr = ctx->ptr; 1315 1316 LASTMARK_RESTORE(); 1317 1318 if (ctx->count >= ctx->u.rep->pattern[2] 1319 && ctx->u.rep->pattern[2] != 65535) 1320 RETURN_FAILURE; 1321 1322 ctx->u.rep->count = ctx->count; 1323 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3, 1324 ctx->u.rep->pattern+3); 1325 if (ret) { 1326 RETURN_ON_ERROR(ret); 1327 RETURN_SUCCESS; 1328 } 1329 ctx->u.rep->count = ctx->count-1; 1330 state->ptr = ctx->ptr; 1331 RETURN_FAILURE; 1332 1333 case SRE_OP_GROUPREF: 1334 /* match backreference */ 1335 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern, 1336 ctx->ptr, ctx->pattern[0])); 1337 i = ctx->pattern[0]; 1338 { 1339 Py_ssize_t groupref = i+i; 1340 if (groupref >= state->lastmark) { 1341 RETURN_FAILURE; 1342 } else { 1343 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; 1344 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; 1345 if (!p || !e || e < p) 1346 RETURN_FAILURE; 1347 while (p < e) { 1348 if (ctx->ptr >= end || *ctx->ptr != *p) 1349 RETURN_FAILURE; 1350 p++; ctx->ptr++; 1351 } 1352 } 1353 } 1354 ctx->pattern++; 1355 break; 1356 1357 case SRE_OP_GROUPREF_IGNORE: 1358 /* match backreference */ 1359 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern, 1360 ctx->ptr, ctx->pattern[0])); 1361 i = ctx->pattern[0]; 1362 { 1363 Py_ssize_t groupref = i+i; 1364 if (groupref >= state->lastmark) { 1365 RETURN_FAILURE; 1366 } else { 1367 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; 1368 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; 1369 if (!p || !e || e < p) 1370 RETURN_FAILURE; 1371 while (p < e) { 1372 if (ctx->ptr >= end || 1373 state->lower(*ctx->ptr) != state->lower(*p)) 1374 RETURN_FAILURE; 1375 p++; ctx->ptr++; 1376 } 1377 } 1378 } 1379 ctx->pattern++; 1380 break; 1381 1382 case SRE_OP_GROUPREF_EXISTS: 1383 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern, 1384 ctx->ptr, ctx->pattern[0])); 1385 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */ 1386 i = ctx->pattern[0]; 1387 { 1388 Py_ssize_t groupref = i+i; 1389 if (groupref >= state->lastmark) { 1390 ctx->pattern += ctx->pattern[1]; 1391 break; 1392 } else { 1393 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; 1394 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; 1395 if (!p || !e || e < p) { 1396 ctx->pattern += ctx->pattern[1]; 1397 break; 1398 } 1399 } 1400 } 1401 ctx->pattern += 2; 1402 break; 1403 1404 case SRE_OP_ASSERT: 1405 /* assert subpattern */ 1406 /* <ASSERT> <skip> <back> <pattern> */ 1407 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern, 1408 ctx->ptr, ctx->pattern[1])); 1409 state->ptr = ctx->ptr - ctx->pattern[1]; 1410 if (state->ptr < state->beginning) 1411 RETURN_FAILURE; 1412 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2); 1413 RETURN_ON_FAILURE(ret); 1414 ctx->pattern += ctx->pattern[0]; 1415 break; 1416 1417 case SRE_OP_ASSERT_NOT: 1418 /* assert not subpattern */ 1419 /* <ASSERT_NOT> <skip> <back> <pattern> */ 1420 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern, 1421 ctx->ptr, ctx->pattern[1])); 1422 state->ptr = ctx->ptr - ctx->pattern[1]; 1423 if (state->ptr >= state->beginning) { 1424 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2); 1425 if (ret) { 1426 RETURN_ON_ERROR(ret); 1427 RETURN_FAILURE; 1428 } 1429 } 1430 ctx->pattern += ctx->pattern[0]; 1431 break; 1432 1433 case SRE_OP_FAILURE: 1434 /* immediate failure */ 1435 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr)); 1436 RETURN_FAILURE; 1437 1438 default: 1439 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr, 1440 ctx->pattern[-1])); 1441 RETURN_ERROR(SRE_ERROR_ILLEGAL); 1442 } 1443 } 1444 1445 exit: 1446 ctx_pos = ctx->last_ctx_pos; 1447 jump = ctx->jump; 1448 DATA_POP_DISCARD(ctx); 1449 if (ctx_pos == -1) 1450 return ret; 1451 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos); 1452 1453 switch (jump) { 1454 case JUMP_MAX_UNTIL_2: 1455 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr)); 1456 goto jump_max_until_2; 1457 case JUMP_MAX_UNTIL_3: 1458 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr)); 1459 goto jump_max_until_3; 1460 case JUMP_MIN_UNTIL_2: 1461 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr)); 1462 goto jump_min_until_2; 1463 case JUMP_MIN_UNTIL_3: 1464 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr)); 1465 goto jump_min_until_3; 1466 case JUMP_BRANCH: 1467 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr)); 1468 goto jump_branch; 1469 case JUMP_MAX_UNTIL_1: 1470 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr)); 1471 goto jump_max_until_1; 1472 case JUMP_MIN_UNTIL_1: 1473 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr)); 1474 goto jump_min_until_1; 1475 case JUMP_REPEAT: 1476 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr)); 1477 goto jump_repeat; 1478 case JUMP_REPEAT_ONE_1: 1479 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr)); 1480 goto jump_repeat_one_1; 1481 case JUMP_REPEAT_ONE_2: 1482 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr)); 1483 goto jump_repeat_one_2; 1484 case JUMP_MIN_REPEAT_ONE: 1485 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr)); 1486 goto jump_min_repeat_one; 1487 case JUMP_ASSERT: 1488 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr)); 1489 goto jump_assert; 1490 case JUMP_ASSERT_NOT: 1491 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr)); 1492 goto jump_assert_not; 1493 case JUMP_NONE: 1494 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret)); 1495 break; 1496 } 1497 1498 return ret; /* should never get here */ 1499 } 1500 1501 LOCAL(Py_ssize_t) 1502 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) 1503 { 1504 SRE_CHAR* ptr = (SRE_CHAR *)state->start; 1505 SRE_CHAR* end = (SRE_CHAR *)state->end; 1506 Py_ssize_t status = 0; 1507 Py_ssize_t prefix_len = 0; 1508 Py_ssize_t prefix_skip = 0; 1509 SRE_CODE* prefix = NULL; 1510 SRE_CODE* charset = NULL; 1511 SRE_CODE* overlap = NULL; 1512 int flags = 0; 1513 1514 if (pattern[0] == SRE_OP_INFO) { 1515 /* optimization info block */ 1516 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */ 1517 1518 flags = pattern[2]; 1519 1520 if (pattern[3] > 1) { 1521 /* adjust end point (but make sure we leave at least one 1522 character in there, so literal search will work) */ 1523 end -= pattern[3]-1; 1524 if (end <= ptr) 1525 end = ptr+1; 1526 } 1527 1528 if (flags & SRE_INFO_PREFIX) { 1529 /* pattern starts with a known prefix */ 1530 /* <length> <skip> <prefix data> <overlap data> */ 1531 prefix_len = pattern[5]; 1532 prefix_skip = pattern[6]; 1533 prefix = pattern + 7; 1534 overlap = prefix + prefix_len - 1; 1535 } else if (flags & SRE_INFO_CHARSET) 1536 /* pattern starts with a character from a known set */ 1537 /* <charset> */ 1538 charset = pattern + 5; 1539 1540 pattern += 1 + pattern[1]; 1541 } 1542 1543 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip)); 1544 TRACE(("charset = %p\n", charset)); 1545 1546 #if defined(USE_FAST_SEARCH) 1547 if (prefix_len > 1) { 1548 /* pattern starts with a known prefix. use the overlap 1549 table to skip forward as fast as we possibly can */ 1550 Py_ssize_t i = 0; 1551 end = (SRE_CHAR *)state->end; 1552 while (ptr < end) { 1553 for (;;) { 1554 if ((SRE_CODE) ptr[0] != prefix[i]) { 1555 if (!i) 1556 break; 1557 else 1558 i = overlap[i]; 1559 } else { 1560 if (++i == prefix_len) { 1561 /* found a potential match */ 1562 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr)); 1563 state->start = ptr + 1 - prefix_len; 1564 state->ptr = ptr + 1 - prefix_len + prefix_skip; 1565 if (flags & SRE_INFO_LITERAL) 1566 return 1; /* we got all of it */ 1567 status = SRE_MATCH(state, pattern + 2*prefix_skip); 1568 if (status != 0) 1569 return status; 1570 /* close but no cigar -- try again */ 1571 i = overlap[i]; 1572 } 1573 break; 1574 } 1575 } 1576 ptr++; 1577 } 1578 return 0; 1579 } 1580 #endif 1581 1582 if (pattern[0] == SRE_OP_LITERAL) { 1583 /* pattern starts with a literal character. this is used 1584 for short prefixes, and if fast search is disabled */ 1585 SRE_CODE chr = pattern[1]; 1586 end = (SRE_CHAR *)state->end; 1587 for (;;) { 1588 while (ptr < end && (SRE_CODE) ptr[0] != chr) 1589 ptr++; 1590 if (ptr >= end) 1591 return 0; 1592 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr)); 1593 state->start = ptr; 1594 state->ptr = ++ptr; 1595 if (flags & SRE_INFO_LITERAL) 1596 return 1; /* we got all of it */ 1597 status = SRE_MATCH(state, pattern + 2); 1598 if (status != 0) 1599 break; 1600 } 1601 } else if (charset) { 1602 /* pattern starts with a character from a known set */ 1603 end = (SRE_CHAR *)state->end; 1604 for (;;) { 1605 while (ptr < end && !SRE_CHARSET(charset, ptr[0])) 1606 ptr++; 1607 if (ptr >= end) 1608 return 0; 1609 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr)); 1610 state->start = ptr; 1611 state->ptr = ptr; 1612 status = SRE_MATCH(state, pattern); 1613 if (status != 0) 1614 break; 1615 ptr++; 1616 } 1617 } else 1618 /* general case */ 1619 while (ptr <= end) { 1620 TRACE(("|%p|%p|SEARCH\n", pattern, ptr)); 1621 state->start = state->ptr = ptr++; 1622 status = SRE_MATCH(state, pattern); 1623 if (status != 0) 1624 break; 1625 } 1626 1627 return status; 1628 } 1629 1630 LOCAL(int) 1631 SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len) 1632 { 1633 /* check if given string is a literal template (i.e. no escapes) */ 1634 while (len-- > 0) 1635 if (*ptr++ == '\\') 1636 return 0; 1637 return 1; 1638 } 1639 1640 #if !defined(SRE_RECURSIVE) 1641 1642 /* -------------------------------------------------------------------- */ 1643 /* factories and destructors */ 1644 1645 /* see sre.h for object declarations */ 1646 static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int); 1647 static PyObject*pattern_scanner(PatternObject*, PyObject*); 1648 1649 static PyObject * 1650 sre_codesize(PyObject* self, PyObject *unused) 1651 { 1652 return Py_BuildValue("l", sizeof(SRE_CODE)); 1653 } 1654 1655 static PyObject * 1656 sre_getlower(PyObject* self, PyObject* args) 1657 { 1658 int character, flags; 1659 if (!PyArg_ParseTuple(args, "ii", &character, &flags)) 1660 return NULL; 1661 if (flags & SRE_FLAG_LOCALE) 1662 return Py_BuildValue("i", sre_lower_locale(character)); 1663 if (flags & SRE_FLAG_UNICODE) 1664 #if defined(HAVE_UNICODE) 1665 return Py_BuildValue("i", sre_lower_unicode(character)); 1666 #else 1667 return Py_BuildValue("i", sre_lower_locale(character)); 1668 #endif 1669 return Py_BuildValue("i", sre_lower(character)); 1670 } 1671 1672 LOCAL(void) 1673 state_reset(SRE_STATE* state) 1674 { 1675 /* FIXME: dynamic! */ 1676 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/ 1677 1678 state->lastmark = -1; 1679 state->lastindex = -1; 1680 1681 state->repeat = NULL; 1682 1683 data_stack_dealloc(state); 1684 } 1685 1686 static void* 1687 getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize) 1688 { 1689 /* given a python object, return a data pointer, a length (in 1690 characters), and a character size. return NULL if the object 1691 is not a string (or not compatible) */ 1692 1693 PyBufferProcs *buffer; 1694 Py_ssize_t size, bytes; 1695 int charsize; 1696 void* ptr; 1697 1698 #if defined(HAVE_UNICODE) 1699 if (PyUnicode_Check(string)) { 1700 /* unicode strings doesn't always support the buffer interface */ 1701 ptr = (void*) PyUnicode_AS_DATA(string); 1702 /* bytes = PyUnicode_GET_DATA_SIZE(string); */ 1703 size = PyUnicode_GET_SIZE(string); 1704 charsize = sizeof(Py_UNICODE); 1705 1706 } else { 1707 #endif 1708 1709 /* get pointer to string buffer */ 1710 buffer = Py_TYPE(string)->tp_as_buffer; 1711 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount || 1712 buffer->bf_getsegcount(string, NULL) != 1) { 1713 PyErr_SetString(PyExc_TypeError, "expected string or buffer"); 1714 return NULL; 1715 } 1716 1717 /* determine buffer size */ 1718 bytes = buffer->bf_getreadbuffer(string, 0, &ptr); 1719 if (bytes < 0) { 1720 PyErr_SetString(PyExc_TypeError, "buffer has negative size"); 1721 return NULL; 1722 } 1723 1724 /* determine character size */ 1725 #if PY_VERSION_HEX >= 0x01060000 1726 size = PyObject_Size(string); 1727 #else 1728 size = PyObject_Length(string); 1729 #endif 1730 1731 if (PyString_Check(string) || bytes == size) 1732 charsize = 1; 1733 #if defined(HAVE_UNICODE) 1734 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE))) 1735 charsize = sizeof(Py_UNICODE); 1736 #endif 1737 else { 1738 PyErr_SetString(PyExc_TypeError, "buffer size mismatch"); 1739 return NULL; 1740 } 1741 1742 #if defined(HAVE_UNICODE) 1743 } 1744 #endif 1745 1746 *p_length = size; 1747 *p_charsize = charsize; 1748 1749 return ptr; 1750 } 1751 1752 LOCAL(PyObject*) 1753 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, 1754 Py_ssize_t start, Py_ssize_t end) 1755 { 1756 /* prepare state object */ 1757 1758 Py_ssize_t length; 1759 int charsize; 1760 void* ptr; 1761 1762 memset(state, 0, sizeof(SRE_STATE)); 1763 1764 state->lastmark = -1; 1765 state->lastindex = -1; 1766 1767 ptr = getstring(string, &length, &charsize); 1768 if (!ptr) 1769 return NULL; 1770 1771 /* adjust boundaries */ 1772 if (start < 0) 1773 start = 0; 1774 else if (start > length) 1775 start = length; 1776 1777 if (end < 0) 1778 end = 0; 1779 else if (end > length) 1780 end = length; 1781 1782 state->charsize = charsize; 1783 1784 state->beginning = ptr; 1785 1786 state->start = (void*) ((char*) ptr + start * state->charsize); 1787 state->end = (void*) ((char*) ptr + end * state->charsize); 1788 1789 Py_INCREF(string); 1790 state->string = string; 1791 state->pos = start; 1792 state->endpos = end; 1793 1794 if (pattern->flags & SRE_FLAG_LOCALE) 1795 state->lower = sre_lower_locale; 1796 else if (pattern->flags & SRE_FLAG_UNICODE) 1797 #if defined(HAVE_UNICODE) 1798 state->lower = sre_lower_unicode; 1799 #else 1800 state->lower = sre_lower_locale; 1801 #endif 1802 else 1803 state->lower = sre_lower; 1804 1805 return string; 1806 } 1807 1808 LOCAL(void) 1809 state_fini(SRE_STATE* state) 1810 { 1811 Py_XDECREF(state->string); 1812 data_stack_dealloc(state); 1813 } 1814 1815 /* calculate offset from start of string */ 1816 #define STATE_OFFSET(state, member)\ 1817 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize) 1818 1819 LOCAL(PyObject*) 1820 state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty) 1821 { 1822 Py_ssize_t i, j; 1823 1824 index = (index - 1) * 2; 1825 1826 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) { 1827 if (empty) 1828 /* want empty string */ 1829 i = j = 0; 1830 else { 1831 Py_INCREF(Py_None); 1832 return Py_None; 1833 } 1834 } else { 1835 i = STATE_OFFSET(state, state->mark[index]); 1836 j = STATE_OFFSET(state, state->mark[index+1]); 1837 } 1838 1839 return PySequence_GetSlice(string, i, j); 1840 } 1841 1842 static void 1843 pattern_error(int status) 1844 { 1845 switch (status) { 1846 case SRE_ERROR_RECURSION_LIMIT: 1847 PyErr_SetString( 1848 PyExc_RuntimeError, 1849 "maximum recursion limit exceeded" 1850 ); 1851 break; 1852 case SRE_ERROR_MEMORY: 1853 PyErr_NoMemory(); 1854 break; 1855 case SRE_ERROR_INTERRUPTED: 1856 /* An exception has already been raised, so let it fly */ 1857 break; 1858 default: 1859 /* other error codes indicate compiler/engine bugs */ 1860 PyErr_SetString( 1861 PyExc_RuntimeError, 1862 "internal error in regular expression engine" 1863 ); 1864 } 1865 } 1866 1867 static void 1868 pattern_dealloc(PatternObject* self) 1869 { 1870 if (self->weakreflist != NULL) 1871 PyObject_ClearWeakRefs((PyObject *) self); 1872 Py_XDECREF(self->pattern); 1873 Py_XDECREF(self->groupindex); 1874 Py_XDECREF(self->indexgroup); 1875 PyObject_DEL(self); 1876 } 1877 1878 static PyObject* 1879 pattern_match(PatternObject* self, PyObject* args, PyObject* kw) 1880 { 1881 SRE_STATE state; 1882 int status; 1883 1884 PyObject* string; 1885 Py_ssize_t start = 0; 1886 Py_ssize_t end = PY_SSIZE_T_MAX; 1887 static char* kwlist[] = { "pattern", "pos", "endpos", NULL }; 1888 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist, 1889 &string, &start, &end)) 1890 return NULL; 1891 1892 string = state_init(&state, self, string, start, end); 1893 if (!string) 1894 return NULL; 1895 1896 state.ptr = state.start; 1897 1898 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr)); 1899 1900 if (state.charsize == 1) { 1901 status = sre_match(&state, PatternObject_GetCode(self)); 1902 } else { 1903 #if defined(HAVE_UNICODE) 1904 status = sre_umatch(&state, PatternObject_GetCode(self)); 1905 #endif 1906 } 1907 1908 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); 1909 if (PyErr_Occurred()) 1910 return NULL; 1911 1912 state_fini(&state); 1913 1914 return pattern_new_match(self, &state, status); 1915 } 1916 1917 static PyObject* 1918 pattern_search(PatternObject* self, PyObject* args, PyObject* kw) 1919 { 1920 SRE_STATE state; 1921 int status; 1922 1923 PyObject* string; 1924 Py_ssize_t start = 0; 1925 Py_ssize_t end = PY_SSIZE_T_MAX; 1926 static char* kwlist[] = { "pattern", "pos", "endpos", NULL }; 1927 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist, 1928 &string, &start, &end)) 1929 return NULL; 1930 1931 string = state_init(&state, self, string, start, end); 1932 if (!string) 1933 return NULL; 1934 1935 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr)); 1936 1937 if (state.charsize == 1) { 1938 status = sre_search(&state, PatternObject_GetCode(self)); 1939 } else { 1940 #if defined(HAVE_UNICODE) 1941 status = sre_usearch(&state, PatternObject_GetCode(self)); 1942 #endif 1943 } 1944 1945 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); 1946 1947 state_fini(&state); 1948 1949 if (PyErr_Occurred()) 1950 return NULL; 1951 1952 return pattern_new_match(self, &state, status); 1953 } 1954 1955 static PyObject* 1956 call(char* module, char* function, PyObject* args) 1957 { 1958 PyObject* name; 1959 PyObject* mod; 1960 PyObject* func; 1961 PyObject* result; 1962 1963 if (!args) 1964 return NULL; 1965 name = PyString_FromString(module); 1966 if (!name) 1967 return NULL; 1968 mod = PyImport_Import(name); 1969 Py_DECREF(name); 1970 if (!mod) 1971 return NULL; 1972 func = PyObject_GetAttrString(mod, function); 1973 Py_DECREF(mod); 1974 if (!func) 1975 return NULL; 1976 result = PyObject_CallObject(func, args); 1977 Py_DECREF(func); 1978 Py_DECREF(args); 1979 return result; 1980 } 1981 1982 #ifdef USE_BUILTIN_COPY 1983 static int 1984 deepcopy(PyObject** object, PyObject* memo) 1985 { 1986 PyObject* copy; 1987 1988 copy = call( 1989 "copy", "deepcopy", 1990 PyTuple_Pack(2, *object, memo) 1991 ); 1992 if (!copy) 1993 return 0; 1994 1995 Py_DECREF(*object); 1996 *object = copy; 1997 1998 return 1; /* success */ 1999 } 2000 #endif 2001 2002 static PyObject* 2003 join_list(PyObject* list, PyObject* string) 2004 { 2005 /* join list elements */ 2006 2007 PyObject* joiner; 2008 #if PY_VERSION_HEX >= 0x01060000 2009 PyObject* function; 2010 PyObject* args; 2011 #endif 2012 PyObject* result; 2013 2014 joiner = PySequence_GetSlice(string, 0, 0); 2015 if (!joiner) 2016 return NULL; 2017 2018 if (PyList_GET_SIZE(list) == 0) { 2019 Py_DECREF(list); 2020 return joiner; 2021 } 2022 2023 #if PY_VERSION_HEX >= 0x01060000 2024 function = PyObject_GetAttrString(joiner, "join"); 2025 if (!function) { 2026 Py_DECREF(joiner); 2027 return NULL; 2028 } 2029 args = PyTuple_New(1); 2030 if (!args) { 2031 Py_DECREF(function); 2032 Py_DECREF(joiner); 2033 return NULL; 2034 } 2035 PyTuple_SET_ITEM(args, 0, list); 2036 result = PyObject_CallObject(function, args); 2037 Py_DECREF(args); /* also removes list */ 2038 Py_DECREF(function); 2039 #else 2040 result = call( 2041 "string", "join", 2042 PyTuple_Pack(2, list, joiner) 2043 ); 2044 #endif 2045 Py_DECREF(joiner); 2046 2047 return result; 2048 } 2049 2050 static PyObject* 2051 pattern_findall(PatternObject* self, PyObject* args, PyObject* kw) 2052 { 2053 SRE_STATE state; 2054 PyObject* list; 2055 int status; 2056 Py_ssize_t i, b, e; 2057 2058 PyObject* string; 2059 Py_ssize_t start = 0; 2060 Py_ssize_t end = PY_SSIZE_T_MAX; 2061 static char* kwlist[] = { "source", "pos", "endpos", NULL }; 2062 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist, 2063 &string, &start, &end)) 2064 return NULL; 2065 2066 string = state_init(&state, self, string, start, end); 2067 if (!string) 2068 return NULL; 2069 2070 list = PyList_New(0); 2071 if (!list) { 2072 state_fini(&state); 2073 return NULL; 2074 } 2075 2076 while (state.start <= state.end) { 2077 2078 PyObject* item; 2079 2080 state_reset(&state); 2081 2082 state.ptr = state.start; 2083 2084 if (state.charsize == 1) { 2085 status = sre_search(&state, PatternObject_GetCode(self)); 2086 } else { 2087 #if defined(HAVE_UNICODE) 2088 status = sre_usearch(&state, PatternObject_GetCode(self)); 2089 #endif 2090 } 2091 2092 if (PyErr_Occurred()) 2093 goto error; 2094 2095 if (status <= 0) { 2096 if (status == 0) 2097 break; 2098 pattern_error(status); 2099 goto error; 2100 } 2101 2102 /* don't bother to build a match object */ 2103 switch (self->groups) { 2104 case 0: 2105 b = STATE_OFFSET(&state, state.start); 2106 e = STATE_OFFSET(&state, state.ptr); 2107 item = PySequence_GetSlice(string, b, e); 2108 if (!item) 2109 goto error; 2110 break; 2111 case 1: 2112 item = state_getslice(&state, 1, string, 1); 2113 if (!item) 2114 goto error; 2115 break; 2116 default: 2117 item = PyTuple_New(self->groups); 2118 if (!item) 2119 goto error; 2120 for (i = 0; i < self->groups; i++) { 2121 PyObject* o = state_getslice(&state, i+1, string, 1); 2122 if (!o) { 2123 Py_DECREF(item); 2124 goto error; 2125 } 2126 PyTuple_SET_ITEM(item, i, o); 2127 } 2128 break; 2129 } 2130 2131 status = PyList_Append(list, item); 2132 Py_DECREF(item); 2133 if (status < 0) 2134 goto error; 2135 2136 if (state.ptr == state.start) 2137 state.start = (void*) ((char*) state.ptr + state.charsize); 2138 else 2139 state.start = state.ptr; 2140 2141 } 2142 2143 state_fini(&state); 2144 return list; 2145 2146 error: 2147 Py_DECREF(list); 2148 state_fini(&state); 2149 return NULL; 2150 2151 } 2152 2153 #if PY_VERSION_HEX >= 0x02020000 2154 static PyObject* 2155 pattern_finditer(PatternObject* pattern, PyObject* args) 2156 { 2157 PyObject* scanner; 2158 PyObject* search; 2159 PyObject* iterator; 2160 2161 scanner = pattern_scanner(pattern, args); 2162 if (!scanner) 2163 return NULL; 2164 2165 search = PyObject_GetAttrString(scanner, "search"); 2166 Py_DECREF(scanner); 2167 if (!search) 2168 return NULL; 2169 2170 iterator = PyCallIter_New(search, Py_None); 2171 Py_DECREF(search); 2172 2173 return iterator; 2174 } 2175 #endif 2176 2177 static PyObject* 2178 pattern_split(PatternObject* self, PyObject* args, PyObject* kw) 2179 { 2180 SRE_STATE state; 2181 PyObject* list; 2182 PyObject* item; 2183 int status; 2184 Py_ssize_t n; 2185 Py_ssize_t i; 2186 void* last; 2187 2188 PyObject* string; 2189 Py_ssize_t maxsplit = 0; 2190 static char* kwlist[] = { "source", "maxsplit", NULL }; 2191 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist, 2192 &string, &maxsplit)) 2193 return NULL; 2194 2195 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX); 2196 if (!string) 2197 return NULL; 2198 2199 list = PyList_New(0); 2200 if (!list) { 2201 state_fini(&state); 2202 return NULL; 2203 } 2204 2205 n = 0; 2206 last = state.start; 2207 2208 while (!maxsplit || n < maxsplit) { 2209 2210 state_reset(&state); 2211 2212 state.ptr = state.start; 2213 2214 if (state.charsize == 1) { 2215 status = sre_search(&state, PatternObject_GetCode(self)); 2216 } else { 2217 #if defined(HAVE_UNICODE) 2218 status = sre_usearch(&state, PatternObject_GetCode(self)); 2219 #endif 2220 } 2221 2222 if (PyErr_Occurred()) 2223 goto error; 2224 2225 if (status <= 0) { 2226 if (status == 0) 2227 break; 2228 pattern_error(status); 2229 goto error; 2230 } 2231 2232 if (state.start == state.ptr) { 2233 if (last == state.end) 2234 break; 2235 /* skip one character */ 2236 state.start = (void*) ((char*) state.ptr + state.charsize); 2237 continue; 2238 } 2239 2240 /* get segment before this match */ 2241 item = PySequence_GetSlice( 2242 string, STATE_OFFSET(&state, last), 2243 STATE_OFFSET(&state, state.start) 2244 ); 2245 if (!item) 2246 goto error; 2247 status = PyList_Append(list, item); 2248 Py_DECREF(item); 2249 if (status < 0) 2250 goto error; 2251 2252 /* add groups (if any) */ 2253 for (i = 0; i < self->groups; i++) { 2254 item = state_getslice(&state, i+1, string, 0); 2255 if (!item) 2256 goto error; 2257 status = PyList_Append(list, item); 2258 Py_DECREF(item); 2259 if (status < 0) 2260 goto error; 2261 } 2262 2263 n = n + 1; 2264 2265 last = state.start = state.ptr; 2266 2267 } 2268 2269 /* get segment following last match (even if empty) */ 2270 item = PySequence_GetSlice( 2271 string, STATE_OFFSET(&state, last), state.endpos 2272 ); 2273 if (!item) 2274 goto error; 2275 status = PyList_Append(list, item); 2276 Py_DECREF(item); 2277 if (status < 0) 2278 goto error; 2279 2280 state_fini(&state); 2281 return list; 2282 2283 error: 2284 Py_DECREF(list); 2285 state_fini(&state); 2286 return NULL; 2287 2288 } 2289 2290 static PyObject* 2291 pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string, 2292 Py_ssize_t count, Py_ssize_t subn) 2293 { 2294 SRE_STATE state; 2295 PyObject* list; 2296 PyObject* item; 2297 PyObject* filter; 2298 PyObject* args; 2299 PyObject* match; 2300 void* ptr; 2301 int status; 2302 Py_ssize_t n; 2303 Py_ssize_t i, b, e; 2304 int bint; 2305 int filter_is_callable; 2306 2307 if (PyCallable_Check(ptemplate)) { 2308 /* sub/subn takes either a function or a template */ 2309 filter = ptemplate; 2310 Py_INCREF(filter); 2311 filter_is_callable = 1; 2312 } else { 2313 /* if not callable, check if it's a literal string */ 2314 int literal; 2315 ptr = getstring(ptemplate, &n, &bint); 2316 b = bint; 2317 if (ptr) { 2318 if (b == 1) { 2319 literal = sre_literal_template((unsigned char *)ptr, n); 2320 } else { 2321 #if defined(HAVE_UNICODE) 2322 literal = sre_uliteral_template((Py_UNICODE *)ptr, n); 2323 #endif 2324 } 2325 } else { 2326 PyErr_Clear(); 2327 literal = 0; 2328 } 2329 if (literal) { 2330 filter = ptemplate; 2331 Py_INCREF(filter); 2332 filter_is_callable = 0; 2333 } else { 2334 /* not a literal; hand it over to the template compiler */ 2335 filter = call( 2336 SRE_PY_MODULE, "_subx", 2337 PyTuple_Pack(2, self, ptemplate) 2338 ); 2339 if (!filter) 2340 return NULL; 2341 filter_is_callable = PyCallable_Check(filter); 2342 } 2343 } 2344 2345 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX); 2346 if (!string) { 2347 Py_DECREF(filter); 2348 return NULL; 2349 } 2350 2351 list = PyList_New(0); 2352 if (!list) { 2353 Py_DECREF(filter); 2354 state_fini(&state); 2355 return NULL; 2356 } 2357 2358 n = i = 0; 2359 2360 while (!count || n < count) { 2361 2362 state_reset(&state); 2363 2364 state.ptr = state.start; 2365 2366 if (state.charsize == 1) { 2367 status = sre_search(&state, PatternObject_GetCode(self)); 2368 } else { 2369 #if defined(HAVE_UNICODE) 2370 status = sre_usearch(&state, PatternObject_GetCode(self)); 2371 #endif 2372 } 2373 2374 if (PyErr_Occurred()) 2375 goto error; 2376 2377 if (status <= 0) { 2378 if (status == 0) 2379 break; 2380 pattern_error(status); 2381 goto error; 2382 } 2383 2384 b = STATE_OFFSET(&state, state.start); 2385 e = STATE_OFFSET(&state, state.ptr); 2386 2387 if (i < b) { 2388 /* get segment before this match */ 2389 item = PySequence_GetSlice(string, i, b); 2390 if (!item) 2391 goto error; 2392 status = PyList_Append(list, item); 2393 Py_DECREF(item); 2394 if (status < 0) 2395 goto error; 2396 2397 } else if (i == b && i == e && n > 0) 2398 /* ignore empty match on latest position */ 2399 goto next; 2400 2401 if (filter_is_callable) { 2402 /* pass match object through filter */ 2403 match = pattern_new_match(self, &state, 1); 2404 if (!match) 2405 goto error; 2406 args = PyTuple_Pack(1, match); 2407 if (!args) { 2408 Py_DECREF(match); 2409 goto error; 2410 } 2411 item = PyObject_CallObject(filter, args); 2412 Py_DECREF(args); 2413 Py_DECREF(match); 2414 if (!item) 2415 goto error; 2416 } else { 2417 /* filter is literal string */ 2418 item = filter; 2419 Py_INCREF(item); 2420 } 2421 2422 /* add to list */ 2423 if (item != Py_None) { 2424 status = PyList_Append(list, item); 2425 Py_DECREF(item); 2426 if (status < 0) 2427 goto error; 2428 } 2429 2430 i = e; 2431 n = n + 1; 2432 2433 next: 2434 /* move on */ 2435 if (state.ptr == state.start) 2436 state.start = (void*) ((char*) state.ptr + state.charsize); 2437 else 2438 state.start = state.ptr; 2439 2440 } 2441 2442 /* get segment following last match */ 2443 if (i < state.endpos) { 2444 item = PySequence_GetSlice(string, i, state.endpos); 2445 if (!item) 2446 goto error; 2447 status = PyList_Append(list, item); 2448 Py_DECREF(item); 2449 if (status < 0) 2450 goto error; 2451 } 2452 2453 state_fini(&state); 2454 2455 Py_DECREF(filter); 2456 2457 /* convert list to single string (also removes list) */ 2458 item = join_list(list, string); 2459 2460 if (!item) 2461 return NULL; 2462 2463 if (subn) 2464 return Py_BuildValue("Ni", item, n); 2465 2466 return item; 2467 2468 error: 2469 Py_DECREF(list); 2470 state_fini(&state); 2471 Py_DECREF(filter); 2472 return NULL; 2473 2474 } 2475 2476 static PyObject* 2477 pattern_sub(PatternObject* self, PyObject* args, PyObject* kw) 2478 { 2479 PyObject* ptemplate; 2480 PyObject* string; 2481 Py_ssize_t count = 0; 2482 static char* kwlist[] = { "repl", "string", "count", NULL }; 2483 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist, 2484 &ptemplate, &string, &count)) 2485 return NULL; 2486 2487 return pattern_subx(self, ptemplate, string, count, 0); 2488 } 2489 2490 static PyObject* 2491 pattern_subn(PatternObject* self, PyObject* args, PyObject* kw) 2492 { 2493 PyObject* ptemplate; 2494 PyObject* string; 2495 Py_ssize_t count = 0; 2496 static char* kwlist[] = { "repl", "string", "count", NULL }; 2497 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist, 2498 &ptemplate, &string, &count)) 2499 return NULL; 2500 2501 return pattern_subx(self, ptemplate, string, count, 1); 2502 } 2503 2504 static PyObject* 2505 pattern_copy(PatternObject* self, PyObject *unused) 2506 { 2507 #ifdef USE_BUILTIN_COPY 2508 PatternObject* copy; 2509 int offset; 2510 2511 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize); 2512 if (!copy) 2513 return NULL; 2514 2515 offset = offsetof(PatternObject, groups); 2516 2517 Py_XINCREF(self->groupindex); 2518 Py_XINCREF(self->indexgroup); 2519 Py_XINCREF(self->pattern); 2520 2521 memcpy((char*) copy + offset, (char*) self + offset, 2522 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset); 2523 copy->weakreflist = NULL; 2524 2525 return (PyObject*) copy; 2526 #else 2527 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object"); 2528 return NULL; 2529 #endif 2530 } 2531 2532 static PyObject* 2533 pattern_deepcopy(PatternObject* self, PyObject* memo) 2534 { 2535 #ifdef USE_BUILTIN_COPY 2536 PatternObject* copy; 2537 2538 copy = (PatternObject*) pattern_copy(self); 2539 if (!copy) 2540 return NULL; 2541 2542 if (!deepcopy(©->groupindex, memo) || 2543 !deepcopy(©->indexgroup, memo) || 2544 !deepcopy(©->pattern, memo)) { 2545 Py_DECREF(copy); 2546 return NULL; 2547 } 2548 2549 #else 2550 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object"); 2551 return NULL; 2552 #endif 2553 } 2554 2555 PyDoc_STRVAR(pattern_match_doc, 2556 "match(string[, pos[, endpos]]) --> match object or None.\n\ 2557 Matches zero or more characters at the beginning of the string"); 2558 2559 PyDoc_STRVAR(pattern_search_doc, 2560 "search(string[, pos[, endpos]]) --> match object or None.\n\ 2561 Scan through string looking for a match, and return a corresponding\n\ 2562 MatchObject instance. Return None if no position in the string matches."); 2563 2564 PyDoc_STRVAR(pattern_split_doc, 2565 "split(string[, maxsplit = 0]) --> list.\n\ 2566 Split string by the occurrences of pattern."); 2567 2568 PyDoc_STRVAR(pattern_findall_doc, 2569 "findall(string[, pos[, endpos]]) --> list.\n\ 2570 Return a list of all non-overlapping matches of pattern in string."); 2571 2572 PyDoc_STRVAR(pattern_finditer_doc, 2573 "finditer(string[, pos[, endpos]]) --> iterator.\n\ 2574 Return an iterator over all non-overlapping matches for the \n\ 2575 RE pattern in string. For each match, the iterator returns a\n\ 2576 match object."); 2577 2578 PyDoc_STRVAR(pattern_sub_doc, 2579 "sub(repl, string[, count = 0]) --> newstring\n\ 2580 Return the string obtained by replacing the leftmost non-overlapping\n\ 2581 occurrences of pattern in string by the replacement repl."); 2582 2583 PyDoc_STRVAR(pattern_subn_doc, 2584 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\ 2585 Return the tuple (new_string, number_of_subs_made) found by replacing\n\ 2586 the leftmost non-overlapping occurrences of pattern with the\n\ 2587 replacement repl."); 2588 2589 PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects"); 2590 2591 static PyMethodDef pattern_methods[] = { 2592 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS, 2593 pattern_match_doc}, 2594 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS, 2595 pattern_search_doc}, 2596 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS, 2597 pattern_sub_doc}, 2598 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS, 2599 pattern_subn_doc}, 2600 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS, 2601 pattern_split_doc}, 2602 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS, 2603 pattern_findall_doc}, 2604 #if PY_VERSION_HEX >= 0x02020000 2605 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS, 2606 pattern_finditer_doc}, 2607 #endif 2608 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS}, 2609 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS}, 2610 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O}, 2611 {NULL, NULL} 2612 }; 2613 2614 #define PAT_OFF(x) offsetof(PatternObject, x) 2615 static PyMemberDef pattern_members[] = { 2616 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY}, 2617 {"flags", T_INT, PAT_OFF(flags), READONLY}, 2618 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY}, 2619 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY}, 2620 {NULL} /* Sentinel */ 2621 }; 2622 2623 statichere PyTypeObject Pattern_Type = { 2624 PyObject_HEAD_INIT(NULL) 2625 0, "_" SRE_MODULE ".SRE_Pattern", 2626 sizeof(PatternObject), sizeof(SRE_CODE), 2627 (destructor)pattern_dealloc, /*tp_dealloc*/ 2628 0, /* tp_print */ 2629 0, /* tp_getattrn */ 2630 0, /* tp_setattr */ 2631 0, /* tp_compare */ 2632 0, /* tp_repr */ 2633 0, /* tp_as_number */ 2634 0, /* tp_as_sequence */ 2635 0, /* tp_as_mapping */ 2636 0, /* tp_hash */ 2637 0, /* tp_call */ 2638 0, /* tp_str */ 2639 0, /* tp_getattro */ 2640 0, /* tp_setattro */ 2641 0, /* tp_as_buffer */ 2642 Py_TPFLAGS_DEFAULT, /* tp_flags */ 2643 pattern_doc, /* tp_doc */ 2644 0, /* tp_traverse */ 2645 0, /* tp_clear */ 2646 0, /* tp_richcompare */ 2647 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */ 2648 0, /* tp_iter */ 2649 0, /* tp_iternext */ 2650 pattern_methods, /* tp_methods */ 2651 pattern_members, /* tp_members */ 2652 }; 2653 2654 static int _validate(PatternObject *self); /* Forward */ 2655 2656 static PyObject * 2657 _compile(PyObject* self_, PyObject* args) 2658 { 2659 /* "compile" pattern descriptor to pattern object */ 2660 2661 PatternObject* self; 2662 Py_ssize_t i, n; 2663 2664 PyObject* pattern; 2665 int flags = 0; 2666 PyObject* code; 2667 Py_ssize_t groups = 0; 2668 PyObject* groupindex = NULL; 2669 PyObject* indexgroup = NULL; 2670 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags, 2671 &PyList_Type, &code, &groups, 2672 &groupindex, &indexgroup)) 2673 return NULL; 2674 2675 n = PyList_GET_SIZE(code); 2676 /* coverity[ampersand_in_size] */ 2677 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n); 2678 if (!self) 2679 return NULL; 2680 self->weakreflist = NULL; 2681 self->pattern = NULL; 2682 self->groupindex = NULL; 2683 self->indexgroup = NULL; 2684 2685 self->codesize = n; 2686 2687 for (i = 0; i < n; i++) { 2688 PyObject *o = PyList_GET_ITEM(code, i); 2689 unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o) 2690 : PyLong_AsUnsignedLong(o); 2691 self->code[i] = (SRE_CODE) value; 2692 if ((unsigned long) self->code[i] != value) { 2693 PyErr_SetString(PyExc_OverflowError, 2694 "regular expression code size limit exceeded"); 2695 break; 2696 } 2697 } 2698 2699 if (PyErr_Occurred()) { 2700 Py_DECREF(self); 2701 return NULL; 2702 } 2703 2704 Py_INCREF(pattern); 2705 self->pattern = pattern; 2706 2707 self->flags = flags; 2708 2709 self->groups = groups; 2710 2711 Py_XINCREF(groupindex); 2712 self->groupindex = groupindex; 2713 2714 Py_XINCREF(indexgroup); 2715 self->indexgroup = indexgroup; 2716 2717 self->weakreflist = NULL; 2718 2719 if (!_validate(self)) { 2720 Py_DECREF(self); 2721 return NULL; 2722 } 2723 2724 return (PyObject*) self; 2725 } 2726 2727 /* -------------------------------------------------------------------- */ 2728 /* Code validation */ 2729 2730 /* To learn more about this code, have a look at the _compile() function in 2731 Lib/sre_compile.py. The validation functions below checks the code array 2732 for conformance with the code patterns generated there. 2733 2734 The nice thing about the generated code is that it is position-independent: 2735 all jumps are relative jumps forward. Also, jumps don't cross each other: 2736 the target of a later jump is always earlier than the target of an earlier 2737 jump. IOW, this is okay: 2738 2739 J---------J-------T--------T 2740 \ \_____/ / 2741 \______________________/ 2742 2743 but this is not: 2744 2745 J---------J-------T--------T 2746 \_________\_____/ / 2747 \____________/ 2748 2749 It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4 2750 bytes wide (the latter if Python is compiled for "wide" unicode support). 2751 */ 2752 2753 /* Defining this one enables tracing of the validator */ 2754 #undef VVERBOSE 2755 2756 /* Trace macro for the validator */ 2757 #if defined(VVERBOSE) 2758 #define VTRACE(v) printf v 2759 #else 2760 #define VTRACE(v) 2761 #endif 2762 2763 /* Report failure */ 2764 #define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0) 2765 2766 /* Extract opcode, argument, or skip count from code array */ 2767 #define GET_OP \ 2768 do { \ 2769 VTRACE(("%p: ", code)); \ 2770 if (code >= end) FAIL; \ 2771 op = *code++; \ 2772 VTRACE(("%lu (op)\n", (unsigned long)op)); \ 2773 } while (0) 2774 #define GET_ARG \ 2775 do { \ 2776 VTRACE(("%p= ", code)); \ 2777 if (code >= end) FAIL; \ 2778 arg = *code++; \ 2779 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \ 2780 } while (0) 2781 #define GET_SKIP_ADJ(adj) \ 2782 do { \ 2783 VTRACE(("%p= ", code)); \ 2784 if (code >= end) FAIL; \ 2785 skip = *code; \ 2786 VTRACE(("%lu (skip to %p)\n", \ 2787 (unsigned long)skip, code+skip)); \ 2788 if (code+skip-adj < code || code+skip-adj > end)\ 2789 FAIL; \ 2790 code++; \ 2791 } while (0) 2792 #define GET_SKIP GET_SKIP_ADJ(0) 2793 2794 static int 2795 _validate_charset(SRE_CODE *code, SRE_CODE *end) 2796 { 2797 /* Some variables are manipulated by the macros above */ 2798 SRE_CODE op; 2799 SRE_CODE arg; 2800 SRE_CODE offset; 2801 int i; 2802 2803 while (code < end) { 2804 GET_OP; 2805 switch (op) { 2806 2807 case SRE_OP_NEGATE: 2808 break; 2809 2810 case SRE_OP_LITERAL: 2811 GET_ARG; 2812 break; 2813 2814 case SRE_OP_RANGE: 2815 GET_ARG; 2816 GET_ARG; 2817 break; 2818 2819 case SRE_OP_CHARSET: 2820 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */ 2821 if (code+offset < code || code+offset > end) 2822 FAIL; 2823 code += offset; 2824 break; 2825 2826 case SRE_OP_BIGCHARSET: 2827 GET_ARG; /* Number of blocks */ 2828 offset = 256/sizeof(SRE_CODE); /* 256-byte table */ 2829 if (code+offset < code || code+offset > end) 2830 FAIL; 2831 /* Make sure that each byte points to a valid block */ 2832 for (i = 0; i < 256; i++) { 2833 if (((unsigned char *)code)[i] >= arg) 2834 FAIL; 2835 } 2836 code += offset; 2837 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */ 2838 if (code+offset < code || code+offset > end) 2839 FAIL; 2840 code += offset; 2841 break; 2842 2843 case SRE_OP_CATEGORY: 2844 GET_ARG; 2845 switch (arg) { 2846 case SRE_CATEGORY_DIGIT: 2847 case SRE_CATEGORY_NOT_DIGIT: 2848 case SRE_CATEGORY_SPACE: 2849 case SRE_CATEGORY_NOT_SPACE: 2850 case SRE_CATEGORY_WORD: 2851 case SRE_CATEGORY_NOT_WORD: 2852 case SRE_CATEGORY_LINEBREAK: 2853 case SRE_CATEGORY_NOT_LINEBREAK: 2854 case SRE_CATEGORY_LOC_WORD: 2855 case SRE_CATEGORY_LOC_NOT_WORD: 2856 case SRE_CATEGORY_UNI_DIGIT: 2857 case SRE_CATEGORY_UNI_NOT_DIGIT: 2858 case SRE_CATEGORY_UNI_SPACE: 2859 case SRE_CATEGORY_UNI_NOT_SPACE: 2860 case SRE_CATEGORY_UNI_WORD: 2861 case SRE_CATEGORY_UNI_NOT_WORD: 2862 case SRE_CATEGORY_UNI_LINEBREAK: 2863 case SRE_CATEGORY_UNI_NOT_LINEBREAK: 2864 break; 2865 default: 2866 FAIL; 2867 } 2868 break; 2869 2870 default: 2871 FAIL; 2872 2873 } 2874 } 2875 2876 return 1; 2877 } 2878 2879 static int 2880 _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) 2881 { 2882 /* Some variables are manipulated by the macros above */ 2883 SRE_CODE op; 2884 SRE_CODE arg; 2885 SRE_CODE skip; 2886 2887 VTRACE(("code=%p, end=%p\n", code, end)); 2888 2889 if (code > end) 2890 FAIL; 2891 2892 while (code < end) { 2893 GET_OP; 2894 switch (op) { 2895 2896 case SRE_OP_MARK: 2897 /* We don't check whether marks are properly nested; the 2898 sre_match() code is robust even if they don't, and the worst 2899 you can get is nonsensical match results. */ 2900 GET_ARG; 2901 if (arg > 2*groups+1) { 2902 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups)); 2903 FAIL; 2904 } 2905 break; 2906 2907 case SRE_OP_LITERAL: 2908 case SRE_OP_NOT_LITERAL: 2909 case SRE_OP_LITERAL_IGNORE: 2910 case SRE_OP_NOT_LITERAL_IGNORE: 2911 GET_ARG; 2912 /* The arg is just a character, nothing to check */ 2913 break; 2914 2915 case SRE_OP_SUCCESS: 2916 case SRE_OP_FAILURE: 2917 /* Nothing to check; these normally end the matching process */ 2918 break; 2919 2920 case SRE_OP_AT: 2921 GET_ARG; 2922 switch (arg) { 2923 case SRE_AT_BEGINNING: 2924 case SRE_AT_BEGINNING_STRING: 2925 case SRE_AT_BEGINNING_LINE: 2926 case SRE_AT_END: 2927 case SRE_AT_END_LINE: 2928 case SRE_AT_END_STRING: 2929 case SRE_AT_BOUNDARY: 2930 case SRE_AT_NON_BOUNDARY: 2931 case SRE_AT_LOC_BOUNDARY: 2932 case SRE_AT_LOC_NON_BOUNDARY: 2933 case SRE_AT_UNI_BOUNDARY: 2934 case SRE_AT_UNI_NON_BOUNDARY: 2935 break; 2936 default: 2937 FAIL; 2938 } 2939 break; 2940 2941 case SRE_OP_ANY: 2942 case SRE_OP_ANY_ALL: 2943 /* These have no operands */ 2944 break; 2945 2946 case SRE_OP_IN: 2947 case SRE_OP_IN_IGNORE: 2948 GET_SKIP; 2949 /* Stop 1 before the end; we check the FAILURE below */ 2950 if (!_validate_charset(code, code+skip-2)) 2951 FAIL; 2952 if (code[skip-2] != SRE_OP_FAILURE) 2953 FAIL; 2954 code += skip-1; 2955 break; 2956 2957 case SRE_OP_INFO: 2958 { 2959 /* A minimal info field is 2960 <INFO> <1=skip> <2=flags> <3=min> <4=max>; 2961 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags, 2962 more follows. */ 2963 SRE_CODE flags, i; 2964 SRE_CODE *newcode; 2965 GET_SKIP; 2966 newcode = code+skip-1; 2967 GET_ARG; flags = arg; 2968 GET_ARG; /* min */ 2969 GET_ARG; /* max */ 2970 /* Check that only valid flags are present */ 2971 if ((flags & ~(SRE_INFO_PREFIX | 2972 SRE_INFO_LITERAL | 2973 SRE_INFO_CHARSET)) != 0) 2974 FAIL; 2975 /* PREFIX and CHARSET are mutually exclusive */ 2976 if ((flags & SRE_INFO_PREFIX) && 2977 (flags & SRE_INFO_CHARSET)) 2978 FAIL; 2979 /* LITERAL implies PREFIX */ 2980 if ((flags & SRE_INFO_LITERAL) && 2981 !(flags & SRE_INFO_PREFIX)) 2982 FAIL; 2983 /* Validate the prefix */ 2984 if (flags & SRE_INFO_PREFIX) { 2985 SRE_CODE prefix_len; 2986 GET_ARG; prefix_len = arg; 2987 GET_ARG; /* prefix skip */ 2988 /* Here comes the prefix string */ 2989 if (code+prefix_len < code || code+prefix_len > newcode) 2990 FAIL; 2991 code += prefix_len; 2992 /* And here comes the overlap table */ 2993 if (code+prefix_len < code || code+prefix_len > newcode) 2994 FAIL; 2995 /* Each overlap value should be < prefix_len */ 2996 for (i = 0; i < prefix_len; i++) { 2997 if (code[i] >= prefix_len) 2998 FAIL; 2999 } 3000 code += prefix_len; 3001 } 3002 /* Validate the charset */ 3003 if (flags & SRE_INFO_CHARSET) { 3004 if (!_validate_charset(code, newcode-1)) 3005 FAIL; 3006 if (newcode[-1] != SRE_OP_FAILURE) 3007 FAIL; 3008 code = newcode; 3009 } 3010 else if (code != newcode) { 3011 VTRACE(("code=%p, newcode=%p\n", code, newcode)); 3012 FAIL; 3013 } 3014 } 3015 break; 3016 3017 case SRE_OP_BRANCH: 3018 { 3019 SRE_CODE *target = NULL; 3020 for (;;) { 3021 GET_SKIP; 3022 if (skip == 0) 3023 break; 3024 /* Stop 2 before the end; we check the JUMP below */ 3025 if (!_validate_inner(code, code+skip-3, groups)) 3026 FAIL; 3027 code += skip-3; 3028 /* Check that it ends with a JUMP, and that each JUMP 3029 has the same target */ 3030 GET_OP; 3031 if (op != SRE_OP_JUMP) 3032 FAIL; 3033 GET_SKIP; 3034 if (target == NULL) 3035 target = code+skip-1; 3036 else if (code+skip-1 != target) 3037 FAIL; 3038 } 3039 } 3040 break; 3041 3042 case SRE_OP_REPEAT_ONE: 3043 case SRE_OP_MIN_REPEAT_ONE: 3044 { 3045 SRE_CODE min, max; 3046 GET_SKIP; 3047 GET_ARG; min = arg; 3048 GET_ARG; max = arg; 3049 if (min > max) 3050 FAIL; 3051 #ifdef Py_UNICODE_WIDE 3052 if (max > 65535) 3053 FAIL; 3054 #endif 3055 if (!_validate_inner(code, code+skip-4, groups)) 3056 FAIL; 3057 code += skip-4; 3058 GET_OP; 3059 if (op != SRE_OP_SUCCESS) 3060 FAIL; 3061 } 3062 break; 3063 3064 case SRE_OP_REPEAT: 3065 { 3066 SRE_CODE min, max; 3067 GET_SKIP; 3068 GET_ARG; min = arg; 3069 GET_ARG; max = arg; 3070 if (min > max) 3071 FAIL; 3072 #ifdef Py_UNICODE_WIDE 3073 if (max > 65535) 3074 FAIL; 3075 #endif 3076 if (!_validate_inner(code, code+skip-3, groups)) 3077 FAIL; 3078 code += skip-3; 3079 GET_OP; 3080 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL) 3081 FAIL; 3082 } 3083 break; 3084 3085 case SRE_OP_GROUPREF: 3086 case SRE_OP_GROUPREF_IGNORE: 3087 GET_ARG; 3088 if (arg >= groups) 3089 FAIL; 3090 break; 3091 3092 case SRE_OP_GROUPREF_EXISTS: 3093 /* The regex syntax for this is: '(?(group)then|else)', where 3094 'group' is either an integer group number or a group name, 3095 'then' and 'else' are sub-regexes, and 'else' is optional. */ 3096 GET_ARG; 3097 if (arg >= groups) 3098 FAIL; 3099 GET_SKIP_ADJ(1); 3100 code--; /* The skip is relative to the first arg! */ 3101 /* There are two possibilities here: if there is both a 'then' 3102 part and an 'else' part, the generated code looks like: 3103 3104 GROUPREF_EXISTS 3105 <group> 3106 <skipyes> 3107 ...then part... 3108 JUMP 3109 <skipno> 3110 (<skipyes> jumps here) 3111 ...else part... 3112 (<skipno> jumps here) 3113 3114 If there is only a 'then' part, it looks like: 3115 3116 GROUPREF_EXISTS 3117 <group> 3118 <skip> 3119 ...then part... 3120 (<skip> jumps here) 3121 3122 There is no direct way to decide which it is, and we don't want 3123 to allow arbitrary jumps anywhere in the code; so we just look 3124 for a JUMP opcode preceding our skip target. 3125 */ 3126 if (skip >= 3 && code+skip-3 >= code && 3127 code[skip-3] == SRE_OP_JUMP) 3128 { 3129 VTRACE(("both then and else parts present\n")); 3130 if (!_validate_inner(code+1, code+skip-3, groups)) 3131 FAIL; 3132 code += skip-2; /* Position after JUMP, at <skipno> */ 3133 GET_SKIP; 3134 if (!_validate_inner(code, code+skip-1, groups)) 3135 FAIL; 3136 code += skip-1; 3137 } 3138 else { 3139 VTRACE(("only a then part present\n")); 3140 if (!_validate_inner(code+1, code+skip-1, groups)) 3141 FAIL; 3142 code += skip-1; 3143 } 3144 break; 3145 3146 case SRE_OP_ASSERT: 3147 case SRE_OP_ASSERT_NOT: 3148 GET_SKIP; 3149 GET_ARG; /* 0 for lookahead, width for lookbehind */ 3150 code--; /* Back up over arg to simplify math below */ 3151 if (arg & 0x80000000) 3152 FAIL; /* Width too large */ 3153 /* Stop 1 before the end; we check the SUCCESS below */ 3154 if (!_validate_inner(code+1, code+skip-2, groups)) 3155 FAIL; 3156 code += skip-2; 3157 GET_OP; 3158 if (op != SRE_OP_SUCCESS) 3159 FAIL; 3160 break; 3161 3162 default: 3163 FAIL; 3164 3165 } 3166 } 3167 3168 VTRACE(("okay\n")); 3169 return 1; 3170 } 3171 3172 static int 3173 _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) 3174 { 3175 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS) 3176 FAIL; 3177 if (groups == 0) /* fix for simplejson */ 3178 groups = 100; /* 100 groups should always be safe */ 3179 return _validate_inner(code, end-1, groups); 3180 } 3181 3182 static int 3183 _validate(PatternObject *self) 3184 { 3185 if (!_validate_outer(self->code, self->code+self->codesize, self->groups)) 3186 { 3187 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code"); 3188 return 0; 3189 } 3190 else 3191 VTRACE(("Success!\n")); 3192 return 1; 3193 } 3194 3195 /* -------------------------------------------------------------------- */ 3196 /* match methods */ 3197 3198 static void 3199 match_dealloc(MatchObject* self) 3200 { 3201 Py_XDECREF(self->regs); 3202 Py_XDECREF(self->string); 3203 Py_DECREF(self->pattern); 3204 PyObject_DEL(self); 3205 } 3206 3207 static PyObject* 3208 match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def) 3209 { 3210 if (index < 0 || index >= self->groups) { 3211 /* raise IndexError if we were given a bad group number */ 3212 PyErr_SetString( 3213 PyExc_IndexError, 3214 "no such group" 3215 ); 3216 return NULL; 3217 } 3218 3219 index *= 2; 3220 3221 if (self->string == Py_None || self->mark[index] < 0) { 3222 /* return default value if the string or group is undefined */ 3223 Py_INCREF(def); 3224 return def; 3225 } 3226 3227 return PySequence_GetSlice( 3228 self->string, self->mark[index], self->mark[index+1] 3229 ); 3230 } 3231 3232 static Py_ssize_t 3233 match_getindex(MatchObject* self, PyObject* index) 3234 { 3235 Py_ssize_t i; 3236 3237 if (PyInt_Check(index)) 3238 return PyInt_AsSsize_t(index); 3239 3240 i = -1; 3241 3242 if (self->pattern->groupindex) { 3243 index = PyObject_GetItem(self->pattern->groupindex, index); 3244 if (index) { 3245 if (PyInt_Check(index) || PyLong_Check(index)) 3246 i = PyInt_AsSsize_t(index); 3247 Py_DECREF(index); 3248 } else 3249 PyErr_Clear(); 3250 } 3251 3252 return i; 3253 } 3254 3255 static PyObject* 3256 match_getslice(MatchObject* self, PyObject* index, PyObject* def) 3257 { 3258 return match_getslice_by_index(self, match_getindex(self, index), def); 3259 } 3260 3261 static PyObject* 3262 match_expand(MatchObject* self, PyObject* ptemplate) 3263 { 3264 /* delegate to Python code */ 3265 return call( 3266 SRE_PY_MODULE, "_expand", 3267 PyTuple_Pack(3, self->pattern, self, ptemplate) 3268 ); 3269 } 3270 3271 static PyObject* 3272 match_group(MatchObject* self, PyObject* args) 3273 { 3274 PyObject* result; 3275 Py_ssize_t i, size; 3276 3277 size = PyTuple_GET_SIZE(args); 3278 3279 switch (size) { 3280 case 0: 3281 result = match_getslice(self, Py_False, Py_None); 3282 break; 3283 case 1: 3284 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None); 3285 break; 3286 default: 3287 /* fetch multiple items */ 3288 result = PyTuple_New(size); 3289 if (!result) 3290 return NULL; 3291 for (i = 0; i < size; i++) { 3292 PyObject* item = match_getslice( 3293 self, PyTuple_GET_ITEM(args, i), Py_None 3294 ); 3295 if (!item) { 3296 Py_DECREF(result); 3297 return NULL; 3298 } 3299 PyTuple_SET_ITEM(result, i, item); 3300 } 3301 break; 3302 } 3303 return result; 3304 } 3305 3306 static PyObject* 3307 match_groups(MatchObject* self, PyObject* args, PyObject* kw) 3308 { 3309 PyObject* result; 3310 Py_ssize_t index; 3311 3312 PyObject* def = Py_None; 3313 static char* kwlist[] = { "default", NULL }; 3314 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def)) 3315 return NULL; 3316 3317 result = PyTuple_New(self->groups-1); 3318 if (!result) 3319 return NULL; 3320 3321 for (index = 1; index < self->groups; index++) { 3322 PyObject* item; 3323 item = match_getslice_by_index(self, index, def); 3324 if (!item) { 3325 Py_DECREF(result); 3326 return NULL; 3327 } 3328 PyTuple_SET_ITEM(result, index-1, item); 3329 } 3330 3331 return result; 3332 } 3333 3334 static PyObject* 3335 match_groupdict(MatchObject* self, PyObject* args, PyObject* kw) 3336 { 3337 PyObject* result; 3338 PyObject* keys; 3339 Py_ssize_t index; 3340 3341 PyObject* def = Py_None; 3342 static char* kwlist[] = { "default", NULL }; 3343 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def)) 3344 return NULL; 3345 3346 result = PyDict_New(); 3347 if (!result || !self->pattern->groupindex) 3348 return result; 3349 3350 keys = PyMapping_Keys(self->pattern->groupindex); 3351 if (!keys) 3352 goto failed; 3353 3354 for (index = 0; index < PyList_GET_SIZE(keys); index++) { 3355 int status; 3356 PyObject* key; 3357 PyObject* value; 3358 key = PyList_GET_ITEM(keys, index); 3359 if (!key) 3360 goto failed; 3361 value = match_getslice(self, key, def); 3362 if (!value) { 3363 Py_DECREF(key); 3364 goto failed; 3365 } 3366 status = PyDict_SetItem(result, key, value); 3367 Py_DECREF(value); 3368 if (status < 0) 3369 goto failed; 3370 } 3371 3372 Py_DECREF(keys); 3373 3374 return result; 3375 3376 failed: 3377 Py_XDECREF(keys); 3378 Py_DECREF(result); 3379 return NULL; 3380 } 3381 3382 static PyObject* 3383 match_start(MatchObject* self, PyObject* args) 3384 { 3385 Py_ssize_t index; 3386 3387 PyObject* index_ = Py_False; /* zero */ 3388 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_)) 3389 return NULL; 3390 3391 index = match_getindex(self, index_); 3392 3393 if (index < 0 || index >= self->groups) { 3394 PyErr_SetString( 3395 PyExc_IndexError, 3396 "no such group" 3397 ); 3398 return NULL; 3399 } 3400 3401 /* mark is -1 if group is undefined */ 3402 return Py_BuildValue("i", self->mark[index*2]); 3403 } 3404 3405 static PyObject* 3406 match_end(MatchObject* self, PyObject* args) 3407 { 3408 Py_ssize_t index; 3409 3410 PyObject* index_ = Py_False; /* zero */ 3411 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_)) 3412 return NULL; 3413 3414 index = match_getindex(self, index_); 3415 3416 if (index < 0 || index >= self->groups) { 3417 PyErr_SetString( 3418 PyExc_IndexError, 3419 "no such group" 3420 ); 3421 return NULL; 3422 } 3423 3424 /* mark is -1 if group is undefined */ 3425 return Py_BuildValue("i", self->mark[index*2+1]); 3426 } 3427 3428 LOCAL(PyObject*) 3429 _pair(Py_ssize_t i1, Py_ssize_t i2) 3430 { 3431 PyObject* pair; 3432 PyObject* item; 3433 3434 pair = PyTuple_New(2); 3435 if (!pair) 3436 return NULL; 3437 3438 item = PyInt_FromSsize_t(i1); 3439 if (!item) 3440 goto error; 3441 PyTuple_SET_ITEM(pair, 0, item); 3442 3443 item = PyInt_FromSsize_t(i2); 3444 if (!item) 3445 goto error; 3446 PyTuple_SET_ITEM(pair, 1, item); 3447 3448 return pair; 3449 3450 error: 3451 Py_DECREF(pair); 3452 return NULL; 3453 } 3454 3455 static PyObject* 3456 match_span(MatchObject* self, PyObject* args) 3457 { 3458 Py_ssize_t index; 3459 3460 PyObject* index_ = Py_False; /* zero */ 3461 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_)) 3462 return NULL; 3463 3464 index = match_getindex(self, index_); 3465 3466 if (index < 0 || index >= self->groups) { 3467 PyErr_SetString( 3468 PyExc_IndexError, 3469 "no such group" 3470 ); 3471 return NULL; 3472 } 3473 3474 /* marks are -1 if group is undefined */ 3475 return _pair(self->mark[index*2], self->mark[index*2+1]); 3476 } 3477 3478 static PyObject* 3479 match_regs(MatchObject* self) 3480 { 3481 PyObject* regs; 3482 PyObject* item; 3483 Py_ssize_t index; 3484 3485 regs = PyTuple_New(self->groups); 3486 if (!regs) 3487 return NULL; 3488 3489 for (index = 0; index < self->groups; index++) { 3490 item = _pair(self->mark[index*2], self->mark[index*2+1]); 3491 if (!item) { 3492 Py_DECREF(regs); 3493 return NULL; 3494 } 3495 PyTuple_SET_ITEM(regs, index, item); 3496 } 3497 3498 Py_INCREF(regs); 3499 self->regs = regs; 3500 3501 return regs; 3502 } 3503 3504 static PyObject* 3505 match_copy(MatchObject* self, PyObject *unused) 3506 { 3507 #ifdef USE_BUILTIN_COPY 3508 MatchObject* copy; 3509 Py_ssize_t slots, offset; 3510 3511 slots = 2 * (self->pattern->groups+1); 3512 3513 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots); 3514 if (!copy) 3515 return NULL; 3516 3517 /* this value a constant, but any compiler should be able to 3518 figure that out all by itself */ 3519 offset = offsetof(MatchObject, string); 3520 3521 Py_XINCREF(self->pattern); 3522 Py_XINCREF(self->string); 3523 Py_XINCREF(self->regs); 3524 3525 memcpy((char*) copy + offset, (char*) self + offset, 3526 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset); 3527 3528 return (PyObject*) copy; 3529 #else 3530 PyErr_SetString(PyExc_TypeError, "cannot copy this match object"); 3531 return NULL; 3532 #endif 3533 } 3534 3535 static PyObject* 3536 match_deepcopy(MatchObject* self, PyObject* memo) 3537 { 3538 #ifdef USE_BUILTIN_COPY 3539 MatchObject* copy; 3540 3541 copy = (MatchObject*) match_copy(self); 3542 if (!copy) 3543 return NULL; 3544 3545 if (!deepcopy((PyObject**) ©->pattern, memo) || 3546 !deepcopy(©->string, memo) || 3547 !deepcopy(©->regs, memo)) { 3548 Py_DECREF(copy); 3549 return NULL; 3550 } 3551 3552 #else 3553 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object"); 3554 return NULL; 3555 #endif 3556 } 3557 3558 static struct PyMethodDef match_methods[] = { 3559 {"group", (PyCFunction) match_group, METH_VARARGS}, 3560 {"start", (PyCFunction) match_start, METH_VARARGS}, 3561 {"end", (PyCFunction) match_end, METH_VARARGS}, 3562 {"span", (PyCFunction) match_span, METH_VARARGS}, 3563 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS}, 3564 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS}, 3565 {"expand", (PyCFunction) match_expand, METH_O}, 3566 {"__copy__", (PyCFunction) match_copy, METH_NOARGS}, 3567 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O}, 3568 {NULL, NULL} 3569 }; 3570 3571 static PyObject * 3572 match_lastindex_get(MatchObject *self) 3573 { 3574 if (self->lastindex >= 0) 3575 return Py_BuildValue("i", self->lastindex); 3576 Py_INCREF(Py_None); 3577 return Py_None; 3578 } 3579 3580 static PyObject * 3581 match_lastgroup_get(MatchObject *self) 3582 { 3583 if (self->pattern->indexgroup && self->lastindex >= 0) { 3584 PyObject* result = PySequence_GetItem( 3585 self->pattern->indexgroup, self->lastindex 3586 ); 3587 if (result) 3588 return result; 3589 PyErr_Clear(); 3590 } 3591 Py_INCREF(Py_None); 3592 return Py_None; 3593 } 3594 3595 static PyObject * 3596 match_regs_get(MatchObject *self) 3597 { 3598 if (self->regs) { 3599 Py_INCREF(self->regs); 3600 return self->regs; 3601 } else 3602 return match_regs(self); 3603 } 3604 3605 static PyGetSetDef match_getset[] = { 3606 {"lastindex", (getter)match_lastindex_get, (setter)NULL}, 3607 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL}, 3608 {"regs", (getter)match_regs_get, (setter)NULL}, 3609 {NULL} 3610 }; 3611 3612 #define MATCH_OFF(x) offsetof(MatchObject, x) 3613 static PyMemberDef match_members[] = { 3614 {"string", T_OBJECT, MATCH_OFF(string), READONLY}, 3615 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY}, 3616 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY}, 3617 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY}, 3618 {NULL} 3619 }; 3620 3621 3622 /* FIXME: implement setattr("string", None) as a special case (to 3623 detach the associated string, if any */ 3624 3625 static PyTypeObject Match_Type = { 3626 PyVarObject_HEAD_INIT(NULL, 0) 3627 "_" SRE_MODULE ".SRE_Match", 3628 sizeof(MatchObject), sizeof(Py_ssize_t), 3629 (destructor)match_dealloc, /* tp_dealloc */ 3630 0, /* tp_print */ 3631 0, /* tp_getattr */ 3632 0, /* tp_setattr */ 3633 0, /* tp_compare */ 3634 0, /* tp_repr */ 3635 0, /* tp_as_number */ 3636 0, /* tp_as_sequence */ 3637 0, /* tp_as_mapping */ 3638 0, /* tp_hash */ 3639 0, /* tp_call */ 3640 0, /* tp_str */ 3641 0, /* tp_getattro */ 3642 0, /* tp_setattro */ 3643 0, /* tp_as_buffer */ 3644 Py_TPFLAGS_DEFAULT, 3645 0, /* tp_doc */ 3646 0, /* tp_traverse */ 3647 0, /* tp_clear */ 3648 0, /* tp_richcompare */ 3649 0, /* tp_weaklistoffset */ 3650 0, /* tp_iter */ 3651 0, /* tp_iternext */ 3652 match_methods, /* tp_methods */ 3653 match_members, /* tp_members */ 3654 match_getset, /* tp_getset */ 3655 }; 3656 3657 static PyObject* 3658 pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status) 3659 { 3660 /* create match object (from state object) */ 3661 3662 MatchObject* match; 3663 Py_ssize_t i, j; 3664 char* base; 3665 int n; 3666 3667 if (status > 0) { 3668 3669 /* create match object (with room for extra group marks) */ 3670 /* coverity[ampersand_in_size] */ 3671 match = PyObject_NEW_VAR(MatchObject, &Match_Type, 3672 2*(pattern->groups+1)); 3673 if (!match) 3674 return NULL; 3675 3676 Py_INCREF(pattern); 3677 match->pattern = pattern; 3678 3679 Py_INCREF(state->string); 3680 match->string = state->string; 3681 3682 match->regs = NULL; 3683 match->groups = pattern->groups+1; 3684 3685 /* fill in group slices */ 3686 3687 base = (char*) state->beginning; 3688 n = state->charsize; 3689 3690 match->mark[0] = ((char*) state->start - base) / n; 3691 match->mark[1] = ((char*) state->ptr - base) / n; 3692 3693 for (i = j = 0; i < pattern->groups; i++, j+=2) 3694 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) { 3695 match->mark[j+2] = ((char*) state->mark[j] - base) / n; 3696 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n; 3697 } else 3698 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */ 3699 3700 match->pos = state->pos; 3701 match->endpos = state->endpos; 3702 3703 match->lastindex = state->lastindex; 3704 3705 return (PyObject*) match; 3706 3707 } else if (status == 0) { 3708 3709 /* no match */ 3710 Py_INCREF(Py_None); 3711 return Py_None; 3712 3713 } 3714 3715 /* internal error */ 3716 pattern_error(status); 3717 return NULL; 3718 } 3719 3720 3721 /* -------------------------------------------------------------------- */ 3722 /* scanner methods (experimental) */ 3723 3724 static void 3725 scanner_dealloc(ScannerObject* self) 3726 { 3727 state_fini(&self->state); 3728 Py_XDECREF(self->pattern); 3729 PyObject_DEL(self); 3730 } 3731 3732 static PyObject* 3733 scanner_match(ScannerObject* self, PyObject *unused) 3734 { 3735 SRE_STATE* state = &self->state; 3736 PyObject* match; 3737 int status; 3738 3739 state_reset(state); 3740 3741 state->ptr = state->start; 3742 3743 if (state->charsize == 1) { 3744 status = sre_match(state, PatternObject_GetCode(self->pattern)); 3745 } else { 3746 #if defined(HAVE_UNICODE) 3747 status = sre_umatch(state, PatternObject_GetCode(self->pattern)); 3748 #endif 3749 } 3750 if (PyErr_Occurred()) 3751 return NULL; 3752 3753 match = pattern_new_match((PatternObject*) self->pattern, 3754 state, status); 3755 3756 if (status == 0 || state->ptr == state->start) 3757 state->start = (void*) ((char*) state->ptr + state->charsize); 3758 else 3759 state->start = state->ptr; 3760 3761 return match; 3762 } 3763 3764 3765 static PyObject* 3766 scanner_search(ScannerObject* self, PyObject *unused) 3767 { 3768 SRE_STATE* state = &self->state; 3769 PyObject* match; 3770 int status; 3771 3772 state_reset(state); 3773 3774 state->ptr = state->start; 3775 3776 if (state->charsize == 1) { 3777 status = sre_search(state, PatternObject_GetCode(self->pattern)); 3778 } else { 3779 #if defined(HAVE_UNICODE) 3780 status = sre_usearch(state, PatternObject_GetCode(self->pattern)); 3781 #endif 3782 } 3783 if (PyErr_Occurred()) 3784 return NULL; 3785 3786 match = pattern_new_match((PatternObject*) self->pattern, 3787 state, status); 3788 3789 if (status == 0 || state->ptr == state->start) 3790 state->start = (void*) ((char*) state->ptr + state->charsize); 3791 else 3792 state->start = state->ptr; 3793 3794 return match; 3795 } 3796 3797 static PyMethodDef scanner_methods[] = { 3798 {"match", (PyCFunction) scanner_match, METH_NOARGS}, 3799 {"search", (PyCFunction) scanner_search, METH_NOARGS}, 3800 {NULL, NULL} 3801 }; 3802 3803 #define SCAN_OFF(x) offsetof(ScannerObject, x) 3804 static PyMemberDef scanner_members[] = { 3805 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY}, 3806 {NULL} /* Sentinel */ 3807 }; 3808 3809 statichere PyTypeObject Scanner_Type = { 3810 PyObject_HEAD_INIT(NULL) 3811 0, "_" SRE_MODULE ".SRE_Scanner", 3812 sizeof(ScannerObject), 0, 3813 (destructor)scanner_dealloc, /*tp_dealloc*/ 3814 0, /* tp_print */ 3815 0, /* tp_getattr */ 3816 0, /* tp_setattr */ 3817 0, /* tp_reserved */ 3818 0, /* tp_repr */ 3819 0, /* tp_as_number */ 3820 0, /* tp_as_sequence */ 3821 0, /* tp_as_mapping */ 3822 0, /* tp_hash */ 3823 0, /* tp_call */ 3824 0, /* tp_str */ 3825 0, /* tp_getattro */ 3826 0, /* tp_setattro */ 3827 0, /* tp_as_buffer */ 3828 Py_TPFLAGS_DEFAULT, /* tp_flags */ 3829 0, /* tp_doc */ 3830 0, /* tp_traverse */ 3831 0, /* tp_clear */ 3832 0, /* tp_richcompare */ 3833 0, /* tp_weaklistoffset */ 3834 0, /* tp_iter */ 3835 0, /* tp_iternext */ 3836 scanner_methods, /* tp_methods */ 3837 scanner_members, /* tp_members */ 3838 0, /* tp_getset */ 3839 }; 3840 3841 static PyObject* 3842 pattern_scanner(PatternObject* pattern, PyObject* args) 3843 { 3844 /* create search state object */ 3845 3846 ScannerObject* self; 3847 3848 PyObject* string; 3849 Py_ssize_t start = 0; 3850 Py_ssize_t end = PY_SSIZE_T_MAX; 3851 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end)) 3852 return NULL; 3853 3854 /* create scanner object */ 3855 self = PyObject_NEW(ScannerObject, &Scanner_Type); 3856 if (!self) 3857 return NULL; 3858 self->pattern = NULL; 3859 3860 string = state_init(&self->state, pattern, string, start, end); 3861 if (!string) { 3862 Py_DECREF(self); 3863 return NULL; 3864 } 3865 3866 Py_INCREF(pattern); 3867 self->pattern = (PyObject*) pattern; 3868 3869 return (PyObject*) self; 3870 } 3871 3872 static PyMethodDef _functions[] = { 3873 {"compile", _compile, METH_VARARGS}, 3874 {"getcodesize", sre_codesize, METH_NOARGS}, 3875 {"getlower", sre_getlower, METH_VARARGS}, 3876 {NULL, NULL} 3877 }; 3878 3879 #if PY_VERSION_HEX < 0x02030000 3880 DL_EXPORT(void) init_sre(void) 3881 #else 3882 PyMODINIT_FUNC init_sre(void) 3883 #endif 3884 { 3885 PyObject* m; 3886 PyObject* d; 3887 PyObject* x; 3888 3889 /* Patch object types */ 3890 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) || 3891 PyType_Ready(&Scanner_Type)) 3892 return; 3893 3894 m = Py_InitModule("_" SRE_MODULE, _functions); 3895 if (m == NULL) 3896 return; 3897 d = PyModule_GetDict(m); 3898 3899 x = PyInt_FromLong(SRE_MAGIC); 3900 if (x) { 3901 PyDict_SetItemString(d, "MAGIC", x); 3902 Py_DECREF(x); 3903 } 3904 3905 x = PyInt_FromLong(sizeof(SRE_CODE)); 3906 if (x) { 3907 PyDict_SetItemString(d, "CODESIZE", x); 3908 Py_DECREF(x); 3909 } 3910 3911 x = PyString_FromString(copyright); 3912 if (x) { 3913 PyDict_SetItemString(d, "copyright", x); 3914 Py_DECREF(x); 3915 } 3916 } 3917 3918 #endif /* !defined(SRE_RECURSIVE) */ 3919 3920 /* vim:ts=4:sw=4:et 3921 */ 3922