1 /* 2 * regexp.c: generic and extensible Regular Expression engine 3 * 4 * Basically designed with the purpose of compiling regexps for 5 * the variety of validation/shemas mechanisms now available in 6 * XML related specifications these include: 7 * - XML-1.0 DTD validation 8 * - XML Schemas structure part 1 9 * - XML Schemas Datatypes part 2 especially Appendix F 10 * - RELAX-NG/TREX i.e. the counter proposal 11 * 12 * See Copyright for the status of this software. 13 * 14 * Daniel Veillard <veillard (at) redhat.com> 15 */ 16 17 #define IN_LIBXML 18 #include "libxml.h" 19 20 #ifdef LIBXML_REGEXP_ENABLED 21 22 /* #define DEBUG_ERR */ 23 24 #include <stdio.h> 25 #include <string.h> 26 #ifdef HAVE_LIMITS_H 27 #include <limits.h> 28 #endif 29 30 #include <libxml/tree.h> 31 #include <libxml/parserInternals.h> 32 #include <libxml/xmlregexp.h> 33 #include <libxml/xmlautomata.h> 34 #include <libxml/xmlunicode.h> 35 36 #ifndef INT_MAX 37 #define INT_MAX 123456789 /* easy to flag and big enough for our needs */ 38 #endif 39 40 /* #define DEBUG_REGEXP_GRAPH */ 41 /* #define DEBUG_REGEXP_EXEC */ 42 /* #define DEBUG_PUSH */ 43 /* #define DEBUG_COMPACTION */ 44 45 #define MAX_PUSH 10000000 46 47 #ifdef ERROR 48 #undef ERROR 49 #endif 50 #define ERROR(str) \ 51 ctxt->error = XML_REGEXP_COMPILE_ERROR; \ 52 xmlRegexpErrCompile(ctxt, str); 53 #define NEXT ctxt->cur++ 54 #define CUR (*(ctxt->cur)) 55 #define NXT(index) (ctxt->cur[index]) 56 57 #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l) 58 #define NEXTL(l) ctxt->cur += l; 59 #define XML_REG_STRING_SEPARATOR '|' 60 /* 61 * Need PREV to check on a '-' within a Character Group. May only be used 62 * when it's guaranteed that cur is not at the beginning of ctxt->string! 63 */ 64 #define PREV (ctxt->cur[-1]) 65 66 /** 67 * TODO: 68 * 69 * macro to flag unimplemented blocks 70 */ 71 #define TODO \ 72 xmlGenericError(xmlGenericErrorContext, \ 73 "Unimplemented block at %s:%d\n", \ 74 __FILE__, __LINE__); 75 76 /************************************************************************ 77 * * 78 * Datatypes and structures * 79 * * 80 ************************************************************************/ 81 82 /* 83 * Note: the order of the enums below is significant, do not shuffle 84 */ 85 typedef enum { 86 XML_REGEXP_EPSILON = 1, 87 XML_REGEXP_CHARVAL, 88 XML_REGEXP_RANGES, 89 XML_REGEXP_SUBREG, /* used for () sub regexps */ 90 XML_REGEXP_STRING, 91 XML_REGEXP_ANYCHAR, /* . */ 92 XML_REGEXP_ANYSPACE, /* \s */ 93 XML_REGEXP_NOTSPACE, /* \S */ 94 XML_REGEXP_INITNAME, /* \l */ 95 XML_REGEXP_NOTINITNAME, /* \L */ 96 XML_REGEXP_NAMECHAR, /* \c */ 97 XML_REGEXP_NOTNAMECHAR, /* \C */ 98 XML_REGEXP_DECIMAL, /* \d */ 99 XML_REGEXP_NOTDECIMAL, /* \D */ 100 XML_REGEXP_REALCHAR, /* \w */ 101 XML_REGEXP_NOTREALCHAR, /* \W */ 102 XML_REGEXP_LETTER = 100, 103 XML_REGEXP_LETTER_UPPERCASE, 104 XML_REGEXP_LETTER_LOWERCASE, 105 XML_REGEXP_LETTER_TITLECASE, 106 XML_REGEXP_LETTER_MODIFIER, 107 XML_REGEXP_LETTER_OTHERS, 108 XML_REGEXP_MARK, 109 XML_REGEXP_MARK_NONSPACING, 110 XML_REGEXP_MARK_SPACECOMBINING, 111 XML_REGEXP_MARK_ENCLOSING, 112 XML_REGEXP_NUMBER, 113 XML_REGEXP_NUMBER_DECIMAL, 114 XML_REGEXP_NUMBER_LETTER, 115 XML_REGEXP_NUMBER_OTHERS, 116 XML_REGEXP_PUNCT, 117 XML_REGEXP_PUNCT_CONNECTOR, 118 XML_REGEXP_PUNCT_DASH, 119 XML_REGEXP_PUNCT_OPEN, 120 XML_REGEXP_PUNCT_CLOSE, 121 XML_REGEXP_PUNCT_INITQUOTE, 122 XML_REGEXP_PUNCT_FINQUOTE, 123 XML_REGEXP_PUNCT_OTHERS, 124 XML_REGEXP_SEPAR, 125 XML_REGEXP_SEPAR_SPACE, 126 XML_REGEXP_SEPAR_LINE, 127 XML_REGEXP_SEPAR_PARA, 128 XML_REGEXP_SYMBOL, 129 XML_REGEXP_SYMBOL_MATH, 130 XML_REGEXP_SYMBOL_CURRENCY, 131 XML_REGEXP_SYMBOL_MODIFIER, 132 XML_REGEXP_SYMBOL_OTHERS, 133 XML_REGEXP_OTHER, 134 XML_REGEXP_OTHER_CONTROL, 135 XML_REGEXP_OTHER_FORMAT, 136 XML_REGEXP_OTHER_PRIVATE, 137 XML_REGEXP_OTHER_NA, 138 XML_REGEXP_BLOCK_NAME 139 } xmlRegAtomType; 140 141 typedef enum { 142 XML_REGEXP_QUANT_EPSILON = 1, 143 XML_REGEXP_QUANT_ONCE, 144 XML_REGEXP_QUANT_OPT, 145 XML_REGEXP_QUANT_MULT, 146 XML_REGEXP_QUANT_PLUS, 147 XML_REGEXP_QUANT_ONCEONLY, 148 XML_REGEXP_QUANT_ALL, 149 XML_REGEXP_QUANT_RANGE 150 } xmlRegQuantType; 151 152 typedef enum { 153 XML_REGEXP_START_STATE = 1, 154 XML_REGEXP_FINAL_STATE, 155 XML_REGEXP_TRANS_STATE, 156 XML_REGEXP_SINK_STATE, 157 XML_REGEXP_UNREACH_STATE 158 } xmlRegStateType; 159 160 typedef enum { 161 XML_REGEXP_MARK_NORMAL = 0, 162 XML_REGEXP_MARK_START, 163 XML_REGEXP_MARK_VISITED 164 } xmlRegMarkedType; 165 166 typedef struct _xmlRegRange xmlRegRange; 167 typedef xmlRegRange *xmlRegRangePtr; 168 169 struct _xmlRegRange { 170 int neg; /* 0 normal, 1 not, 2 exclude */ 171 xmlRegAtomType type; 172 int start; 173 int end; 174 xmlChar *blockName; 175 }; 176 177 typedef struct _xmlRegAtom xmlRegAtom; 178 typedef xmlRegAtom *xmlRegAtomPtr; 179 180 typedef struct _xmlAutomataState xmlRegState; 181 typedef xmlRegState *xmlRegStatePtr; 182 183 struct _xmlRegAtom { 184 int no; 185 xmlRegAtomType type; 186 xmlRegQuantType quant; 187 int min; 188 int max; 189 190 void *valuep; 191 void *valuep2; 192 int neg; 193 int codepoint; 194 xmlRegStatePtr start; 195 xmlRegStatePtr start0; 196 xmlRegStatePtr stop; 197 int maxRanges; 198 int nbRanges; 199 xmlRegRangePtr *ranges; 200 void *data; 201 }; 202 203 typedef struct _xmlRegCounter xmlRegCounter; 204 typedef xmlRegCounter *xmlRegCounterPtr; 205 206 struct _xmlRegCounter { 207 int min; 208 int max; 209 }; 210 211 typedef struct _xmlRegTrans xmlRegTrans; 212 typedef xmlRegTrans *xmlRegTransPtr; 213 214 struct _xmlRegTrans { 215 xmlRegAtomPtr atom; 216 int to; 217 int counter; 218 int count; 219 int nd; 220 }; 221 222 struct _xmlAutomataState { 223 xmlRegStateType type; 224 xmlRegMarkedType mark; 225 xmlRegMarkedType markd; 226 xmlRegMarkedType reached; 227 int no; 228 int maxTrans; 229 int nbTrans; 230 xmlRegTrans *trans; 231 /* knowing states ponting to us can speed things up */ 232 int maxTransTo; 233 int nbTransTo; 234 int *transTo; 235 }; 236 237 typedef struct _xmlAutomata xmlRegParserCtxt; 238 typedef xmlRegParserCtxt *xmlRegParserCtxtPtr; 239 240 #define AM_AUTOMATA_RNG 1 241 242 struct _xmlAutomata { 243 xmlChar *string; 244 xmlChar *cur; 245 246 int error; 247 int neg; 248 249 xmlRegStatePtr start; 250 xmlRegStatePtr end; 251 xmlRegStatePtr state; 252 253 xmlRegAtomPtr atom; 254 255 int maxAtoms; 256 int nbAtoms; 257 xmlRegAtomPtr *atoms; 258 259 int maxStates; 260 int nbStates; 261 xmlRegStatePtr *states; 262 263 int maxCounters; 264 int nbCounters; 265 xmlRegCounter *counters; 266 267 int determinist; 268 int negs; 269 int flags; 270 }; 271 272 struct _xmlRegexp { 273 xmlChar *string; 274 int nbStates; 275 xmlRegStatePtr *states; 276 int nbAtoms; 277 xmlRegAtomPtr *atoms; 278 int nbCounters; 279 xmlRegCounter *counters; 280 int determinist; 281 int flags; 282 /* 283 * That's the compact form for determinists automatas 284 */ 285 int nbstates; 286 int *compact; 287 void **transdata; 288 int nbstrings; 289 xmlChar **stringMap; 290 }; 291 292 typedef struct _xmlRegExecRollback xmlRegExecRollback; 293 typedef xmlRegExecRollback *xmlRegExecRollbackPtr; 294 295 struct _xmlRegExecRollback { 296 xmlRegStatePtr state;/* the current state */ 297 int index; /* the index in the input stack */ 298 int nextbranch; /* the next transition to explore in that state */ 299 int *counts; /* save the automata state if it has some */ 300 }; 301 302 typedef struct _xmlRegInputToken xmlRegInputToken; 303 typedef xmlRegInputToken *xmlRegInputTokenPtr; 304 305 struct _xmlRegInputToken { 306 xmlChar *value; 307 void *data; 308 }; 309 310 struct _xmlRegExecCtxt { 311 int status; /* execution status != 0 indicate an error */ 312 int determinist; /* did we find an indeterministic behaviour */ 313 xmlRegexpPtr comp; /* the compiled regexp */ 314 xmlRegExecCallbacks callback; 315 void *data; 316 317 xmlRegStatePtr state;/* the current state */ 318 int transno; /* the current transition on that state */ 319 int transcount; /* the number of chars in char counted transitions */ 320 321 /* 322 * A stack of rollback states 323 */ 324 int maxRollbacks; 325 int nbRollbacks; 326 xmlRegExecRollback *rollbacks; 327 328 /* 329 * The state of the automata if any 330 */ 331 int *counts; 332 333 /* 334 * The input stack 335 */ 336 int inputStackMax; 337 int inputStackNr; 338 int index; 339 int *charStack; 340 const xmlChar *inputString; /* when operating on characters */ 341 xmlRegInputTokenPtr inputStack;/* when operating on strings */ 342 343 /* 344 * error handling 345 */ 346 int errStateNo; /* the error state number */ 347 xmlRegStatePtr errState; /* the error state */ 348 xmlChar *errString; /* the string raising the error */ 349 int *errCounts; /* counters at the error state */ 350 int nbPush; 351 }; 352 353 #define REGEXP_ALL_COUNTER 0x123456 354 #define REGEXP_ALL_LAX_COUNTER 0x123457 355 356 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); 357 static void xmlRegFreeState(xmlRegStatePtr state); 358 static void xmlRegFreeAtom(xmlRegAtomPtr atom); 359 static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr); 360 static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint); 361 static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, 362 int neg, int start, int end, const xmlChar *blockName); 363 364 void xmlAutomataSetFlags(xmlAutomataPtr am, int flags); 365 366 /************************************************************************ 367 * * 368 * Regexp memory error handler * 369 * * 370 ************************************************************************/ 371 /** 372 * xmlRegexpErrMemory: 373 * @extra: extra information 374 * 375 * Handle an out of memory condition 376 */ 377 static void 378 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra) 379 { 380 const char *regexp = NULL; 381 if (ctxt != NULL) { 382 regexp = (const char *) ctxt->string; 383 ctxt->error = XML_ERR_NO_MEMORY; 384 } 385 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 386 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 387 regexp, NULL, 0, 0, 388 "Memory allocation failed : %s\n", extra); 389 } 390 391 /** 392 * xmlRegexpErrCompile: 393 * @extra: extra information 394 * 395 * Handle a compilation failure 396 */ 397 static void 398 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra) 399 { 400 const char *regexp = NULL; 401 int idx = 0; 402 403 if (ctxt != NULL) { 404 regexp = (const char *) ctxt->string; 405 idx = ctxt->cur - ctxt->string; 406 ctxt->error = XML_REGEXP_COMPILE_ERROR; 407 } 408 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 409 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra, 410 regexp, NULL, idx, 0, 411 "failed to compile: %s\n", extra); 412 } 413 414 /************************************************************************ 415 * * 416 * Allocation/Deallocation * 417 * * 418 ************************************************************************/ 419 420 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt); 421 /** 422 * xmlRegEpxFromParse: 423 * @ctxt: the parser context used to build it 424 * 425 * Allocate a new regexp and fill it with the result from the parser 426 * 427 * Returns the new regexp or NULL in case of error 428 */ 429 static xmlRegexpPtr 430 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { 431 xmlRegexpPtr ret; 432 433 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp)); 434 if (ret == NULL) { 435 xmlRegexpErrMemory(ctxt, "compiling regexp"); 436 return(NULL); 437 } 438 memset(ret, 0, sizeof(xmlRegexp)); 439 ret->string = ctxt->string; 440 ret->nbStates = ctxt->nbStates; 441 ret->states = ctxt->states; 442 ret->nbAtoms = ctxt->nbAtoms; 443 ret->atoms = ctxt->atoms; 444 ret->nbCounters = ctxt->nbCounters; 445 ret->counters = ctxt->counters; 446 ret->determinist = ctxt->determinist; 447 ret->flags = ctxt->flags; 448 if (ret->determinist == -1) { 449 xmlRegexpIsDeterminist(ret); 450 } 451 452 if ((ret->determinist != 0) && 453 (ret->nbCounters == 0) && 454 (ctxt->negs == 0) && 455 (ret->atoms != NULL) && 456 (ret->atoms[0] != NULL) && 457 (ret->atoms[0]->type == XML_REGEXP_STRING)) { 458 int i, j, nbstates = 0, nbatoms = 0; 459 int *stateRemap; 460 int *stringRemap; 461 int *transitions; 462 void **transdata; 463 xmlChar **stringMap; 464 xmlChar *value; 465 466 /* 467 * Switch to a compact representation 468 * 1/ counting the effective number of states left 469 * 2/ counting the unique number of atoms, and check that 470 * they are all of the string type 471 * 3/ build a table state x atom for the transitions 472 */ 473 474 stateRemap = xmlMalloc(ret->nbStates * sizeof(int)); 475 if (stateRemap == NULL) { 476 xmlRegexpErrMemory(ctxt, "compiling regexp"); 477 xmlFree(ret); 478 return(NULL); 479 } 480 for (i = 0;i < ret->nbStates;i++) { 481 if (ret->states[i] != NULL) { 482 stateRemap[i] = nbstates; 483 nbstates++; 484 } else { 485 stateRemap[i] = -1; 486 } 487 } 488 #ifdef DEBUG_COMPACTION 489 printf("Final: %d states\n", nbstates); 490 #endif 491 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *)); 492 if (stringMap == NULL) { 493 xmlRegexpErrMemory(ctxt, "compiling regexp"); 494 xmlFree(stateRemap); 495 xmlFree(ret); 496 return(NULL); 497 } 498 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int)); 499 if (stringRemap == NULL) { 500 xmlRegexpErrMemory(ctxt, "compiling regexp"); 501 xmlFree(stringMap); 502 xmlFree(stateRemap); 503 xmlFree(ret); 504 return(NULL); 505 } 506 for (i = 0;i < ret->nbAtoms;i++) { 507 if ((ret->atoms[i]->type == XML_REGEXP_STRING) && 508 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) { 509 value = ret->atoms[i]->valuep; 510 for (j = 0;j < nbatoms;j++) { 511 if (xmlStrEqual(stringMap[j], value)) { 512 stringRemap[i] = j; 513 break; 514 } 515 } 516 if (j >= nbatoms) { 517 stringRemap[i] = nbatoms; 518 stringMap[nbatoms] = xmlStrdup(value); 519 if (stringMap[nbatoms] == NULL) { 520 for (i = 0;i < nbatoms;i++) 521 xmlFree(stringMap[i]); 522 xmlFree(stringRemap); 523 xmlFree(stringMap); 524 xmlFree(stateRemap); 525 xmlFree(ret); 526 return(NULL); 527 } 528 nbatoms++; 529 } 530 } else { 531 xmlFree(stateRemap); 532 xmlFree(stringRemap); 533 for (i = 0;i < nbatoms;i++) 534 xmlFree(stringMap[i]); 535 xmlFree(stringMap); 536 xmlFree(ret); 537 return(NULL); 538 } 539 } 540 #ifdef DEBUG_COMPACTION 541 printf("Final: %d atoms\n", nbatoms); 542 #endif 543 transitions = (int *) xmlMalloc((nbstates + 1) * 544 (nbatoms + 1) * sizeof(int)); 545 if (transitions == NULL) { 546 xmlFree(stateRemap); 547 xmlFree(stringRemap); 548 xmlFree(stringMap); 549 xmlFree(ret); 550 return(NULL); 551 } 552 memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int)); 553 554 /* 555 * Allocate the transition table. The first entry for each 556 * state corresponds to the state type. 557 */ 558 transdata = NULL; 559 560 for (i = 0;i < ret->nbStates;i++) { 561 int stateno, atomno, targetno, prev; 562 xmlRegStatePtr state; 563 xmlRegTransPtr trans; 564 565 stateno = stateRemap[i]; 566 if (stateno == -1) 567 continue; 568 state = ret->states[i]; 569 570 transitions[stateno * (nbatoms + 1)] = state->type; 571 572 for (j = 0;j < state->nbTrans;j++) { 573 trans = &(state->trans[j]); 574 if ((trans->to == -1) || (trans->atom == NULL)) 575 continue; 576 atomno = stringRemap[trans->atom->no]; 577 if ((trans->atom->data != NULL) && (transdata == NULL)) { 578 transdata = (void **) xmlMalloc(nbstates * nbatoms * 579 sizeof(void *)); 580 if (transdata != NULL) 581 memset(transdata, 0, 582 nbstates * nbatoms * sizeof(void *)); 583 else { 584 xmlRegexpErrMemory(ctxt, "compiling regexp"); 585 break; 586 } 587 } 588 targetno = stateRemap[trans->to]; 589 /* 590 * if the same atom can generate transitions to 2 different 591 * states then it means the automata is not determinist and 592 * the compact form can't be used ! 593 */ 594 prev = transitions[stateno * (nbatoms + 1) + atomno + 1]; 595 if (prev != 0) { 596 if (prev != targetno + 1) { 597 ret->determinist = 0; 598 #ifdef DEBUG_COMPACTION 599 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n", 600 i, j, trans->atom->no, trans->to, atomno, targetno); 601 printf(" previous to is %d\n", prev); 602 #endif 603 if (transdata != NULL) 604 xmlFree(transdata); 605 xmlFree(transitions); 606 xmlFree(stateRemap); 607 xmlFree(stringRemap); 608 for (i = 0;i < nbatoms;i++) 609 xmlFree(stringMap[i]); 610 xmlFree(stringMap); 611 goto not_determ; 612 } 613 } else { 614 #if 0 615 printf("State %d trans %d: atom %d to %d : %d to %d\n", 616 i, j, trans->atom->no, trans->to, atomno, targetno); 617 #endif 618 transitions[stateno * (nbatoms + 1) + atomno + 1] = 619 targetno + 1; /* to avoid 0 */ 620 if (transdata != NULL) 621 transdata[stateno * nbatoms + atomno] = 622 trans->atom->data; 623 } 624 } 625 } 626 ret->determinist = 1; 627 #ifdef DEBUG_COMPACTION 628 /* 629 * Debug 630 */ 631 for (i = 0;i < nbstates;i++) { 632 for (j = 0;j < nbatoms + 1;j++) { 633 printf("%02d ", transitions[i * (nbatoms + 1) + j]); 634 } 635 printf("\n"); 636 } 637 printf("\n"); 638 #endif 639 /* 640 * Cleanup of the old data 641 */ 642 if (ret->states != NULL) { 643 for (i = 0;i < ret->nbStates;i++) 644 xmlRegFreeState(ret->states[i]); 645 xmlFree(ret->states); 646 } 647 ret->states = NULL; 648 ret->nbStates = 0; 649 if (ret->atoms != NULL) { 650 for (i = 0;i < ret->nbAtoms;i++) 651 xmlRegFreeAtom(ret->atoms[i]); 652 xmlFree(ret->atoms); 653 } 654 ret->atoms = NULL; 655 ret->nbAtoms = 0; 656 657 ret->compact = transitions; 658 ret->transdata = transdata; 659 ret->stringMap = stringMap; 660 ret->nbstrings = nbatoms; 661 ret->nbstates = nbstates; 662 xmlFree(stateRemap); 663 xmlFree(stringRemap); 664 } 665 not_determ: 666 ctxt->string = NULL; 667 ctxt->nbStates = 0; 668 ctxt->states = NULL; 669 ctxt->nbAtoms = 0; 670 ctxt->atoms = NULL; 671 ctxt->nbCounters = 0; 672 ctxt->counters = NULL; 673 return(ret); 674 } 675 676 /** 677 * xmlRegNewParserCtxt: 678 * @string: the string to parse 679 * 680 * Allocate a new regexp parser context 681 * 682 * Returns the new context or NULL in case of error 683 */ 684 static xmlRegParserCtxtPtr 685 xmlRegNewParserCtxt(const xmlChar *string) { 686 xmlRegParserCtxtPtr ret; 687 688 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt)); 689 if (ret == NULL) 690 return(NULL); 691 memset(ret, 0, sizeof(xmlRegParserCtxt)); 692 if (string != NULL) 693 ret->string = xmlStrdup(string); 694 ret->cur = ret->string; 695 ret->neg = 0; 696 ret->negs = 0; 697 ret->error = 0; 698 ret->determinist = -1; 699 return(ret); 700 } 701 702 /** 703 * xmlRegNewRange: 704 * @ctxt: the regexp parser context 705 * @neg: is that negative 706 * @type: the type of range 707 * @start: the start codepoint 708 * @end: the end codepoint 709 * 710 * Allocate a new regexp range 711 * 712 * Returns the new range or NULL in case of error 713 */ 714 static xmlRegRangePtr 715 xmlRegNewRange(xmlRegParserCtxtPtr ctxt, 716 int neg, xmlRegAtomType type, int start, int end) { 717 xmlRegRangePtr ret; 718 719 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange)); 720 if (ret == NULL) { 721 xmlRegexpErrMemory(ctxt, "allocating range"); 722 return(NULL); 723 } 724 ret->neg = neg; 725 ret->type = type; 726 ret->start = start; 727 ret->end = end; 728 return(ret); 729 } 730 731 /** 732 * xmlRegFreeRange: 733 * @range: the regexp range 734 * 735 * Free a regexp range 736 */ 737 static void 738 xmlRegFreeRange(xmlRegRangePtr range) { 739 if (range == NULL) 740 return; 741 742 if (range->blockName != NULL) 743 xmlFree(range->blockName); 744 xmlFree(range); 745 } 746 747 /** 748 * xmlRegCopyRange: 749 * @range: the regexp range 750 * 751 * Copy a regexp range 752 * 753 * Returns the new copy or NULL in case of error. 754 */ 755 static xmlRegRangePtr 756 xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) { 757 xmlRegRangePtr ret; 758 759 if (range == NULL) 760 return(NULL); 761 762 ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start, 763 range->end); 764 if (ret == NULL) 765 return(NULL); 766 if (range->blockName != NULL) { 767 ret->blockName = xmlStrdup(range->blockName); 768 if (ret->blockName == NULL) { 769 xmlRegexpErrMemory(ctxt, "allocating range"); 770 xmlRegFreeRange(ret); 771 return(NULL); 772 } 773 } 774 return(ret); 775 } 776 777 /** 778 * xmlRegNewAtom: 779 * @ctxt: the regexp parser context 780 * @type: the type of atom 781 * 782 * Allocate a new atom 783 * 784 * Returns the new atom or NULL in case of error 785 */ 786 static xmlRegAtomPtr 787 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) { 788 xmlRegAtomPtr ret; 789 790 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 791 if (ret == NULL) { 792 xmlRegexpErrMemory(ctxt, "allocating atom"); 793 return(NULL); 794 } 795 memset(ret, 0, sizeof(xmlRegAtom)); 796 ret->type = type; 797 ret->quant = XML_REGEXP_QUANT_ONCE; 798 ret->min = 0; 799 ret->max = 0; 800 return(ret); 801 } 802 803 /** 804 * xmlRegFreeAtom: 805 * @atom: the regexp atom 806 * 807 * Free a regexp atom 808 */ 809 static void 810 xmlRegFreeAtom(xmlRegAtomPtr atom) { 811 int i; 812 813 if (atom == NULL) 814 return; 815 816 for (i = 0;i < atom->nbRanges;i++) 817 xmlRegFreeRange(atom->ranges[i]); 818 if (atom->ranges != NULL) 819 xmlFree(atom->ranges); 820 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL)) 821 xmlFree(atom->valuep); 822 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL)) 823 xmlFree(atom->valuep2); 824 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL)) 825 xmlFree(atom->valuep); 826 xmlFree(atom); 827 } 828 829 /** 830 * xmlRegCopyAtom: 831 * @ctxt: the regexp parser context 832 * @atom: the oiginal atom 833 * 834 * Allocate a new regexp range 835 * 836 * Returns the new atom or NULL in case of error 837 */ 838 static xmlRegAtomPtr 839 xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 840 xmlRegAtomPtr ret; 841 842 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 843 if (ret == NULL) { 844 xmlRegexpErrMemory(ctxt, "copying atom"); 845 return(NULL); 846 } 847 memset(ret, 0, sizeof(xmlRegAtom)); 848 ret->type = atom->type; 849 ret->quant = atom->quant; 850 ret->min = atom->min; 851 ret->max = atom->max; 852 if (atom->nbRanges > 0) { 853 int i; 854 855 ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) * 856 atom->nbRanges); 857 if (ret->ranges == NULL) { 858 xmlRegexpErrMemory(ctxt, "copying atom"); 859 goto error; 860 } 861 for (i = 0;i < atom->nbRanges;i++) { 862 ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]); 863 if (ret->ranges[i] == NULL) 864 goto error; 865 ret->nbRanges = i + 1; 866 } 867 } 868 return(ret); 869 870 error: 871 xmlRegFreeAtom(ret); 872 return(NULL); 873 } 874 875 static xmlRegStatePtr 876 xmlRegNewState(xmlRegParserCtxtPtr ctxt) { 877 xmlRegStatePtr ret; 878 879 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState)); 880 if (ret == NULL) { 881 xmlRegexpErrMemory(ctxt, "allocating state"); 882 return(NULL); 883 } 884 memset(ret, 0, sizeof(xmlRegState)); 885 ret->type = XML_REGEXP_TRANS_STATE; 886 ret->mark = XML_REGEXP_MARK_NORMAL; 887 return(ret); 888 } 889 890 /** 891 * xmlRegFreeState: 892 * @state: the regexp state 893 * 894 * Free a regexp state 895 */ 896 static void 897 xmlRegFreeState(xmlRegStatePtr state) { 898 if (state == NULL) 899 return; 900 901 if (state->trans != NULL) 902 xmlFree(state->trans); 903 if (state->transTo != NULL) 904 xmlFree(state->transTo); 905 xmlFree(state); 906 } 907 908 /** 909 * xmlRegFreeParserCtxt: 910 * @ctxt: the regexp parser context 911 * 912 * Free a regexp parser context 913 */ 914 static void 915 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) { 916 int i; 917 if (ctxt == NULL) 918 return; 919 920 if (ctxt->string != NULL) 921 xmlFree(ctxt->string); 922 if (ctxt->states != NULL) { 923 for (i = 0;i < ctxt->nbStates;i++) 924 xmlRegFreeState(ctxt->states[i]); 925 xmlFree(ctxt->states); 926 } 927 if (ctxt->atoms != NULL) { 928 for (i = 0;i < ctxt->nbAtoms;i++) 929 xmlRegFreeAtom(ctxt->atoms[i]); 930 xmlFree(ctxt->atoms); 931 } 932 if (ctxt->counters != NULL) 933 xmlFree(ctxt->counters); 934 xmlFree(ctxt); 935 } 936 937 /************************************************************************ 938 * * 939 * Display of Data structures * 940 * * 941 ************************************************************************/ 942 943 static void 944 xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) { 945 switch (type) { 946 case XML_REGEXP_EPSILON: 947 fprintf(output, "epsilon "); break; 948 case XML_REGEXP_CHARVAL: 949 fprintf(output, "charval "); break; 950 case XML_REGEXP_RANGES: 951 fprintf(output, "ranges "); break; 952 case XML_REGEXP_SUBREG: 953 fprintf(output, "subexpr "); break; 954 case XML_REGEXP_STRING: 955 fprintf(output, "string "); break; 956 case XML_REGEXP_ANYCHAR: 957 fprintf(output, "anychar "); break; 958 case XML_REGEXP_ANYSPACE: 959 fprintf(output, "anyspace "); break; 960 case XML_REGEXP_NOTSPACE: 961 fprintf(output, "notspace "); break; 962 case XML_REGEXP_INITNAME: 963 fprintf(output, "initname "); break; 964 case XML_REGEXP_NOTINITNAME: 965 fprintf(output, "notinitname "); break; 966 case XML_REGEXP_NAMECHAR: 967 fprintf(output, "namechar "); break; 968 case XML_REGEXP_NOTNAMECHAR: 969 fprintf(output, "notnamechar "); break; 970 case XML_REGEXP_DECIMAL: 971 fprintf(output, "decimal "); break; 972 case XML_REGEXP_NOTDECIMAL: 973 fprintf(output, "notdecimal "); break; 974 case XML_REGEXP_REALCHAR: 975 fprintf(output, "realchar "); break; 976 case XML_REGEXP_NOTREALCHAR: 977 fprintf(output, "notrealchar "); break; 978 case XML_REGEXP_LETTER: 979 fprintf(output, "LETTER "); break; 980 case XML_REGEXP_LETTER_UPPERCASE: 981 fprintf(output, "LETTER_UPPERCASE "); break; 982 case XML_REGEXP_LETTER_LOWERCASE: 983 fprintf(output, "LETTER_LOWERCASE "); break; 984 case XML_REGEXP_LETTER_TITLECASE: 985 fprintf(output, "LETTER_TITLECASE "); break; 986 case XML_REGEXP_LETTER_MODIFIER: 987 fprintf(output, "LETTER_MODIFIER "); break; 988 case XML_REGEXP_LETTER_OTHERS: 989 fprintf(output, "LETTER_OTHERS "); break; 990 case XML_REGEXP_MARK: 991 fprintf(output, "MARK "); break; 992 case XML_REGEXP_MARK_NONSPACING: 993 fprintf(output, "MARK_NONSPACING "); break; 994 case XML_REGEXP_MARK_SPACECOMBINING: 995 fprintf(output, "MARK_SPACECOMBINING "); break; 996 case XML_REGEXP_MARK_ENCLOSING: 997 fprintf(output, "MARK_ENCLOSING "); break; 998 case XML_REGEXP_NUMBER: 999 fprintf(output, "NUMBER "); break; 1000 case XML_REGEXP_NUMBER_DECIMAL: 1001 fprintf(output, "NUMBER_DECIMAL "); break; 1002 case XML_REGEXP_NUMBER_LETTER: 1003 fprintf(output, "NUMBER_LETTER "); break; 1004 case XML_REGEXP_NUMBER_OTHERS: 1005 fprintf(output, "NUMBER_OTHERS "); break; 1006 case XML_REGEXP_PUNCT: 1007 fprintf(output, "PUNCT "); break; 1008 case XML_REGEXP_PUNCT_CONNECTOR: 1009 fprintf(output, "PUNCT_CONNECTOR "); break; 1010 case XML_REGEXP_PUNCT_DASH: 1011 fprintf(output, "PUNCT_DASH "); break; 1012 case XML_REGEXP_PUNCT_OPEN: 1013 fprintf(output, "PUNCT_OPEN "); break; 1014 case XML_REGEXP_PUNCT_CLOSE: 1015 fprintf(output, "PUNCT_CLOSE "); break; 1016 case XML_REGEXP_PUNCT_INITQUOTE: 1017 fprintf(output, "PUNCT_INITQUOTE "); break; 1018 case XML_REGEXP_PUNCT_FINQUOTE: 1019 fprintf(output, "PUNCT_FINQUOTE "); break; 1020 case XML_REGEXP_PUNCT_OTHERS: 1021 fprintf(output, "PUNCT_OTHERS "); break; 1022 case XML_REGEXP_SEPAR: 1023 fprintf(output, "SEPAR "); break; 1024 case XML_REGEXP_SEPAR_SPACE: 1025 fprintf(output, "SEPAR_SPACE "); break; 1026 case XML_REGEXP_SEPAR_LINE: 1027 fprintf(output, "SEPAR_LINE "); break; 1028 case XML_REGEXP_SEPAR_PARA: 1029 fprintf(output, "SEPAR_PARA "); break; 1030 case XML_REGEXP_SYMBOL: 1031 fprintf(output, "SYMBOL "); break; 1032 case XML_REGEXP_SYMBOL_MATH: 1033 fprintf(output, "SYMBOL_MATH "); break; 1034 case XML_REGEXP_SYMBOL_CURRENCY: 1035 fprintf(output, "SYMBOL_CURRENCY "); break; 1036 case XML_REGEXP_SYMBOL_MODIFIER: 1037 fprintf(output, "SYMBOL_MODIFIER "); break; 1038 case XML_REGEXP_SYMBOL_OTHERS: 1039 fprintf(output, "SYMBOL_OTHERS "); break; 1040 case XML_REGEXP_OTHER: 1041 fprintf(output, "OTHER "); break; 1042 case XML_REGEXP_OTHER_CONTROL: 1043 fprintf(output, "OTHER_CONTROL "); break; 1044 case XML_REGEXP_OTHER_FORMAT: 1045 fprintf(output, "OTHER_FORMAT "); break; 1046 case XML_REGEXP_OTHER_PRIVATE: 1047 fprintf(output, "OTHER_PRIVATE "); break; 1048 case XML_REGEXP_OTHER_NA: 1049 fprintf(output, "OTHER_NA "); break; 1050 case XML_REGEXP_BLOCK_NAME: 1051 fprintf(output, "BLOCK "); break; 1052 } 1053 } 1054 1055 static void 1056 xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) { 1057 switch (type) { 1058 case XML_REGEXP_QUANT_EPSILON: 1059 fprintf(output, "epsilon "); break; 1060 case XML_REGEXP_QUANT_ONCE: 1061 fprintf(output, "once "); break; 1062 case XML_REGEXP_QUANT_OPT: 1063 fprintf(output, "? "); break; 1064 case XML_REGEXP_QUANT_MULT: 1065 fprintf(output, "* "); break; 1066 case XML_REGEXP_QUANT_PLUS: 1067 fprintf(output, "+ "); break; 1068 case XML_REGEXP_QUANT_RANGE: 1069 fprintf(output, "range "); break; 1070 case XML_REGEXP_QUANT_ONCEONLY: 1071 fprintf(output, "onceonly "); break; 1072 case XML_REGEXP_QUANT_ALL: 1073 fprintf(output, "all "); break; 1074 } 1075 } 1076 static void 1077 xmlRegPrintRange(FILE *output, xmlRegRangePtr range) { 1078 fprintf(output, " range: "); 1079 if (range->neg) 1080 fprintf(output, "negative "); 1081 xmlRegPrintAtomType(output, range->type); 1082 fprintf(output, "%c - %c\n", range->start, range->end); 1083 } 1084 1085 static void 1086 xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) { 1087 fprintf(output, " atom: "); 1088 if (atom == NULL) { 1089 fprintf(output, "NULL\n"); 1090 return; 1091 } 1092 if (atom->neg) 1093 fprintf(output, "not "); 1094 xmlRegPrintAtomType(output, atom->type); 1095 xmlRegPrintQuantType(output, atom->quant); 1096 if (atom->quant == XML_REGEXP_QUANT_RANGE) 1097 fprintf(output, "%d-%d ", atom->min, atom->max); 1098 if (atom->type == XML_REGEXP_STRING) 1099 fprintf(output, "'%s' ", (char *) atom->valuep); 1100 if (atom->type == XML_REGEXP_CHARVAL) 1101 fprintf(output, "char %c\n", atom->codepoint); 1102 else if (atom->type == XML_REGEXP_RANGES) { 1103 int i; 1104 fprintf(output, "%d entries\n", atom->nbRanges); 1105 for (i = 0; i < atom->nbRanges;i++) 1106 xmlRegPrintRange(output, atom->ranges[i]); 1107 } else if (atom->type == XML_REGEXP_SUBREG) { 1108 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no); 1109 } else { 1110 fprintf(output, "\n"); 1111 } 1112 } 1113 1114 static void 1115 xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) { 1116 fprintf(output, " trans: "); 1117 if (trans == NULL) { 1118 fprintf(output, "NULL\n"); 1119 return; 1120 } 1121 if (trans->to < 0) { 1122 fprintf(output, "removed\n"); 1123 return; 1124 } 1125 if (trans->nd != 0) { 1126 if (trans->nd == 2) 1127 fprintf(output, "last not determinist, "); 1128 else 1129 fprintf(output, "not determinist, "); 1130 } 1131 if (trans->counter >= 0) { 1132 fprintf(output, "counted %d, ", trans->counter); 1133 } 1134 if (trans->count == REGEXP_ALL_COUNTER) { 1135 fprintf(output, "all transition, "); 1136 } else if (trans->count >= 0) { 1137 fprintf(output, "count based %d, ", trans->count); 1138 } 1139 if (trans->atom == NULL) { 1140 fprintf(output, "epsilon to %d\n", trans->to); 1141 return; 1142 } 1143 if (trans->atom->type == XML_REGEXP_CHARVAL) 1144 fprintf(output, "char %c ", trans->atom->codepoint); 1145 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to); 1146 } 1147 1148 static void 1149 xmlRegPrintState(FILE *output, xmlRegStatePtr state) { 1150 int i; 1151 1152 fprintf(output, " state: "); 1153 if (state == NULL) { 1154 fprintf(output, "NULL\n"); 1155 return; 1156 } 1157 if (state->type == XML_REGEXP_START_STATE) 1158 fprintf(output, "START "); 1159 if (state->type == XML_REGEXP_FINAL_STATE) 1160 fprintf(output, "FINAL "); 1161 1162 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans); 1163 for (i = 0;i < state->nbTrans; i++) { 1164 xmlRegPrintTrans(output, &(state->trans[i])); 1165 } 1166 } 1167 1168 #ifdef DEBUG_REGEXP_GRAPH 1169 static void 1170 xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { 1171 int i; 1172 1173 fprintf(output, " ctxt: "); 1174 if (ctxt == NULL) { 1175 fprintf(output, "NULL\n"); 1176 return; 1177 } 1178 fprintf(output, "'%s' ", ctxt->string); 1179 if (ctxt->error) 1180 fprintf(output, "error "); 1181 if (ctxt->neg) 1182 fprintf(output, "neg "); 1183 fprintf(output, "\n"); 1184 fprintf(output, "%d atoms:\n", ctxt->nbAtoms); 1185 for (i = 0;i < ctxt->nbAtoms; i++) { 1186 fprintf(output, " %02d ", i); 1187 xmlRegPrintAtom(output, ctxt->atoms[i]); 1188 } 1189 if (ctxt->atom != NULL) { 1190 fprintf(output, "current atom:\n"); 1191 xmlRegPrintAtom(output, ctxt->atom); 1192 } 1193 fprintf(output, "%d states:", ctxt->nbStates); 1194 if (ctxt->start != NULL) 1195 fprintf(output, " start: %d", ctxt->start->no); 1196 if (ctxt->end != NULL) 1197 fprintf(output, " end: %d", ctxt->end->no); 1198 fprintf(output, "\n"); 1199 for (i = 0;i < ctxt->nbStates; i++) { 1200 xmlRegPrintState(output, ctxt->states[i]); 1201 } 1202 fprintf(output, "%d counters:\n", ctxt->nbCounters); 1203 for (i = 0;i < ctxt->nbCounters; i++) { 1204 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min, 1205 ctxt->counters[i].max); 1206 } 1207 } 1208 #endif 1209 1210 /************************************************************************ 1211 * * 1212 * Finite Automata structures manipulations * 1213 * * 1214 ************************************************************************/ 1215 1216 static void 1217 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom, 1218 int neg, xmlRegAtomType type, int start, int end, 1219 xmlChar *blockName) { 1220 xmlRegRangePtr range; 1221 1222 if (atom == NULL) { 1223 ERROR("add range: atom is NULL"); 1224 return; 1225 } 1226 if (atom->type != XML_REGEXP_RANGES) { 1227 ERROR("add range: atom is not ranges"); 1228 return; 1229 } 1230 if (atom->maxRanges == 0) { 1231 atom->maxRanges = 4; 1232 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges * 1233 sizeof(xmlRegRangePtr)); 1234 if (atom->ranges == NULL) { 1235 xmlRegexpErrMemory(ctxt, "adding ranges"); 1236 atom->maxRanges = 0; 1237 return; 1238 } 1239 } else if (atom->nbRanges >= atom->maxRanges) { 1240 xmlRegRangePtr *tmp; 1241 atom->maxRanges *= 2; 1242 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges * 1243 sizeof(xmlRegRangePtr)); 1244 if (tmp == NULL) { 1245 xmlRegexpErrMemory(ctxt, "adding ranges"); 1246 atom->maxRanges /= 2; 1247 return; 1248 } 1249 atom->ranges = tmp; 1250 } 1251 range = xmlRegNewRange(ctxt, neg, type, start, end); 1252 if (range == NULL) 1253 return; 1254 range->blockName = blockName; 1255 atom->ranges[atom->nbRanges++] = range; 1256 1257 } 1258 1259 static int 1260 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) { 1261 if (ctxt->maxCounters == 0) { 1262 ctxt->maxCounters = 4; 1263 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters * 1264 sizeof(xmlRegCounter)); 1265 if (ctxt->counters == NULL) { 1266 xmlRegexpErrMemory(ctxt, "allocating counter"); 1267 ctxt->maxCounters = 0; 1268 return(-1); 1269 } 1270 } else if (ctxt->nbCounters >= ctxt->maxCounters) { 1271 xmlRegCounter *tmp; 1272 ctxt->maxCounters *= 2; 1273 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters * 1274 sizeof(xmlRegCounter)); 1275 if (tmp == NULL) { 1276 xmlRegexpErrMemory(ctxt, "allocating counter"); 1277 ctxt->maxCounters /= 2; 1278 return(-1); 1279 } 1280 ctxt->counters = tmp; 1281 } 1282 ctxt->counters[ctxt->nbCounters].min = -1; 1283 ctxt->counters[ctxt->nbCounters].max = -1; 1284 return(ctxt->nbCounters++); 1285 } 1286 1287 static int 1288 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 1289 if (atom == NULL) { 1290 ERROR("atom push: atom is NULL"); 1291 return(-1); 1292 } 1293 if (ctxt->maxAtoms == 0) { 1294 ctxt->maxAtoms = 4; 1295 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms * 1296 sizeof(xmlRegAtomPtr)); 1297 if (ctxt->atoms == NULL) { 1298 xmlRegexpErrMemory(ctxt, "pushing atom"); 1299 ctxt->maxAtoms = 0; 1300 return(-1); 1301 } 1302 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) { 1303 xmlRegAtomPtr *tmp; 1304 ctxt->maxAtoms *= 2; 1305 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms * 1306 sizeof(xmlRegAtomPtr)); 1307 if (tmp == NULL) { 1308 xmlRegexpErrMemory(ctxt, "allocating counter"); 1309 ctxt->maxAtoms /= 2; 1310 return(-1); 1311 } 1312 ctxt->atoms = tmp; 1313 } 1314 atom->no = ctxt->nbAtoms; 1315 ctxt->atoms[ctxt->nbAtoms++] = atom; 1316 return(0); 1317 } 1318 1319 static void 1320 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target, 1321 int from) { 1322 if (target->maxTransTo == 0) { 1323 target->maxTransTo = 8; 1324 target->transTo = (int *) xmlMalloc(target->maxTransTo * 1325 sizeof(int)); 1326 if (target->transTo == NULL) { 1327 xmlRegexpErrMemory(ctxt, "adding transition"); 1328 target->maxTransTo = 0; 1329 return; 1330 } 1331 } else if (target->nbTransTo >= target->maxTransTo) { 1332 int *tmp; 1333 target->maxTransTo *= 2; 1334 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo * 1335 sizeof(int)); 1336 if (tmp == NULL) { 1337 xmlRegexpErrMemory(ctxt, "adding transition"); 1338 target->maxTransTo /= 2; 1339 return; 1340 } 1341 target->transTo = tmp; 1342 } 1343 target->transTo[target->nbTransTo] = from; 1344 target->nbTransTo++; 1345 } 1346 1347 static void 1348 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 1349 xmlRegAtomPtr atom, xmlRegStatePtr target, 1350 int counter, int count) { 1351 1352 int nrtrans; 1353 1354 if (state == NULL) { 1355 ERROR("add state: state is NULL"); 1356 return; 1357 } 1358 if (target == NULL) { 1359 ERROR("add state: target is NULL"); 1360 return; 1361 } 1362 /* 1363 * Other routines follow the philosophy 'When in doubt, add a transition' 1364 * so we check here whether such a transition is already present and, if 1365 * so, silently ignore this request. 1366 */ 1367 1368 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) { 1369 xmlRegTransPtr trans = &(state->trans[nrtrans]); 1370 if ((trans->atom == atom) && 1371 (trans->to == target->no) && 1372 (trans->counter == counter) && 1373 (trans->count == count)) { 1374 #ifdef DEBUG_REGEXP_GRAPH 1375 printf("Ignoring duplicate transition from %d to %d\n", 1376 state->no, target->no); 1377 #endif 1378 return; 1379 } 1380 } 1381 1382 if (state->maxTrans == 0) { 1383 state->maxTrans = 8; 1384 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans * 1385 sizeof(xmlRegTrans)); 1386 if (state->trans == NULL) { 1387 xmlRegexpErrMemory(ctxt, "adding transition"); 1388 state->maxTrans = 0; 1389 return; 1390 } 1391 } else if (state->nbTrans >= state->maxTrans) { 1392 xmlRegTrans *tmp; 1393 state->maxTrans *= 2; 1394 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans * 1395 sizeof(xmlRegTrans)); 1396 if (tmp == NULL) { 1397 xmlRegexpErrMemory(ctxt, "adding transition"); 1398 state->maxTrans /= 2; 1399 return; 1400 } 1401 state->trans = tmp; 1402 } 1403 #ifdef DEBUG_REGEXP_GRAPH 1404 printf("Add trans from %d to %d ", state->no, target->no); 1405 if (count == REGEXP_ALL_COUNTER) 1406 printf("all transition\n"); 1407 else if (count >= 0) 1408 printf("count based %d\n", count); 1409 else if (counter >= 0) 1410 printf("counted %d\n", counter); 1411 else if (atom == NULL) 1412 printf("epsilon transition\n"); 1413 else if (atom != NULL) 1414 xmlRegPrintAtom(stdout, atom); 1415 #endif 1416 1417 state->trans[state->nbTrans].atom = atom; 1418 state->trans[state->nbTrans].to = target->no; 1419 state->trans[state->nbTrans].counter = counter; 1420 state->trans[state->nbTrans].count = count; 1421 state->trans[state->nbTrans].nd = 0; 1422 state->nbTrans++; 1423 xmlRegStateAddTransTo(ctxt, target, state->no); 1424 } 1425 1426 static int 1427 xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { 1428 if (state == NULL) return(-1); 1429 if (ctxt->maxStates == 0) { 1430 ctxt->maxStates = 4; 1431 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates * 1432 sizeof(xmlRegStatePtr)); 1433 if (ctxt->states == NULL) { 1434 xmlRegexpErrMemory(ctxt, "adding state"); 1435 ctxt->maxStates = 0; 1436 return(-1); 1437 } 1438 } else if (ctxt->nbStates >= ctxt->maxStates) { 1439 xmlRegStatePtr *tmp; 1440 ctxt->maxStates *= 2; 1441 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates * 1442 sizeof(xmlRegStatePtr)); 1443 if (tmp == NULL) { 1444 xmlRegexpErrMemory(ctxt, "adding state"); 1445 ctxt->maxStates /= 2; 1446 return(-1); 1447 } 1448 ctxt->states = tmp; 1449 } 1450 state->no = ctxt->nbStates; 1451 ctxt->states[ctxt->nbStates++] = state; 1452 return(0); 1453 } 1454 1455 /** 1456 * xmlFAGenerateAllTransition: 1457 * @ctxt: a regexp parser context 1458 * @from: the from state 1459 * @to: the target state or NULL for building a new one 1460 * @lax: 1461 * 1462 */ 1463 static void 1464 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, 1465 xmlRegStatePtr from, xmlRegStatePtr to, 1466 int lax) { 1467 if (to == NULL) { 1468 to = xmlRegNewState(ctxt); 1469 xmlRegStatePush(ctxt, to); 1470 ctxt->state = to; 1471 } 1472 if (lax) 1473 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER); 1474 else 1475 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER); 1476 } 1477 1478 /** 1479 * xmlFAGenerateEpsilonTransition: 1480 * @ctxt: a regexp parser context 1481 * @from: the from state 1482 * @to: the target state or NULL for building a new one 1483 * 1484 */ 1485 static void 1486 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1487 xmlRegStatePtr from, xmlRegStatePtr to) { 1488 if (to == NULL) { 1489 to = xmlRegNewState(ctxt); 1490 xmlRegStatePush(ctxt, to); 1491 ctxt->state = to; 1492 } 1493 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1); 1494 } 1495 1496 /** 1497 * xmlFAGenerateCountedEpsilonTransition: 1498 * @ctxt: a regexp parser context 1499 * @from: the from state 1500 * @to: the target state or NULL for building a new one 1501 * counter: the counter for that transition 1502 * 1503 */ 1504 static void 1505 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1506 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1507 if (to == NULL) { 1508 to = xmlRegNewState(ctxt); 1509 xmlRegStatePush(ctxt, to); 1510 ctxt->state = to; 1511 } 1512 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1); 1513 } 1514 1515 /** 1516 * xmlFAGenerateCountedTransition: 1517 * @ctxt: a regexp parser context 1518 * @from: the from state 1519 * @to: the target state or NULL for building a new one 1520 * counter: the counter for that transition 1521 * 1522 */ 1523 static void 1524 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, 1525 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1526 if (to == NULL) { 1527 to = xmlRegNewState(ctxt); 1528 xmlRegStatePush(ctxt, to); 1529 ctxt->state = to; 1530 } 1531 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter); 1532 } 1533 1534 /** 1535 * xmlFAGenerateTransitions: 1536 * @ctxt: a regexp parser context 1537 * @from: the from state 1538 * @to: the target state or NULL for building a new one 1539 * @atom: the atom generating the transition 1540 * 1541 * Returns 0 if success and -1 in case of error. 1542 */ 1543 static int 1544 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, 1545 xmlRegStatePtr to, xmlRegAtomPtr atom) { 1546 xmlRegStatePtr end; 1547 int nullable = 0; 1548 1549 if (atom == NULL) { 1550 ERROR("genrate transition: atom == NULL"); 1551 return(-1); 1552 } 1553 if (atom->type == XML_REGEXP_SUBREG) { 1554 /* 1555 * this is a subexpression handling one should not need to 1556 * create a new node except for XML_REGEXP_QUANT_RANGE. 1557 */ 1558 if (xmlRegAtomPush(ctxt, atom) < 0) { 1559 return(-1); 1560 } 1561 if ((to != NULL) && (atom->stop != to) && 1562 (atom->quant != XML_REGEXP_QUANT_RANGE)) { 1563 /* 1564 * Generate an epsilon transition to link to the target 1565 */ 1566 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1567 #ifdef DV 1568 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) && 1569 (atom->quant != XML_REGEXP_QUANT_ONCE)) { 1570 to = xmlRegNewState(ctxt); 1571 xmlRegStatePush(ctxt, to); 1572 ctxt->state = to; 1573 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1574 #endif 1575 } 1576 switch (atom->quant) { 1577 case XML_REGEXP_QUANT_OPT: 1578 atom->quant = XML_REGEXP_QUANT_ONCE; 1579 /* 1580 * transition done to the state after end of atom. 1581 * 1. set transition from atom start to new state 1582 * 2. set transition from atom end to this state. 1583 */ 1584 if (to == NULL) { 1585 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0); 1586 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, 1587 ctxt->state); 1588 } else { 1589 xmlFAGenerateEpsilonTransition(ctxt, atom->start, to); 1590 } 1591 break; 1592 case XML_REGEXP_QUANT_MULT: 1593 atom->quant = XML_REGEXP_QUANT_ONCE; 1594 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1595 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1596 break; 1597 case XML_REGEXP_QUANT_PLUS: 1598 atom->quant = XML_REGEXP_QUANT_ONCE; 1599 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1600 break; 1601 case XML_REGEXP_QUANT_RANGE: { 1602 int counter; 1603 xmlRegStatePtr inter, newstate; 1604 1605 /* 1606 * create the final state now if needed 1607 */ 1608 if (to != NULL) { 1609 newstate = to; 1610 } else { 1611 newstate = xmlRegNewState(ctxt); 1612 xmlRegStatePush(ctxt, newstate); 1613 } 1614 1615 /* 1616 * The principle here is to use counted transition 1617 * to avoid explosion in the number of states in the 1618 * graph. This is clearly more complex but should not 1619 * be exploitable at runtime. 1620 */ 1621 if ((atom->min == 0) && (atom->start0 == NULL)) { 1622 xmlRegAtomPtr copy; 1623 /* 1624 * duplicate a transition based on atom to count next 1625 * occurences after 1. We cannot loop to atom->start 1626 * directly because we need an epsilon transition to 1627 * newstate. 1628 */ 1629 /* ???? For some reason it seems we never reach that 1630 case, I suppose this got optimized out before when 1631 building the automata */ 1632 copy = xmlRegCopyAtom(ctxt, atom); 1633 if (copy == NULL) 1634 return(-1); 1635 copy->quant = XML_REGEXP_QUANT_ONCE; 1636 copy->min = 0; 1637 copy->max = 0; 1638 1639 if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy) 1640 < 0) 1641 return(-1); 1642 inter = ctxt->state; 1643 counter = xmlRegGetCounter(ctxt); 1644 ctxt->counters[counter].min = atom->min - 1; 1645 ctxt->counters[counter].max = atom->max - 1; 1646 /* count the number of times we see it again */ 1647 xmlFAGenerateCountedEpsilonTransition(ctxt, inter, 1648 atom->stop, counter); 1649 /* allow a way out based on the count */ 1650 xmlFAGenerateCountedTransition(ctxt, inter, 1651 newstate, counter); 1652 /* and also allow a direct exit for 0 */ 1653 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 1654 newstate); 1655 } else { 1656 /* 1657 * either we need the atom at least once or there 1658 * is an atom->start0 allowing to easilly plug the 1659 * epsilon transition. 1660 */ 1661 counter = xmlRegGetCounter(ctxt); 1662 ctxt->counters[counter].min = atom->min - 1; 1663 ctxt->counters[counter].max = atom->max - 1; 1664 /* count the number of times we see it again */ 1665 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, 1666 atom->start, counter); 1667 /* allow a way out based on the count */ 1668 xmlFAGenerateCountedTransition(ctxt, atom->stop, 1669 newstate, counter); 1670 /* and if needed allow a direct exit for 0 */ 1671 if (atom->min == 0) 1672 xmlFAGenerateEpsilonTransition(ctxt, atom->start0, 1673 newstate); 1674 1675 } 1676 atom->min = 0; 1677 atom->max = 0; 1678 atom->quant = XML_REGEXP_QUANT_ONCE; 1679 ctxt->state = newstate; 1680 } 1681 default: 1682 break; 1683 } 1684 return(0); 1685 } 1686 if ((atom->min == 0) && (atom->max == 0) && 1687 (atom->quant == XML_REGEXP_QUANT_RANGE)) { 1688 /* 1689 * we can discard the atom and generate an epsilon transition instead 1690 */ 1691 if (to == NULL) { 1692 to = xmlRegNewState(ctxt); 1693 if (to != NULL) 1694 xmlRegStatePush(ctxt, to); 1695 else { 1696 return(-1); 1697 } 1698 } 1699 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1700 ctxt->state = to; 1701 xmlRegFreeAtom(atom); 1702 return(0); 1703 } 1704 if (to == NULL) { 1705 to = xmlRegNewState(ctxt); 1706 if (to != NULL) 1707 xmlRegStatePush(ctxt, to); 1708 else { 1709 return(-1); 1710 } 1711 } 1712 end = to; 1713 if ((atom->quant == XML_REGEXP_QUANT_MULT) || 1714 (atom->quant == XML_REGEXP_QUANT_PLUS)) { 1715 /* 1716 * Do not pollute the target state by adding transitions from 1717 * it as it is likely to be the shared target of multiple branches. 1718 * So isolate with an epsilon transition. 1719 */ 1720 xmlRegStatePtr tmp; 1721 1722 tmp = xmlRegNewState(ctxt); 1723 if (tmp != NULL) 1724 xmlRegStatePush(ctxt, tmp); 1725 else { 1726 return(-1); 1727 } 1728 xmlFAGenerateEpsilonTransition(ctxt, tmp, to); 1729 to = tmp; 1730 } 1731 if (xmlRegAtomPush(ctxt, atom) < 0) { 1732 return(-1); 1733 } 1734 if ((atom->quant == XML_REGEXP_QUANT_RANGE) && 1735 (atom->min == 0) && (atom->max > 0)) { 1736 nullable = 1; 1737 atom->min = 1; 1738 if (atom->max == 1) 1739 atom->quant = XML_REGEXP_QUANT_OPT; 1740 } 1741 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1); 1742 ctxt->state = end; 1743 switch (atom->quant) { 1744 case XML_REGEXP_QUANT_OPT: 1745 atom->quant = XML_REGEXP_QUANT_ONCE; 1746 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1747 break; 1748 case XML_REGEXP_QUANT_MULT: 1749 atom->quant = XML_REGEXP_QUANT_ONCE; 1750 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1751 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1752 break; 1753 case XML_REGEXP_QUANT_PLUS: 1754 atom->quant = XML_REGEXP_QUANT_ONCE; 1755 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1756 break; 1757 case XML_REGEXP_QUANT_RANGE: 1758 if (nullable) 1759 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1760 break; 1761 default: 1762 break; 1763 } 1764 return(0); 1765 } 1766 1767 /** 1768 * xmlFAReduceEpsilonTransitions: 1769 * @ctxt: a regexp parser context 1770 * @fromnr: the from state 1771 * @tonr: the to state 1772 * @counter: should that transition be associated to a counted 1773 * 1774 */ 1775 static void 1776 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, 1777 int tonr, int counter) { 1778 int transnr; 1779 xmlRegStatePtr from; 1780 xmlRegStatePtr to; 1781 1782 #ifdef DEBUG_REGEXP_GRAPH 1783 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr); 1784 #endif 1785 from = ctxt->states[fromnr]; 1786 if (from == NULL) 1787 return; 1788 to = ctxt->states[tonr]; 1789 if (to == NULL) 1790 return; 1791 if ((to->mark == XML_REGEXP_MARK_START) || 1792 (to->mark == XML_REGEXP_MARK_VISITED)) 1793 return; 1794 1795 to->mark = XML_REGEXP_MARK_VISITED; 1796 if (to->type == XML_REGEXP_FINAL_STATE) { 1797 #ifdef DEBUG_REGEXP_GRAPH 1798 printf("State %d is final, so %d becomes final\n", tonr, fromnr); 1799 #endif 1800 from->type = XML_REGEXP_FINAL_STATE; 1801 } 1802 for (transnr = 0;transnr < to->nbTrans;transnr++) { 1803 if (to->trans[transnr].to < 0) 1804 continue; 1805 if (to->trans[transnr].atom == NULL) { 1806 /* 1807 * Don't remove counted transitions 1808 * Don't loop either 1809 */ 1810 if (to->trans[transnr].to != fromnr) { 1811 if (to->trans[transnr].count >= 0) { 1812 int newto = to->trans[transnr].to; 1813 1814 xmlRegStateAddTrans(ctxt, from, NULL, 1815 ctxt->states[newto], 1816 -1, to->trans[transnr].count); 1817 } else { 1818 #ifdef DEBUG_REGEXP_GRAPH 1819 printf("Found epsilon trans %d from %d to %d\n", 1820 transnr, tonr, to->trans[transnr].to); 1821 #endif 1822 if (to->trans[transnr].counter >= 0) { 1823 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1824 to->trans[transnr].to, 1825 to->trans[transnr].counter); 1826 } else { 1827 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1828 to->trans[transnr].to, 1829 counter); 1830 } 1831 } 1832 } 1833 } else { 1834 int newto = to->trans[transnr].to; 1835 1836 if (to->trans[transnr].counter >= 0) { 1837 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1838 ctxt->states[newto], 1839 to->trans[transnr].counter, -1); 1840 } else { 1841 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1842 ctxt->states[newto], counter, -1); 1843 } 1844 } 1845 } 1846 to->mark = XML_REGEXP_MARK_NORMAL; 1847 } 1848 1849 /** 1850 * xmlFAEliminateSimpleEpsilonTransitions: 1851 * @ctxt: a regexp parser context 1852 * 1853 * Eliminating general epsilon transitions can get costly in the general 1854 * algorithm due to the large amount of generated new transitions and 1855 * associated comparisons. However for simple epsilon transition used just 1856 * to separate building blocks when generating the automata this can be 1857 * reduced to state elimination: 1858 * - if there exists an epsilon from X to Y 1859 * - if there is no other transition from X 1860 * then X and Y are semantically equivalent and X can be eliminated 1861 * If X is the start state then make Y the start state, else replace the 1862 * target of all transitions to X by transitions to Y. 1863 */ 1864 static void 1865 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1866 int statenr, i, j, newto; 1867 xmlRegStatePtr state, tmp; 1868 1869 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1870 state = ctxt->states[statenr]; 1871 if (state == NULL) 1872 continue; 1873 if (state->nbTrans != 1) 1874 continue; 1875 if (state->type == XML_REGEXP_UNREACH_STATE) 1876 continue; 1877 /* is the only transition out a basic transition */ 1878 if ((state->trans[0].atom == NULL) && 1879 (state->trans[0].to >= 0) && 1880 (state->trans[0].to != statenr) && 1881 (state->trans[0].counter < 0) && 1882 (state->trans[0].count < 0)) { 1883 newto = state->trans[0].to; 1884 1885 if (state->type == XML_REGEXP_START_STATE) { 1886 #ifdef DEBUG_REGEXP_GRAPH 1887 printf("Found simple epsilon trans from start %d to %d\n", 1888 statenr, newto); 1889 #endif 1890 } else { 1891 #ifdef DEBUG_REGEXP_GRAPH 1892 printf("Found simple epsilon trans from %d to %d\n", 1893 statenr, newto); 1894 #endif 1895 for (i = 0;i < state->nbTransTo;i++) { 1896 tmp = ctxt->states[state->transTo[i]]; 1897 for (j = 0;j < tmp->nbTrans;j++) { 1898 if (tmp->trans[j].to == statenr) { 1899 #ifdef DEBUG_REGEXP_GRAPH 1900 printf("Changed transition %d on %d to go to %d\n", 1901 j, tmp->no, newto); 1902 #endif 1903 tmp->trans[j].to = -1; 1904 xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom, 1905 ctxt->states[newto], 1906 tmp->trans[j].counter, 1907 tmp->trans[j].count); 1908 } 1909 } 1910 } 1911 if (state->type == XML_REGEXP_FINAL_STATE) 1912 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE; 1913 /* eliminate the transition completely */ 1914 state->nbTrans = 0; 1915 1916 state->type = XML_REGEXP_UNREACH_STATE; 1917 1918 } 1919 1920 } 1921 } 1922 } 1923 /** 1924 * xmlFAEliminateEpsilonTransitions: 1925 * @ctxt: a regexp parser context 1926 * 1927 */ 1928 static void 1929 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1930 int statenr, transnr; 1931 xmlRegStatePtr state; 1932 int has_epsilon; 1933 1934 if (ctxt->states == NULL) return; 1935 1936 /* 1937 * Eliminate simple epsilon transition and the associated unreachable 1938 * states. 1939 */ 1940 xmlFAEliminateSimpleEpsilonTransitions(ctxt); 1941 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1942 state = ctxt->states[statenr]; 1943 if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) { 1944 #ifdef DEBUG_REGEXP_GRAPH 1945 printf("Removed unreachable state %d\n", statenr); 1946 #endif 1947 xmlRegFreeState(state); 1948 ctxt->states[statenr] = NULL; 1949 } 1950 } 1951 1952 has_epsilon = 0; 1953 1954 /* 1955 * Build the completed transitions bypassing the epsilons 1956 * Use a marking algorithm to avoid loops 1957 * Mark sink states too. 1958 * Process from the latests states backward to the start when 1959 * there is long cascading epsilon chains this minimize the 1960 * recursions and transition compares when adding the new ones 1961 */ 1962 for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) { 1963 state = ctxt->states[statenr]; 1964 if (state == NULL) 1965 continue; 1966 if ((state->nbTrans == 0) && 1967 (state->type != XML_REGEXP_FINAL_STATE)) { 1968 state->type = XML_REGEXP_SINK_STATE; 1969 } 1970 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1971 if ((state->trans[transnr].atom == NULL) && 1972 (state->trans[transnr].to >= 0)) { 1973 if (state->trans[transnr].to == statenr) { 1974 state->trans[transnr].to = -1; 1975 #ifdef DEBUG_REGEXP_GRAPH 1976 printf("Removed loopback epsilon trans %d on %d\n", 1977 transnr, statenr); 1978 #endif 1979 } else if (state->trans[transnr].count < 0) { 1980 int newto = state->trans[transnr].to; 1981 1982 #ifdef DEBUG_REGEXP_GRAPH 1983 printf("Found epsilon trans %d from %d to %d\n", 1984 transnr, statenr, newto); 1985 #endif 1986 has_epsilon = 1; 1987 state->trans[transnr].to = -2; 1988 state->mark = XML_REGEXP_MARK_START; 1989 xmlFAReduceEpsilonTransitions(ctxt, statenr, 1990 newto, state->trans[transnr].counter); 1991 state->mark = XML_REGEXP_MARK_NORMAL; 1992 #ifdef DEBUG_REGEXP_GRAPH 1993 } else { 1994 printf("Found counted transition %d on %d\n", 1995 transnr, statenr); 1996 #endif 1997 } 1998 } 1999 } 2000 } 2001 /* 2002 * Eliminate the epsilon transitions 2003 */ 2004 if (has_epsilon) { 2005 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2006 state = ctxt->states[statenr]; 2007 if (state == NULL) 2008 continue; 2009 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2010 xmlRegTransPtr trans = &(state->trans[transnr]); 2011 if ((trans->atom == NULL) && 2012 (trans->count < 0) && 2013 (trans->to >= 0)) { 2014 trans->to = -1; 2015 } 2016 } 2017 } 2018 } 2019 2020 /* 2021 * Use this pass to detect unreachable states too 2022 */ 2023 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2024 state = ctxt->states[statenr]; 2025 if (state != NULL) 2026 state->reached = XML_REGEXP_MARK_NORMAL; 2027 } 2028 state = ctxt->states[0]; 2029 if (state != NULL) 2030 state->reached = XML_REGEXP_MARK_START; 2031 while (state != NULL) { 2032 xmlRegStatePtr target = NULL; 2033 state->reached = XML_REGEXP_MARK_VISITED; 2034 /* 2035 * Mark all states reachable from the current reachable state 2036 */ 2037 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2038 if ((state->trans[transnr].to >= 0) && 2039 ((state->trans[transnr].atom != NULL) || 2040 (state->trans[transnr].count >= 0))) { 2041 int newto = state->trans[transnr].to; 2042 2043 if (ctxt->states[newto] == NULL) 2044 continue; 2045 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) { 2046 ctxt->states[newto]->reached = XML_REGEXP_MARK_START; 2047 target = ctxt->states[newto]; 2048 } 2049 } 2050 } 2051 2052 /* 2053 * find the next accessible state not explored 2054 */ 2055 if (target == NULL) { 2056 for (statenr = 1;statenr < ctxt->nbStates;statenr++) { 2057 state = ctxt->states[statenr]; 2058 if ((state != NULL) && (state->reached == 2059 XML_REGEXP_MARK_START)) { 2060 target = state; 2061 break; 2062 } 2063 } 2064 } 2065 state = target; 2066 } 2067 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2068 state = ctxt->states[statenr]; 2069 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) { 2070 #ifdef DEBUG_REGEXP_GRAPH 2071 printf("Removed unreachable state %d\n", statenr); 2072 #endif 2073 xmlRegFreeState(state); 2074 ctxt->states[statenr] = NULL; 2075 } 2076 } 2077 2078 } 2079 2080 static int 2081 xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) { 2082 int ret = 0; 2083 2084 if ((range1->type == XML_REGEXP_RANGES) || 2085 (range2->type == XML_REGEXP_RANGES) || 2086 (range2->type == XML_REGEXP_SUBREG) || 2087 (range1->type == XML_REGEXP_SUBREG) || 2088 (range1->type == XML_REGEXP_STRING) || 2089 (range2->type == XML_REGEXP_STRING)) 2090 return(-1); 2091 2092 /* put them in order */ 2093 if (range1->type > range2->type) { 2094 xmlRegRangePtr tmp; 2095 2096 tmp = range1; 2097 range1 = range2; 2098 range2 = tmp; 2099 } 2100 if ((range1->type == XML_REGEXP_ANYCHAR) || 2101 (range2->type == XML_REGEXP_ANYCHAR)) { 2102 ret = 1; 2103 } else if ((range1->type == XML_REGEXP_EPSILON) || 2104 (range2->type == XML_REGEXP_EPSILON)) { 2105 return(0); 2106 } else if (range1->type == range2->type) { 2107 if (range1->type != XML_REGEXP_CHARVAL) 2108 ret = 1; 2109 else if ((range1->end < range2->start) || 2110 (range2->end < range1->start)) 2111 ret = 0; 2112 else 2113 ret = 1; 2114 } else if (range1->type == XML_REGEXP_CHARVAL) { 2115 int codepoint; 2116 int neg = 0; 2117 2118 /* 2119 * just check all codepoints in the range for acceptance, 2120 * this is usually way cheaper since done only once at 2121 * compilation than testing over and over at runtime or 2122 * pushing too many states when evaluating. 2123 */ 2124 if (((range1->neg == 0) && (range2->neg != 0)) || 2125 ((range1->neg != 0) && (range2->neg == 0))) 2126 neg = 1; 2127 2128 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) { 2129 ret = xmlRegCheckCharacterRange(range2->type, codepoint, 2130 0, range2->start, range2->end, 2131 range2->blockName); 2132 if (ret < 0) 2133 return(-1); 2134 if (((neg == 1) && (ret == 0)) || 2135 ((neg == 0) && (ret == 1))) 2136 return(1); 2137 } 2138 return(0); 2139 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) || 2140 (range2->type == XML_REGEXP_BLOCK_NAME)) { 2141 if (range1->type == range2->type) { 2142 ret = xmlStrEqual(range1->blockName, range2->blockName); 2143 } else { 2144 /* 2145 * comparing a block range with anything else is way 2146 * too costly, and maintining the table is like too much 2147 * memory too, so let's force the automata to save state 2148 * here. 2149 */ 2150 return(1); 2151 } 2152 } else if ((range1->type < XML_REGEXP_LETTER) || 2153 (range2->type < XML_REGEXP_LETTER)) { 2154 if ((range1->type == XML_REGEXP_ANYSPACE) && 2155 (range2->type == XML_REGEXP_NOTSPACE)) 2156 ret = 0; 2157 else if ((range1->type == XML_REGEXP_INITNAME) && 2158 (range2->type == XML_REGEXP_NOTINITNAME)) 2159 ret = 0; 2160 else if ((range1->type == XML_REGEXP_NAMECHAR) && 2161 (range2->type == XML_REGEXP_NOTNAMECHAR)) 2162 ret = 0; 2163 else if ((range1->type == XML_REGEXP_DECIMAL) && 2164 (range2->type == XML_REGEXP_NOTDECIMAL)) 2165 ret = 0; 2166 else if ((range1->type == XML_REGEXP_REALCHAR) && 2167 (range2->type == XML_REGEXP_NOTREALCHAR)) 2168 ret = 0; 2169 else { 2170 /* same thing to limit complexity */ 2171 return(1); 2172 } 2173 } else { 2174 ret = 0; 2175 /* range1->type < range2->type here */ 2176 switch (range1->type) { 2177 case XML_REGEXP_LETTER: 2178 /* all disjoint except in the subgroups */ 2179 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) || 2180 (range2->type == XML_REGEXP_LETTER_LOWERCASE) || 2181 (range2->type == XML_REGEXP_LETTER_TITLECASE) || 2182 (range2->type == XML_REGEXP_LETTER_MODIFIER) || 2183 (range2->type == XML_REGEXP_LETTER_OTHERS)) 2184 ret = 1; 2185 break; 2186 case XML_REGEXP_MARK: 2187 if ((range2->type == XML_REGEXP_MARK_NONSPACING) || 2188 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) || 2189 (range2->type == XML_REGEXP_MARK_ENCLOSING)) 2190 ret = 1; 2191 break; 2192 case XML_REGEXP_NUMBER: 2193 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) || 2194 (range2->type == XML_REGEXP_NUMBER_LETTER) || 2195 (range2->type == XML_REGEXP_NUMBER_OTHERS)) 2196 ret = 1; 2197 break; 2198 case XML_REGEXP_PUNCT: 2199 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) || 2200 (range2->type == XML_REGEXP_PUNCT_DASH) || 2201 (range2->type == XML_REGEXP_PUNCT_OPEN) || 2202 (range2->type == XML_REGEXP_PUNCT_CLOSE) || 2203 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) || 2204 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) || 2205 (range2->type == XML_REGEXP_PUNCT_OTHERS)) 2206 ret = 1; 2207 break; 2208 case XML_REGEXP_SEPAR: 2209 if ((range2->type == XML_REGEXP_SEPAR_SPACE) || 2210 (range2->type == XML_REGEXP_SEPAR_LINE) || 2211 (range2->type == XML_REGEXP_SEPAR_PARA)) 2212 ret = 1; 2213 break; 2214 case XML_REGEXP_SYMBOL: 2215 if ((range2->type == XML_REGEXP_SYMBOL_MATH) || 2216 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) || 2217 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) || 2218 (range2->type == XML_REGEXP_SYMBOL_OTHERS)) 2219 ret = 1; 2220 break; 2221 case XML_REGEXP_OTHER: 2222 if ((range2->type == XML_REGEXP_OTHER_CONTROL) || 2223 (range2->type == XML_REGEXP_OTHER_FORMAT) || 2224 (range2->type == XML_REGEXP_OTHER_PRIVATE)) 2225 ret = 1; 2226 break; 2227 default: 2228 if ((range2->type >= XML_REGEXP_LETTER) && 2229 (range2->type < XML_REGEXP_BLOCK_NAME)) 2230 ret = 0; 2231 else { 2232 /* safety net ! */ 2233 return(1); 2234 } 2235 } 2236 } 2237 if (((range1->neg == 0) && (range2->neg != 0)) || 2238 ((range1->neg != 0) && (range2->neg == 0))) 2239 ret = !ret; 2240 return(ret); 2241 } 2242 2243 /** 2244 * xmlFACompareAtomTypes: 2245 * @type1: an atom type 2246 * @type2: an atom type 2247 * 2248 * Compares two atoms type to check whether they intersect in some ways, 2249 * this is used by xmlFACompareAtoms only 2250 * 2251 * Returns 1 if they may intersect and 0 otherwise 2252 */ 2253 static int 2254 xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) { 2255 if ((type1 == XML_REGEXP_EPSILON) || 2256 (type1 == XML_REGEXP_CHARVAL) || 2257 (type1 == XML_REGEXP_RANGES) || 2258 (type1 == XML_REGEXP_SUBREG) || 2259 (type1 == XML_REGEXP_STRING) || 2260 (type1 == XML_REGEXP_ANYCHAR)) 2261 return(1); 2262 if ((type2 == XML_REGEXP_EPSILON) || 2263 (type2 == XML_REGEXP_CHARVAL) || 2264 (type2 == XML_REGEXP_RANGES) || 2265 (type2 == XML_REGEXP_SUBREG) || 2266 (type2 == XML_REGEXP_STRING) || 2267 (type2 == XML_REGEXP_ANYCHAR)) 2268 return(1); 2269 2270 if (type1 == type2) return(1); 2271 2272 /* simplify subsequent compares by making sure type1 < type2 */ 2273 if (type1 > type2) { 2274 xmlRegAtomType tmp = type1; 2275 type1 = type2; 2276 type2 = tmp; 2277 } 2278 switch (type1) { 2279 case XML_REGEXP_ANYSPACE: /* \s */ 2280 /* can't be a letter, number, mark, pontuation, symbol */ 2281 if ((type2 == XML_REGEXP_NOTSPACE) || 2282 ((type2 >= XML_REGEXP_LETTER) && 2283 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2284 ((type2 >= XML_REGEXP_NUMBER) && 2285 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2286 ((type2 >= XML_REGEXP_MARK) && 2287 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2288 ((type2 >= XML_REGEXP_PUNCT) && 2289 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2290 ((type2 >= XML_REGEXP_SYMBOL) && 2291 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) 2292 ) return(0); 2293 break; 2294 case XML_REGEXP_NOTSPACE: /* \S */ 2295 break; 2296 case XML_REGEXP_INITNAME: /* \l */ 2297 /* can't be a number, mark, separator, pontuation, symbol or other */ 2298 if ((type2 == XML_REGEXP_NOTINITNAME) || 2299 ((type2 >= XML_REGEXP_NUMBER) && 2300 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2301 ((type2 >= XML_REGEXP_MARK) && 2302 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2303 ((type2 >= XML_REGEXP_SEPAR) && 2304 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2305 ((type2 >= XML_REGEXP_PUNCT) && 2306 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2307 ((type2 >= XML_REGEXP_SYMBOL) && 2308 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2309 ((type2 >= XML_REGEXP_OTHER) && 2310 (type2 <= XML_REGEXP_OTHER_NA)) 2311 ) return(0); 2312 break; 2313 case XML_REGEXP_NOTINITNAME: /* \L */ 2314 break; 2315 case XML_REGEXP_NAMECHAR: /* \c */ 2316 /* can't be a mark, separator, pontuation, symbol or other */ 2317 if ((type2 == XML_REGEXP_NOTNAMECHAR) || 2318 ((type2 >= XML_REGEXP_MARK) && 2319 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2320 ((type2 >= XML_REGEXP_PUNCT) && 2321 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2322 ((type2 >= XML_REGEXP_SEPAR) && 2323 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2324 ((type2 >= XML_REGEXP_SYMBOL) && 2325 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2326 ((type2 >= XML_REGEXP_OTHER) && 2327 (type2 <= XML_REGEXP_OTHER_NA)) 2328 ) return(0); 2329 break; 2330 case XML_REGEXP_NOTNAMECHAR: /* \C */ 2331 break; 2332 case XML_REGEXP_DECIMAL: /* \d */ 2333 /* can't be a letter, mark, separator, pontuation, symbol or other */ 2334 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2335 (type2 == XML_REGEXP_REALCHAR) || 2336 ((type2 >= XML_REGEXP_LETTER) && 2337 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2338 ((type2 >= XML_REGEXP_MARK) && 2339 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2340 ((type2 >= XML_REGEXP_PUNCT) && 2341 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2342 ((type2 >= XML_REGEXP_SEPAR) && 2343 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2344 ((type2 >= XML_REGEXP_SYMBOL) && 2345 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2346 ((type2 >= XML_REGEXP_OTHER) && 2347 (type2 <= XML_REGEXP_OTHER_NA)) 2348 )return(0); 2349 break; 2350 case XML_REGEXP_NOTDECIMAL: /* \D */ 2351 break; 2352 case XML_REGEXP_REALCHAR: /* \w */ 2353 /* can't be a mark, separator, pontuation, symbol or other */ 2354 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2355 ((type2 >= XML_REGEXP_MARK) && 2356 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2357 ((type2 >= XML_REGEXP_PUNCT) && 2358 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2359 ((type2 >= XML_REGEXP_SEPAR) && 2360 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2361 ((type2 >= XML_REGEXP_SYMBOL) && 2362 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2363 ((type2 >= XML_REGEXP_OTHER) && 2364 (type2 <= XML_REGEXP_OTHER_NA)) 2365 )return(0); 2366 break; 2367 case XML_REGEXP_NOTREALCHAR: /* \W */ 2368 break; 2369 /* 2370 * at that point we know both type 1 and type2 are from 2371 * character categories are ordered and are different, 2372 * it becomes simple because this is a partition 2373 */ 2374 case XML_REGEXP_LETTER: 2375 if (type2 <= XML_REGEXP_LETTER_OTHERS) 2376 return(1); 2377 return(0); 2378 case XML_REGEXP_LETTER_UPPERCASE: 2379 case XML_REGEXP_LETTER_LOWERCASE: 2380 case XML_REGEXP_LETTER_TITLECASE: 2381 case XML_REGEXP_LETTER_MODIFIER: 2382 case XML_REGEXP_LETTER_OTHERS: 2383 return(0); 2384 case XML_REGEXP_MARK: 2385 if (type2 <= XML_REGEXP_MARK_ENCLOSING) 2386 return(1); 2387 return(0); 2388 case XML_REGEXP_MARK_NONSPACING: 2389 case XML_REGEXP_MARK_SPACECOMBINING: 2390 case XML_REGEXP_MARK_ENCLOSING: 2391 return(0); 2392 case XML_REGEXP_NUMBER: 2393 if (type2 <= XML_REGEXP_NUMBER_OTHERS) 2394 return(1); 2395 return(0); 2396 case XML_REGEXP_NUMBER_DECIMAL: 2397 case XML_REGEXP_NUMBER_LETTER: 2398 case XML_REGEXP_NUMBER_OTHERS: 2399 return(0); 2400 case XML_REGEXP_PUNCT: 2401 if (type2 <= XML_REGEXP_PUNCT_OTHERS) 2402 return(1); 2403 return(0); 2404 case XML_REGEXP_PUNCT_CONNECTOR: 2405 case XML_REGEXP_PUNCT_DASH: 2406 case XML_REGEXP_PUNCT_OPEN: 2407 case XML_REGEXP_PUNCT_CLOSE: 2408 case XML_REGEXP_PUNCT_INITQUOTE: 2409 case XML_REGEXP_PUNCT_FINQUOTE: 2410 case XML_REGEXP_PUNCT_OTHERS: 2411 return(0); 2412 case XML_REGEXP_SEPAR: 2413 if (type2 <= XML_REGEXP_SEPAR_PARA) 2414 return(1); 2415 return(0); 2416 case XML_REGEXP_SEPAR_SPACE: 2417 case XML_REGEXP_SEPAR_LINE: 2418 case XML_REGEXP_SEPAR_PARA: 2419 return(0); 2420 case XML_REGEXP_SYMBOL: 2421 if (type2 <= XML_REGEXP_SYMBOL_OTHERS) 2422 return(1); 2423 return(0); 2424 case XML_REGEXP_SYMBOL_MATH: 2425 case XML_REGEXP_SYMBOL_CURRENCY: 2426 case XML_REGEXP_SYMBOL_MODIFIER: 2427 case XML_REGEXP_SYMBOL_OTHERS: 2428 return(0); 2429 case XML_REGEXP_OTHER: 2430 if (type2 <= XML_REGEXP_OTHER_NA) 2431 return(1); 2432 return(0); 2433 case XML_REGEXP_OTHER_CONTROL: 2434 case XML_REGEXP_OTHER_FORMAT: 2435 case XML_REGEXP_OTHER_PRIVATE: 2436 case XML_REGEXP_OTHER_NA: 2437 return(0); 2438 default: 2439 break; 2440 } 2441 return(1); 2442 } 2443 2444 /** 2445 * xmlFAEqualAtoms: 2446 * @atom1: an atom 2447 * @atom2: an atom 2448 * @deep: if not set only compare string pointers 2449 * 2450 * Compares two atoms to check whether they are the same exactly 2451 * this is used to remove equivalent transitions 2452 * 2453 * Returns 1 if same and 0 otherwise 2454 */ 2455 static int 2456 xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) { 2457 int ret = 0; 2458 2459 if (atom1 == atom2) 2460 return(1); 2461 if ((atom1 == NULL) || (atom2 == NULL)) 2462 return(0); 2463 2464 if (atom1->type != atom2->type) 2465 return(0); 2466 switch (atom1->type) { 2467 case XML_REGEXP_EPSILON: 2468 ret = 0; 2469 break; 2470 case XML_REGEXP_STRING: 2471 if (!deep) 2472 ret = (atom1->valuep == atom2->valuep); 2473 else 2474 ret = xmlStrEqual((xmlChar *)atom1->valuep, 2475 (xmlChar *)atom2->valuep); 2476 break; 2477 case XML_REGEXP_CHARVAL: 2478 ret = (atom1->codepoint == atom2->codepoint); 2479 break; 2480 case XML_REGEXP_RANGES: 2481 /* too hard to do in the general case */ 2482 ret = 0; 2483 default: 2484 break; 2485 } 2486 return(ret); 2487 } 2488 2489 /** 2490 * xmlFACompareAtoms: 2491 * @atom1: an atom 2492 * @atom2: an atom 2493 * @deep: if not set only compare string pointers 2494 * 2495 * Compares two atoms to check whether they intersect in some ways, 2496 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only 2497 * 2498 * Returns 1 if yes and 0 otherwise 2499 */ 2500 static int 2501 xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) { 2502 int ret = 1; 2503 2504 if (atom1 == atom2) 2505 return(1); 2506 if ((atom1 == NULL) || (atom2 == NULL)) 2507 return(0); 2508 2509 if ((atom1->type == XML_REGEXP_ANYCHAR) || 2510 (atom2->type == XML_REGEXP_ANYCHAR)) 2511 return(1); 2512 2513 if (atom1->type > atom2->type) { 2514 xmlRegAtomPtr tmp; 2515 tmp = atom1; 2516 atom1 = atom2; 2517 atom2 = tmp; 2518 } 2519 if (atom1->type != atom2->type) { 2520 ret = xmlFACompareAtomTypes(atom1->type, atom2->type); 2521 /* if they can't intersect at the type level break now */ 2522 if (ret == 0) 2523 return(0); 2524 } 2525 switch (atom1->type) { 2526 case XML_REGEXP_STRING: 2527 if (!deep) 2528 ret = (atom1->valuep != atom2->valuep); 2529 else 2530 ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep, 2531 (xmlChar *)atom2->valuep); 2532 break; 2533 case XML_REGEXP_EPSILON: 2534 goto not_determinist; 2535 case XML_REGEXP_CHARVAL: 2536 if (atom2->type == XML_REGEXP_CHARVAL) { 2537 ret = (atom1->codepoint == atom2->codepoint); 2538 } else { 2539 ret = xmlRegCheckCharacter(atom2, atom1->codepoint); 2540 if (ret < 0) 2541 ret = 1; 2542 } 2543 break; 2544 case XML_REGEXP_RANGES: 2545 if (atom2->type == XML_REGEXP_RANGES) { 2546 int i, j, res; 2547 xmlRegRangePtr r1, r2; 2548 2549 /* 2550 * need to check that none of the ranges eventually matches 2551 */ 2552 for (i = 0;i < atom1->nbRanges;i++) { 2553 for (j = 0;j < atom2->nbRanges;j++) { 2554 r1 = atom1->ranges[i]; 2555 r2 = atom2->ranges[j]; 2556 res = xmlFACompareRanges(r1, r2); 2557 if (res == 1) { 2558 ret = 1; 2559 goto done; 2560 } 2561 } 2562 } 2563 ret = 0; 2564 } 2565 break; 2566 default: 2567 goto not_determinist; 2568 } 2569 done: 2570 if (atom1->neg != atom2->neg) { 2571 ret = !ret; 2572 } 2573 if (ret == 0) 2574 return(0); 2575 not_determinist: 2576 return(1); 2577 } 2578 2579 /** 2580 * xmlFARecurseDeterminism: 2581 * @ctxt: a regexp parser context 2582 * 2583 * Check whether the associated regexp is determinist, 2584 * should be called after xmlFAEliminateEpsilonTransitions() 2585 * 2586 */ 2587 static int 2588 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 2589 int to, xmlRegAtomPtr atom) { 2590 int ret = 1; 2591 int res; 2592 int transnr, nbTrans; 2593 xmlRegTransPtr t1; 2594 int deep = 1; 2595 2596 if (state == NULL) 2597 return(ret); 2598 if (state->markd == XML_REGEXP_MARK_VISITED) 2599 return(ret); 2600 2601 if (ctxt->flags & AM_AUTOMATA_RNG) 2602 deep = 0; 2603 2604 /* 2605 * don't recurse on transitions potentially added in the course of 2606 * the elimination. 2607 */ 2608 nbTrans = state->nbTrans; 2609 for (transnr = 0;transnr < nbTrans;transnr++) { 2610 t1 = &(state->trans[transnr]); 2611 /* 2612 * check transitions conflicting with the one looked at 2613 */ 2614 if (t1->atom == NULL) { 2615 if (t1->to < 0) 2616 continue; 2617 state->markd = XML_REGEXP_MARK_VISITED; 2618 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2619 to, atom); 2620 state->markd = 0; 2621 if (res == 0) { 2622 ret = 0; 2623 /* t1->nd = 1; */ 2624 } 2625 continue; 2626 } 2627 if (t1->to != to) 2628 continue; 2629 if (xmlFACompareAtoms(t1->atom, atom, deep)) { 2630 ret = 0; 2631 /* mark the transition as non-deterministic */ 2632 t1->nd = 1; 2633 } 2634 } 2635 return(ret); 2636 } 2637 2638 /** 2639 * xmlFAComputesDeterminism: 2640 * @ctxt: a regexp parser context 2641 * 2642 * Check whether the associated regexp is determinist, 2643 * should be called after xmlFAEliminateEpsilonTransitions() 2644 * 2645 */ 2646 static int 2647 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { 2648 int statenr, transnr; 2649 xmlRegStatePtr state; 2650 xmlRegTransPtr t1, t2, last; 2651 int i; 2652 int ret = 1; 2653 int deep = 1; 2654 2655 #ifdef DEBUG_REGEXP_GRAPH 2656 printf("xmlFAComputesDeterminism\n"); 2657 xmlRegPrintCtxt(stdout, ctxt); 2658 #endif 2659 if (ctxt->determinist != -1) 2660 return(ctxt->determinist); 2661 2662 if (ctxt->flags & AM_AUTOMATA_RNG) 2663 deep = 0; 2664 2665 /* 2666 * First cleanup the automata removing cancelled transitions 2667 */ 2668 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2669 state = ctxt->states[statenr]; 2670 if (state == NULL) 2671 continue; 2672 if (state->nbTrans < 2) 2673 continue; 2674 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2675 t1 = &(state->trans[transnr]); 2676 /* 2677 * Determinism checks in case of counted or all transitions 2678 * will have to be handled separately 2679 */ 2680 if (t1->atom == NULL) { 2681 /* t1->nd = 1; */ 2682 continue; 2683 } 2684 if (t1->to == -1) /* eliminated */ 2685 continue; 2686 for (i = 0;i < transnr;i++) { 2687 t2 = &(state->trans[i]); 2688 if (t2->to == -1) /* eliminated */ 2689 continue; 2690 if (t2->atom != NULL) { 2691 if (t1->to == t2->to) { 2692 /* 2693 * Here we use deep because we want to keep the 2694 * transitions which indicate a conflict 2695 */ 2696 if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) && 2697 (t1->counter == t2->counter) && 2698 (t1->count == t2->count)) 2699 t2->to = -1; /* eliminated */ 2700 } 2701 } 2702 } 2703 } 2704 } 2705 2706 /* 2707 * Check for all states that there aren't 2 transitions 2708 * with the same atom and a different target. 2709 */ 2710 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2711 state = ctxt->states[statenr]; 2712 if (state == NULL) 2713 continue; 2714 if (state->nbTrans < 2) 2715 continue; 2716 last = NULL; 2717 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2718 t1 = &(state->trans[transnr]); 2719 /* 2720 * Determinism checks in case of counted or all transitions 2721 * will have to be handled separately 2722 */ 2723 if (t1->atom == NULL) { 2724 continue; 2725 } 2726 if (t1->to == -1) /* eliminated */ 2727 continue; 2728 for (i = 0;i < transnr;i++) { 2729 t2 = &(state->trans[i]); 2730 if (t2->to == -1) /* eliminated */ 2731 continue; 2732 if (t2->atom != NULL) { 2733 /* 2734 * But here we don't use deep because we want to 2735 * find transitions which indicate a conflict 2736 */ 2737 if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) { 2738 ret = 0; 2739 /* mark the transitions as non-deterministic ones */ 2740 t1->nd = 1; 2741 t2->nd = 1; 2742 last = t1; 2743 } 2744 } else if (t1->to != -1) { 2745 /* 2746 * do the closure in case of remaining specific 2747 * epsilon transitions like choices or all 2748 */ 2749 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2750 t2->to, t2->atom); 2751 /* don't shortcut the computation so all non deterministic 2752 transition get marked down 2753 if (ret == 0) 2754 return(0); 2755 */ 2756 if (ret == 0) { 2757 t1->nd = 1; 2758 /* t2->nd = 1; */ 2759 last = t1; 2760 } 2761 } 2762 } 2763 /* don't shortcut the computation so all non deterministic 2764 transition get marked down 2765 if (ret == 0) 2766 break; */ 2767 } 2768 2769 /* 2770 * mark specifically the last non-deterministic transition 2771 * from a state since there is no need to set-up rollback 2772 * from it 2773 */ 2774 if (last != NULL) { 2775 last->nd = 2; 2776 } 2777 2778 /* don't shortcut the computation so all non deterministic 2779 transition get marked down 2780 if (ret == 0) 2781 break; */ 2782 } 2783 2784 ctxt->determinist = ret; 2785 return(ret); 2786 } 2787 2788 /************************************************************************ 2789 * * 2790 * Routines to check input against transition atoms * 2791 * * 2792 ************************************************************************/ 2793 2794 static int 2795 xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg, 2796 int start, int end, const xmlChar *blockName) { 2797 int ret = 0; 2798 2799 switch (type) { 2800 case XML_REGEXP_STRING: 2801 case XML_REGEXP_SUBREG: 2802 case XML_REGEXP_RANGES: 2803 case XML_REGEXP_EPSILON: 2804 return(-1); 2805 case XML_REGEXP_ANYCHAR: 2806 ret = ((codepoint != '\n') && (codepoint != '\r')); 2807 break; 2808 case XML_REGEXP_CHARVAL: 2809 ret = ((codepoint >= start) && (codepoint <= end)); 2810 break; 2811 case XML_REGEXP_NOTSPACE: 2812 neg = !neg; 2813 case XML_REGEXP_ANYSPACE: 2814 ret = ((codepoint == '\n') || (codepoint == '\r') || 2815 (codepoint == '\t') || (codepoint == ' ')); 2816 break; 2817 case XML_REGEXP_NOTINITNAME: 2818 neg = !neg; 2819 case XML_REGEXP_INITNAME: 2820 ret = (IS_LETTER(codepoint) || 2821 (codepoint == '_') || (codepoint == ':')); 2822 break; 2823 case XML_REGEXP_NOTNAMECHAR: 2824 neg = !neg; 2825 case XML_REGEXP_NAMECHAR: 2826 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) || 2827 (codepoint == '.') || (codepoint == '-') || 2828 (codepoint == '_') || (codepoint == ':') || 2829 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint)); 2830 break; 2831 case XML_REGEXP_NOTDECIMAL: 2832 neg = !neg; 2833 case XML_REGEXP_DECIMAL: 2834 ret = xmlUCSIsCatNd(codepoint); 2835 break; 2836 case XML_REGEXP_REALCHAR: 2837 neg = !neg; 2838 case XML_REGEXP_NOTREALCHAR: 2839 ret = xmlUCSIsCatP(codepoint); 2840 if (ret == 0) 2841 ret = xmlUCSIsCatZ(codepoint); 2842 if (ret == 0) 2843 ret = xmlUCSIsCatC(codepoint); 2844 break; 2845 case XML_REGEXP_LETTER: 2846 ret = xmlUCSIsCatL(codepoint); 2847 break; 2848 case XML_REGEXP_LETTER_UPPERCASE: 2849 ret = xmlUCSIsCatLu(codepoint); 2850 break; 2851 case XML_REGEXP_LETTER_LOWERCASE: 2852 ret = xmlUCSIsCatLl(codepoint); 2853 break; 2854 case XML_REGEXP_LETTER_TITLECASE: 2855 ret = xmlUCSIsCatLt(codepoint); 2856 break; 2857 case XML_REGEXP_LETTER_MODIFIER: 2858 ret = xmlUCSIsCatLm(codepoint); 2859 break; 2860 case XML_REGEXP_LETTER_OTHERS: 2861 ret = xmlUCSIsCatLo(codepoint); 2862 break; 2863 case XML_REGEXP_MARK: 2864 ret = xmlUCSIsCatM(codepoint); 2865 break; 2866 case XML_REGEXP_MARK_NONSPACING: 2867 ret = xmlUCSIsCatMn(codepoint); 2868 break; 2869 case XML_REGEXP_MARK_SPACECOMBINING: 2870 ret = xmlUCSIsCatMc(codepoint); 2871 break; 2872 case XML_REGEXP_MARK_ENCLOSING: 2873 ret = xmlUCSIsCatMe(codepoint); 2874 break; 2875 case XML_REGEXP_NUMBER: 2876 ret = xmlUCSIsCatN(codepoint); 2877 break; 2878 case XML_REGEXP_NUMBER_DECIMAL: 2879 ret = xmlUCSIsCatNd(codepoint); 2880 break; 2881 case XML_REGEXP_NUMBER_LETTER: 2882 ret = xmlUCSIsCatNl(codepoint); 2883 break; 2884 case XML_REGEXP_NUMBER_OTHERS: 2885 ret = xmlUCSIsCatNo(codepoint); 2886 break; 2887 case XML_REGEXP_PUNCT: 2888 ret = xmlUCSIsCatP(codepoint); 2889 break; 2890 case XML_REGEXP_PUNCT_CONNECTOR: 2891 ret = xmlUCSIsCatPc(codepoint); 2892 break; 2893 case XML_REGEXP_PUNCT_DASH: 2894 ret = xmlUCSIsCatPd(codepoint); 2895 break; 2896 case XML_REGEXP_PUNCT_OPEN: 2897 ret = xmlUCSIsCatPs(codepoint); 2898 break; 2899 case XML_REGEXP_PUNCT_CLOSE: 2900 ret = xmlUCSIsCatPe(codepoint); 2901 break; 2902 case XML_REGEXP_PUNCT_INITQUOTE: 2903 ret = xmlUCSIsCatPi(codepoint); 2904 break; 2905 case XML_REGEXP_PUNCT_FINQUOTE: 2906 ret = xmlUCSIsCatPf(codepoint); 2907 break; 2908 case XML_REGEXP_PUNCT_OTHERS: 2909 ret = xmlUCSIsCatPo(codepoint); 2910 break; 2911 case XML_REGEXP_SEPAR: 2912 ret = xmlUCSIsCatZ(codepoint); 2913 break; 2914 case XML_REGEXP_SEPAR_SPACE: 2915 ret = xmlUCSIsCatZs(codepoint); 2916 break; 2917 case XML_REGEXP_SEPAR_LINE: 2918 ret = xmlUCSIsCatZl(codepoint); 2919 break; 2920 case XML_REGEXP_SEPAR_PARA: 2921 ret = xmlUCSIsCatZp(codepoint); 2922 break; 2923 case XML_REGEXP_SYMBOL: 2924 ret = xmlUCSIsCatS(codepoint); 2925 break; 2926 case XML_REGEXP_SYMBOL_MATH: 2927 ret = xmlUCSIsCatSm(codepoint); 2928 break; 2929 case XML_REGEXP_SYMBOL_CURRENCY: 2930 ret = xmlUCSIsCatSc(codepoint); 2931 break; 2932 case XML_REGEXP_SYMBOL_MODIFIER: 2933 ret = xmlUCSIsCatSk(codepoint); 2934 break; 2935 case XML_REGEXP_SYMBOL_OTHERS: 2936 ret = xmlUCSIsCatSo(codepoint); 2937 break; 2938 case XML_REGEXP_OTHER: 2939 ret = xmlUCSIsCatC(codepoint); 2940 break; 2941 case XML_REGEXP_OTHER_CONTROL: 2942 ret = xmlUCSIsCatCc(codepoint); 2943 break; 2944 case XML_REGEXP_OTHER_FORMAT: 2945 ret = xmlUCSIsCatCf(codepoint); 2946 break; 2947 case XML_REGEXP_OTHER_PRIVATE: 2948 ret = xmlUCSIsCatCo(codepoint); 2949 break; 2950 case XML_REGEXP_OTHER_NA: 2951 /* ret = xmlUCSIsCatCn(codepoint); */ 2952 /* Seems it doesn't exist anymore in recent Unicode releases */ 2953 ret = 0; 2954 break; 2955 case XML_REGEXP_BLOCK_NAME: 2956 ret = xmlUCSIsBlock(codepoint, (const char *) blockName); 2957 break; 2958 } 2959 if (neg) 2960 return(!ret); 2961 return(ret); 2962 } 2963 2964 static int 2965 xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) { 2966 int i, ret = 0; 2967 xmlRegRangePtr range; 2968 2969 if ((atom == NULL) || (!IS_CHAR(codepoint))) 2970 return(-1); 2971 2972 switch (atom->type) { 2973 case XML_REGEXP_SUBREG: 2974 case XML_REGEXP_EPSILON: 2975 return(-1); 2976 case XML_REGEXP_CHARVAL: 2977 return(codepoint == atom->codepoint); 2978 case XML_REGEXP_RANGES: { 2979 int accept = 0; 2980 2981 for (i = 0;i < atom->nbRanges;i++) { 2982 range = atom->ranges[i]; 2983 if (range->neg == 2) { 2984 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2985 0, range->start, range->end, 2986 range->blockName); 2987 if (ret != 0) 2988 return(0); /* excluded char */ 2989 } else if (range->neg) { 2990 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2991 0, range->start, range->end, 2992 range->blockName); 2993 if (ret == 0) 2994 accept = 1; 2995 else 2996 return(0); 2997 } else { 2998 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2999 0, range->start, range->end, 3000 range->blockName); 3001 if (ret != 0) 3002 accept = 1; /* might still be excluded */ 3003 } 3004 } 3005 return(accept); 3006 } 3007 case XML_REGEXP_STRING: 3008 printf("TODO: XML_REGEXP_STRING\n"); 3009 return(-1); 3010 case XML_REGEXP_ANYCHAR: 3011 case XML_REGEXP_ANYSPACE: 3012 case XML_REGEXP_NOTSPACE: 3013 case XML_REGEXP_INITNAME: 3014 case XML_REGEXP_NOTINITNAME: 3015 case XML_REGEXP_NAMECHAR: 3016 case XML_REGEXP_NOTNAMECHAR: 3017 case XML_REGEXP_DECIMAL: 3018 case XML_REGEXP_NOTDECIMAL: 3019 case XML_REGEXP_REALCHAR: 3020 case XML_REGEXP_NOTREALCHAR: 3021 case XML_REGEXP_LETTER: 3022 case XML_REGEXP_LETTER_UPPERCASE: 3023 case XML_REGEXP_LETTER_LOWERCASE: 3024 case XML_REGEXP_LETTER_TITLECASE: 3025 case XML_REGEXP_LETTER_MODIFIER: 3026 case XML_REGEXP_LETTER_OTHERS: 3027 case XML_REGEXP_MARK: 3028 case XML_REGEXP_MARK_NONSPACING: 3029 case XML_REGEXP_MARK_SPACECOMBINING: 3030 case XML_REGEXP_MARK_ENCLOSING: 3031 case XML_REGEXP_NUMBER: 3032 case XML_REGEXP_NUMBER_DECIMAL: 3033 case XML_REGEXP_NUMBER_LETTER: 3034 case XML_REGEXP_NUMBER_OTHERS: 3035 case XML_REGEXP_PUNCT: 3036 case XML_REGEXP_PUNCT_CONNECTOR: 3037 case XML_REGEXP_PUNCT_DASH: 3038 case XML_REGEXP_PUNCT_OPEN: 3039 case XML_REGEXP_PUNCT_CLOSE: 3040 case XML_REGEXP_PUNCT_INITQUOTE: 3041 case XML_REGEXP_PUNCT_FINQUOTE: 3042 case XML_REGEXP_PUNCT_OTHERS: 3043 case XML_REGEXP_SEPAR: 3044 case XML_REGEXP_SEPAR_SPACE: 3045 case XML_REGEXP_SEPAR_LINE: 3046 case XML_REGEXP_SEPAR_PARA: 3047 case XML_REGEXP_SYMBOL: 3048 case XML_REGEXP_SYMBOL_MATH: 3049 case XML_REGEXP_SYMBOL_CURRENCY: 3050 case XML_REGEXP_SYMBOL_MODIFIER: 3051 case XML_REGEXP_SYMBOL_OTHERS: 3052 case XML_REGEXP_OTHER: 3053 case XML_REGEXP_OTHER_CONTROL: 3054 case XML_REGEXP_OTHER_FORMAT: 3055 case XML_REGEXP_OTHER_PRIVATE: 3056 case XML_REGEXP_OTHER_NA: 3057 case XML_REGEXP_BLOCK_NAME: 3058 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0, 3059 (const xmlChar *)atom->valuep); 3060 if (atom->neg) 3061 ret = !ret; 3062 break; 3063 } 3064 return(ret); 3065 } 3066 3067 /************************************************************************ 3068 * * 3069 * Saving and restoring state of an execution context * 3070 * * 3071 ************************************************************************/ 3072 3073 #ifdef DEBUG_REGEXP_EXEC 3074 static void 3075 xmlFARegDebugExec(xmlRegExecCtxtPtr exec) { 3076 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index); 3077 if (exec->inputStack != NULL) { 3078 int i; 3079 printf(": "); 3080 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) 3081 printf("%s ", (const char *) 3082 exec->inputStack[exec->inputStackNr - (i + 1)].value); 3083 } else { 3084 printf(": %s", &(exec->inputString[exec->index])); 3085 } 3086 printf("\n"); 3087 } 3088 #endif 3089 3090 static void 3091 xmlFARegExecSave(xmlRegExecCtxtPtr exec) { 3092 #ifdef DEBUG_REGEXP_EXEC 3093 printf("saving "); 3094 exec->transno++; 3095 xmlFARegDebugExec(exec); 3096 exec->transno--; 3097 #endif 3098 #ifdef MAX_PUSH 3099 if (exec->nbPush > MAX_PUSH) { 3100 return; 3101 } 3102 exec->nbPush++; 3103 #endif 3104 3105 if (exec->maxRollbacks == 0) { 3106 exec->maxRollbacks = 4; 3107 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks * 3108 sizeof(xmlRegExecRollback)); 3109 if (exec->rollbacks == NULL) { 3110 xmlRegexpErrMemory(NULL, "saving regexp"); 3111 exec->maxRollbacks = 0; 3112 return; 3113 } 3114 memset(exec->rollbacks, 0, 3115 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 3116 } else if (exec->nbRollbacks >= exec->maxRollbacks) { 3117 xmlRegExecRollback *tmp; 3118 int len = exec->maxRollbacks; 3119 3120 exec->maxRollbacks *= 2; 3121 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks, 3122 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 3123 if (tmp == NULL) { 3124 xmlRegexpErrMemory(NULL, "saving regexp"); 3125 exec->maxRollbacks /= 2; 3126 return; 3127 } 3128 exec->rollbacks = tmp; 3129 tmp = &exec->rollbacks[len]; 3130 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback)); 3131 } 3132 exec->rollbacks[exec->nbRollbacks].state = exec->state; 3133 exec->rollbacks[exec->nbRollbacks].index = exec->index; 3134 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1; 3135 if (exec->comp->nbCounters > 0) { 3136 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3137 exec->rollbacks[exec->nbRollbacks].counts = (int *) 3138 xmlMalloc(exec->comp->nbCounters * sizeof(int)); 3139 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3140 xmlRegexpErrMemory(NULL, "saving regexp"); 3141 exec->status = -5; 3142 return; 3143 } 3144 } 3145 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts, 3146 exec->comp->nbCounters * sizeof(int)); 3147 } 3148 exec->nbRollbacks++; 3149 } 3150 3151 static void 3152 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) { 3153 if (exec->nbRollbacks <= 0) { 3154 exec->status = -1; 3155 #ifdef DEBUG_REGEXP_EXEC 3156 printf("rollback failed on empty stack\n"); 3157 #endif 3158 return; 3159 } 3160 exec->nbRollbacks--; 3161 exec->state = exec->rollbacks[exec->nbRollbacks].state; 3162 exec->index = exec->rollbacks[exec->nbRollbacks].index; 3163 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch; 3164 if (exec->comp->nbCounters > 0) { 3165 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3166 fprintf(stderr, "exec save: allocation failed"); 3167 exec->status = -6; 3168 return; 3169 } 3170 if (exec->counts) { 3171 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts, 3172 exec->comp->nbCounters * sizeof(int)); 3173 } 3174 } 3175 3176 #ifdef DEBUG_REGEXP_EXEC 3177 printf("restored "); 3178 xmlFARegDebugExec(exec); 3179 #endif 3180 } 3181 3182 /************************************************************************ 3183 * * 3184 * Verifier, running an input against a compiled regexp * 3185 * * 3186 ************************************************************************/ 3187 3188 static int 3189 xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { 3190 xmlRegExecCtxt execval; 3191 xmlRegExecCtxtPtr exec = &execval; 3192 int ret, codepoint = 0, len, deter; 3193 3194 exec->inputString = content; 3195 exec->index = 0; 3196 exec->nbPush = 0; 3197 exec->determinist = 1; 3198 exec->maxRollbacks = 0; 3199 exec->nbRollbacks = 0; 3200 exec->rollbacks = NULL; 3201 exec->status = 0; 3202 exec->comp = comp; 3203 exec->state = comp->states[0]; 3204 exec->transno = 0; 3205 exec->transcount = 0; 3206 exec->inputStack = NULL; 3207 exec->inputStackMax = 0; 3208 if (comp->nbCounters > 0) { 3209 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 3210 if (exec->counts == NULL) { 3211 xmlRegexpErrMemory(NULL, "running regexp"); 3212 return(-1); 3213 } 3214 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 3215 } else 3216 exec->counts = NULL; 3217 while ((exec->status == 0) && (exec->state != NULL) && 3218 ((exec->inputString[exec->index] != 0) || 3219 ((exec->state != NULL) && 3220 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3221 xmlRegTransPtr trans; 3222 xmlRegAtomPtr atom; 3223 3224 /* 3225 * If end of input on non-terminal state, rollback, however we may 3226 * still have epsilon like transition for counted transitions 3227 * on counters, in that case don't break too early. Additionally, 3228 * if we are working on a range like "AB{0,2}", where B is not present, 3229 * we don't want to break. 3230 */ 3231 len = 1; 3232 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) { 3233 /* 3234 * if there is a transition, we must check if 3235 * atom allows minOccurs of 0 3236 */ 3237 if (exec->transno < exec->state->nbTrans) { 3238 trans = &exec->state->trans[exec->transno]; 3239 if (trans->to >=0) { 3240 atom = trans->atom; 3241 if (!((atom->min == 0) && (atom->max > 0))) 3242 goto rollback; 3243 } 3244 } else 3245 goto rollback; 3246 } 3247 3248 exec->transcount = 0; 3249 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3250 trans = &exec->state->trans[exec->transno]; 3251 if (trans->to < 0) 3252 continue; 3253 atom = trans->atom; 3254 ret = 0; 3255 deter = 1; 3256 if (trans->count >= 0) { 3257 int count; 3258 xmlRegCounterPtr counter; 3259 3260 if (exec->counts == NULL) { 3261 exec->status = -1; 3262 goto error; 3263 } 3264 /* 3265 * A counted transition. 3266 */ 3267 3268 count = exec->counts[trans->count]; 3269 counter = &exec->comp->counters[trans->count]; 3270 #ifdef DEBUG_REGEXP_EXEC 3271 printf("testing count %d: val %d, min %d, max %d\n", 3272 trans->count, count, counter->min, counter->max); 3273 #endif 3274 ret = ((count >= counter->min) && (count <= counter->max)); 3275 if ((ret) && (counter->min != counter->max)) 3276 deter = 0; 3277 } else if (atom == NULL) { 3278 fprintf(stderr, "epsilon transition left at runtime\n"); 3279 exec->status = -2; 3280 break; 3281 } else if (exec->inputString[exec->index] != 0) { 3282 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 3283 ret = xmlRegCheckCharacter(atom, codepoint); 3284 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) { 3285 xmlRegStatePtr to = comp->states[trans->to]; 3286 3287 /* 3288 * this is a multiple input sequence 3289 * If there is a counter associated increment it now. 3290 * before potentially saving and rollback 3291 * do not increment if the counter is already over the 3292 * maximum limit in which case get to next transition 3293 */ 3294 if (trans->counter >= 0) { 3295 xmlRegCounterPtr counter; 3296 3297 if ((exec->counts == NULL) || 3298 (exec->comp == NULL) || 3299 (exec->comp->counters == NULL)) { 3300 exec->status = -1; 3301 goto error; 3302 } 3303 counter = &exec->comp->counters[trans->counter]; 3304 if (exec->counts[trans->counter] >= counter->max) 3305 continue; /* for loop on transitions */ 3306 3307 #ifdef DEBUG_REGEXP_EXEC 3308 printf("Increasing count %d\n", trans->counter); 3309 #endif 3310 exec->counts[trans->counter]++; 3311 } 3312 if (exec->state->nbTrans > exec->transno + 1) { 3313 xmlFARegExecSave(exec); 3314 } 3315 exec->transcount = 1; 3316 do { 3317 /* 3318 * Try to progress as much as possible on the input 3319 */ 3320 if (exec->transcount == atom->max) { 3321 break; 3322 } 3323 exec->index += len; 3324 /* 3325 * End of input: stop here 3326 */ 3327 if (exec->inputString[exec->index] == 0) { 3328 exec->index -= len; 3329 break; 3330 } 3331 if (exec->transcount >= atom->min) { 3332 int transno = exec->transno; 3333 xmlRegStatePtr state = exec->state; 3334 3335 /* 3336 * The transition is acceptable save it 3337 */ 3338 exec->transno = -1; /* trick */ 3339 exec->state = to; 3340 xmlFARegExecSave(exec); 3341 exec->transno = transno; 3342 exec->state = state; 3343 } 3344 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 3345 len); 3346 ret = xmlRegCheckCharacter(atom, codepoint); 3347 exec->transcount++; 3348 } while (ret == 1); 3349 if (exec->transcount < atom->min) 3350 ret = 0; 3351 3352 /* 3353 * If the last check failed but one transition was found 3354 * possible, rollback 3355 */ 3356 if (ret < 0) 3357 ret = 0; 3358 if (ret == 0) { 3359 goto rollback; 3360 } 3361 if (trans->counter >= 0) { 3362 if (exec->counts == NULL) { 3363 exec->status = -1; 3364 goto error; 3365 } 3366 #ifdef DEBUG_REGEXP_EXEC 3367 printf("Decreasing count %d\n", trans->counter); 3368 #endif 3369 exec->counts[trans->counter]--; 3370 } 3371 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) { 3372 /* 3373 * we don't match on the codepoint, but minOccurs of 0 3374 * says that's ok. Setting len to 0 inhibits stepping 3375 * over the codepoint. 3376 */ 3377 exec->transcount = 1; 3378 len = 0; 3379 ret = 1; 3380 } 3381 } else if ((atom->min == 0) && (atom->max > 0)) { 3382 /* another spot to match when minOccurs is 0 */ 3383 exec->transcount = 1; 3384 len = 0; 3385 ret = 1; 3386 } 3387 if (ret == 1) { 3388 if ((trans->nd == 1) || 3389 ((trans->count >= 0) && (deter == 0) && 3390 (exec->state->nbTrans > exec->transno + 1))) { 3391 #ifdef DEBUG_REGEXP_EXEC 3392 if (trans->nd == 1) 3393 printf("Saving on nd transition atom %d for %c at %d\n", 3394 trans->atom->no, codepoint, exec->index); 3395 else 3396 printf("Saving on counted transition count %d for %c at %d\n", 3397 trans->count, codepoint, exec->index); 3398 #endif 3399 xmlFARegExecSave(exec); 3400 } 3401 if (trans->counter >= 0) { 3402 xmlRegCounterPtr counter; 3403 3404 /* make sure we don't go over the counter maximum value */ 3405 if ((exec->counts == NULL) || 3406 (exec->comp == NULL) || 3407 (exec->comp->counters == NULL)) { 3408 exec->status = -1; 3409 goto error; 3410 } 3411 counter = &exec->comp->counters[trans->counter]; 3412 if (exec->counts[trans->counter] >= counter->max) 3413 continue; /* for loop on transitions */ 3414 #ifdef DEBUG_REGEXP_EXEC 3415 printf("Increasing count %d\n", trans->counter); 3416 #endif 3417 exec->counts[trans->counter]++; 3418 } 3419 if ((trans->count >= 0) && 3420 (trans->count < REGEXP_ALL_COUNTER)) { 3421 if (exec->counts == NULL) { 3422 exec->status = -1; 3423 goto error; 3424 } 3425 #ifdef DEBUG_REGEXP_EXEC 3426 printf("resetting count %d on transition\n", 3427 trans->count); 3428 #endif 3429 exec->counts[trans->count] = 0; 3430 } 3431 #ifdef DEBUG_REGEXP_EXEC 3432 printf("entering state %d\n", trans->to); 3433 #endif 3434 exec->state = comp->states[trans->to]; 3435 exec->transno = 0; 3436 if (trans->atom != NULL) { 3437 exec->index += len; 3438 } 3439 goto progress; 3440 } else if (ret < 0) { 3441 exec->status = -4; 3442 break; 3443 } 3444 } 3445 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 3446 rollback: 3447 /* 3448 * Failed to find a way out 3449 */ 3450 exec->determinist = 0; 3451 #ifdef DEBUG_REGEXP_EXEC 3452 printf("rollback from state %d on %d:%c\n", exec->state->no, 3453 codepoint,codepoint); 3454 #endif 3455 xmlFARegExecRollBack(exec); 3456 } 3457 progress: 3458 continue; 3459 } 3460 error: 3461 if (exec->rollbacks != NULL) { 3462 if (exec->counts != NULL) { 3463 int i; 3464 3465 for (i = 0;i < exec->maxRollbacks;i++) 3466 if (exec->rollbacks[i].counts != NULL) 3467 xmlFree(exec->rollbacks[i].counts); 3468 } 3469 xmlFree(exec->rollbacks); 3470 } 3471 if (exec->state == NULL) 3472 return(-1); 3473 if (exec->counts != NULL) 3474 xmlFree(exec->counts); 3475 if (exec->status == 0) 3476 return(1); 3477 if (exec->status == -1) { 3478 if (exec->nbPush > MAX_PUSH) 3479 return(-1); 3480 return(0); 3481 } 3482 return(exec->status); 3483 } 3484 3485 /************************************************************************ 3486 * * 3487 * Progressive interface to the verifier one atom at a time * 3488 * * 3489 ************************************************************************/ 3490 #ifdef DEBUG_ERR 3491 static void testerr(xmlRegExecCtxtPtr exec); 3492 #endif 3493 3494 /** 3495 * xmlRegNewExecCtxt: 3496 * @comp: a precompiled regular expression 3497 * @callback: a callback function used for handling progresses in the 3498 * automata matching phase 3499 * @data: the context data associated to the callback in this context 3500 * 3501 * Build a context used for progressive evaluation of a regexp. 3502 * 3503 * Returns the new context 3504 */ 3505 xmlRegExecCtxtPtr 3506 xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) { 3507 xmlRegExecCtxtPtr exec; 3508 3509 if (comp == NULL) 3510 return(NULL); 3511 if ((comp->compact == NULL) && (comp->states == NULL)) 3512 return(NULL); 3513 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt)); 3514 if (exec == NULL) { 3515 xmlRegexpErrMemory(NULL, "creating execution context"); 3516 return(NULL); 3517 } 3518 memset(exec, 0, sizeof(xmlRegExecCtxt)); 3519 exec->inputString = NULL; 3520 exec->index = 0; 3521 exec->determinist = 1; 3522 exec->maxRollbacks = 0; 3523 exec->nbRollbacks = 0; 3524 exec->rollbacks = NULL; 3525 exec->status = 0; 3526 exec->comp = comp; 3527 if (comp->compact == NULL) 3528 exec->state = comp->states[0]; 3529 exec->transno = 0; 3530 exec->transcount = 0; 3531 exec->callback = callback; 3532 exec->data = data; 3533 if (comp->nbCounters > 0) { 3534 /* 3535 * For error handling, exec->counts is allocated twice the size 3536 * the second half is used to store the data in case of rollback 3537 */ 3538 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int) 3539 * 2); 3540 if (exec->counts == NULL) { 3541 xmlRegexpErrMemory(NULL, "creating execution context"); 3542 xmlFree(exec); 3543 return(NULL); 3544 } 3545 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2); 3546 exec->errCounts = &exec->counts[comp->nbCounters]; 3547 } else { 3548 exec->counts = NULL; 3549 exec->errCounts = NULL; 3550 } 3551 exec->inputStackMax = 0; 3552 exec->inputStackNr = 0; 3553 exec->inputStack = NULL; 3554 exec->errStateNo = -1; 3555 exec->errString = NULL; 3556 exec->nbPush = 0; 3557 return(exec); 3558 } 3559 3560 /** 3561 * xmlRegFreeExecCtxt: 3562 * @exec: a regular expression evaulation context 3563 * 3564 * Free the structures associated to a regular expression evaulation context. 3565 */ 3566 void 3567 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) { 3568 if (exec == NULL) 3569 return; 3570 3571 if (exec->rollbacks != NULL) { 3572 if (exec->counts != NULL) { 3573 int i; 3574 3575 for (i = 0;i < exec->maxRollbacks;i++) 3576 if (exec->rollbacks[i].counts != NULL) 3577 xmlFree(exec->rollbacks[i].counts); 3578 } 3579 xmlFree(exec->rollbacks); 3580 } 3581 if (exec->counts != NULL) 3582 xmlFree(exec->counts); 3583 if (exec->inputStack != NULL) { 3584 int i; 3585 3586 for (i = 0;i < exec->inputStackNr;i++) { 3587 if (exec->inputStack[i].value != NULL) 3588 xmlFree(exec->inputStack[i].value); 3589 } 3590 xmlFree(exec->inputStack); 3591 } 3592 if (exec->errString != NULL) 3593 xmlFree(exec->errString); 3594 xmlFree(exec); 3595 } 3596 3597 static void 3598 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, 3599 void *data) { 3600 #ifdef DEBUG_PUSH 3601 printf("saving value: %d:%s\n", exec->inputStackNr, value); 3602 #endif 3603 if (exec->inputStackMax == 0) { 3604 exec->inputStackMax = 4; 3605 exec->inputStack = (xmlRegInputTokenPtr) 3606 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken)); 3607 if (exec->inputStack == NULL) { 3608 xmlRegexpErrMemory(NULL, "pushing input string"); 3609 exec->inputStackMax = 0; 3610 return; 3611 } 3612 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) { 3613 xmlRegInputTokenPtr tmp; 3614 3615 exec->inputStackMax *= 2; 3616 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack, 3617 exec->inputStackMax * sizeof(xmlRegInputToken)); 3618 if (tmp == NULL) { 3619 xmlRegexpErrMemory(NULL, "pushing input string"); 3620 exec->inputStackMax /= 2; 3621 return; 3622 } 3623 exec->inputStack = tmp; 3624 } 3625 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value); 3626 exec->inputStack[exec->inputStackNr].data = data; 3627 exec->inputStackNr++; 3628 exec->inputStack[exec->inputStackNr].value = NULL; 3629 exec->inputStack[exec->inputStackNr].data = NULL; 3630 } 3631 3632 /** 3633 * xmlRegStrEqualWildcard: 3634 * @expStr: the string to be evaluated 3635 * @valStr: the validation string 3636 * 3637 * Checks if both strings are equal or have the same content. "*" 3638 * can be used as a wildcard in @valStr; "|" is used as a seperator of 3639 * substrings in both @expStr and @valStr. 3640 * 3641 * Returns 1 if the comparison is satisfied and the number of substrings 3642 * is equal, 0 otherwise. 3643 */ 3644 3645 static int 3646 xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) { 3647 if (expStr == valStr) return(1); 3648 if (expStr == NULL) return(0); 3649 if (valStr == NULL) return(0); 3650 do { 3651 /* 3652 * Eval if we have a wildcard for the current item. 3653 */ 3654 if (*expStr != *valStr) { 3655 /* if one of them starts with a wildcard make valStr be it */ 3656 if (*valStr == '*') { 3657 const xmlChar *tmp; 3658 3659 tmp = valStr; 3660 valStr = expStr; 3661 expStr = tmp; 3662 } 3663 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) { 3664 do { 3665 if (*valStr == XML_REG_STRING_SEPARATOR) 3666 break; 3667 valStr++; 3668 } while (*valStr != 0); 3669 continue; 3670 } else 3671 return(0); 3672 } 3673 expStr++; 3674 valStr++; 3675 } while (*valStr != 0); 3676 if (*expStr != 0) 3677 return (0); 3678 else 3679 return (1); 3680 } 3681 3682 /** 3683 * xmlRegCompactPushString: 3684 * @exec: a regexp execution context 3685 * @comp: the precompiled exec with a compact table 3686 * @value: a string token input 3687 * @data: data associated to the token to reuse in callbacks 3688 * 3689 * Push one input token in the execution context 3690 * 3691 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3692 * a negative value in case of error. 3693 */ 3694 static int 3695 xmlRegCompactPushString(xmlRegExecCtxtPtr exec, 3696 xmlRegexpPtr comp, 3697 const xmlChar *value, 3698 void *data) { 3699 int state = exec->index; 3700 int i, target; 3701 3702 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL)) 3703 return(-1); 3704 3705 if (value == NULL) { 3706 /* 3707 * are we at a final state ? 3708 */ 3709 if (comp->compact[state * (comp->nbstrings + 1)] == 3710 XML_REGEXP_FINAL_STATE) 3711 return(1); 3712 return(0); 3713 } 3714 3715 #ifdef DEBUG_PUSH 3716 printf("value pushed: %s\n", value); 3717 #endif 3718 3719 /* 3720 * Examine all outside transitions from current state 3721 */ 3722 for (i = 0;i < comp->nbstrings;i++) { 3723 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 3724 if ((target > 0) && (target <= comp->nbstates)) { 3725 target--; /* to avoid 0 */ 3726 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) { 3727 exec->index = target; 3728 if ((exec->callback != NULL) && (comp->transdata != NULL)) { 3729 exec->callback(exec->data, value, 3730 comp->transdata[state * comp->nbstrings + i], data); 3731 } 3732 #ifdef DEBUG_PUSH 3733 printf("entering state %d\n", target); 3734 #endif 3735 if (comp->compact[target * (comp->nbstrings + 1)] == 3736 XML_REGEXP_SINK_STATE) 3737 goto error; 3738 3739 if (comp->compact[target * (comp->nbstrings + 1)] == 3740 XML_REGEXP_FINAL_STATE) 3741 return(1); 3742 return(0); 3743 } 3744 } 3745 } 3746 /* 3747 * Failed to find an exit transition out from current state for the 3748 * current token 3749 */ 3750 #ifdef DEBUG_PUSH 3751 printf("failed to find a transition for %s on state %d\n", value, state); 3752 #endif 3753 error: 3754 if (exec->errString != NULL) 3755 xmlFree(exec->errString); 3756 exec->errString = xmlStrdup(value); 3757 exec->errStateNo = state; 3758 exec->status = -1; 3759 #ifdef DEBUG_ERR 3760 testerr(exec); 3761 #endif 3762 return(-1); 3763 } 3764 3765 /** 3766 * xmlRegExecPushStringInternal: 3767 * @exec: a regexp execution context or NULL to indicate the end 3768 * @value: a string token input 3769 * @data: data associated to the token to reuse in callbacks 3770 * @compound: value was assembled from 2 strings 3771 * 3772 * Push one input token in the execution context 3773 * 3774 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3775 * a negative value in case of error. 3776 */ 3777 static int 3778 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value, 3779 void *data, int compound) { 3780 xmlRegTransPtr trans; 3781 xmlRegAtomPtr atom; 3782 int ret; 3783 int final = 0; 3784 int progress = 1; 3785 3786 if (exec == NULL) 3787 return(-1); 3788 if (exec->comp == NULL) 3789 return(-1); 3790 if (exec->status != 0) 3791 return(exec->status); 3792 3793 if (exec->comp->compact != NULL) 3794 return(xmlRegCompactPushString(exec, exec->comp, value, data)); 3795 3796 if (value == NULL) { 3797 if (exec->state->type == XML_REGEXP_FINAL_STATE) 3798 return(1); 3799 final = 1; 3800 } 3801 3802 #ifdef DEBUG_PUSH 3803 printf("value pushed: %s\n", value); 3804 #endif 3805 /* 3806 * If we have an active rollback stack push the new value there 3807 * and get back to where we were left 3808 */ 3809 if ((value != NULL) && (exec->inputStackNr > 0)) { 3810 xmlFARegExecSaveInputString(exec, value, data); 3811 value = exec->inputStack[exec->index].value; 3812 data = exec->inputStack[exec->index].data; 3813 #ifdef DEBUG_PUSH 3814 printf("value loaded: %s\n", value); 3815 #endif 3816 } 3817 3818 while ((exec->status == 0) && 3819 ((value != NULL) || 3820 ((final == 1) && 3821 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3822 3823 /* 3824 * End of input on non-terminal state, rollback, however we may 3825 * still have epsilon like transition for counted transitions 3826 * on counters, in that case don't break too early. 3827 */ 3828 if ((value == NULL) && (exec->counts == NULL)) 3829 goto rollback; 3830 3831 exec->transcount = 0; 3832 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3833 trans = &exec->state->trans[exec->transno]; 3834 if (trans->to < 0) 3835 continue; 3836 atom = trans->atom; 3837 ret = 0; 3838 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 3839 int i; 3840 int count; 3841 xmlRegTransPtr t; 3842 xmlRegCounterPtr counter; 3843 3844 ret = 0; 3845 3846 #ifdef DEBUG_PUSH 3847 printf("testing all lax %d\n", trans->count); 3848 #endif 3849 /* 3850 * Check all counted transitions from the current state 3851 */ 3852 if ((value == NULL) && (final)) { 3853 ret = 1; 3854 } else if (value != NULL) { 3855 for (i = 0;i < exec->state->nbTrans;i++) { 3856 t = &exec->state->trans[i]; 3857 if ((t->counter < 0) || (t == trans)) 3858 continue; 3859 counter = &exec->comp->counters[t->counter]; 3860 count = exec->counts[t->counter]; 3861 if ((count < counter->max) && 3862 (t->atom != NULL) && 3863 (xmlStrEqual(value, t->atom->valuep))) { 3864 ret = 0; 3865 break; 3866 } 3867 if ((count >= counter->min) && 3868 (count < counter->max) && 3869 (t->atom != NULL) && 3870 (xmlStrEqual(value, t->atom->valuep))) { 3871 ret = 1; 3872 break; 3873 } 3874 } 3875 } 3876 } else if (trans->count == REGEXP_ALL_COUNTER) { 3877 int i; 3878 int count; 3879 xmlRegTransPtr t; 3880 xmlRegCounterPtr counter; 3881 3882 ret = 1; 3883 3884 #ifdef DEBUG_PUSH 3885 printf("testing all %d\n", trans->count); 3886 #endif 3887 /* 3888 * Check all counted transitions from the current state 3889 */ 3890 for (i = 0;i < exec->state->nbTrans;i++) { 3891 t = &exec->state->trans[i]; 3892 if ((t->counter < 0) || (t == trans)) 3893 continue; 3894 counter = &exec->comp->counters[t->counter]; 3895 count = exec->counts[t->counter]; 3896 if ((count < counter->min) || (count > counter->max)) { 3897 ret = 0; 3898 break; 3899 } 3900 } 3901 } else if (trans->count >= 0) { 3902 int count; 3903 xmlRegCounterPtr counter; 3904 3905 /* 3906 * A counted transition. 3907 */ 3908 3909 count = exec->counts[trans->count]; 3910 counter = &exec->comp->counters[trans->count]; 3911 #ifdef DEBUG_PUSH 3912 printf("testing count %d: val %d, min %d, max %d\n", 3913 trans->count, count, counter->min, counter->max); 3914 #endif 3915 ret = ((count >= counter->min) && (count <= counter->max)); 3916 } else if (atom == NULL) { 3917 fprintf(stderr, "epsilon transition left at runtime\n"); 3918 exec->status = -2; 3919 break; 3920 } else if (value != NULL) { 3921 ret = xmlRegStrEqualWildcard(atom->valuep, value); 3922 if (atom->neg) { 3923 ret = !ret; 3924 if (!compound) 3925 ret = 0; 3926 } 3927 if ((ret == 1) && (trans->counter >= 0)) { 3928 xmlRegCounterPtr counter; 3929 int count; 3930 3931 count = exec->counts[trans->counter]; 3932 counter = &exec->comp->counters[trans->counter]; 3933 if (count >= counter->max) 3934 ret = 0; 3935 } 3936 3937 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 3938 xmlRegStatePtr to = exec->comp->states[trans->to]; 3939 3940 /* 3941 * this is a multiple input sequence 3942 */ 3943 if (exec->state->nbTrans > exec->transno + 1) { 3944 if (exec->inputStackNr <= 0) { 3945 xmlFARegExecSaveInputString(exec, value, data); 3946 } 3947 xmlFARegExecSave(exec); 3948 } 3949 exec->transcount = 1; 3950 do { 3951 /* 3952 * Try to progress as much as possible on the input 3953 */ 3954 if (exec->transcount == atom->max) { 3955 break; 3956 } 3957 exec->index++; 3958 value = exec->inputStack[exec->index].value; 3959 data = exec->inputStack[exec->index].data; 3960 #ifdef DEBUG_PUSH 3961 printf("value loaded: %s\n", value); 3962 #endif 3963 3964 /* 3965 * End of input: stop here 3966 */ 3967 if (value == NULL) { 3968 exec->index --; 3969 break; 3970 } 3971 if (exec->transcount >= atom->min) { 3972 int transno = exec->transno; 3973 xmlRegStatePtr state = exec->state; 3974 3975 /* 3976 * The transition is acceptable save it 3977 */ 3978 exec->transno = -1; /* trick */ 3979 exec->state = to; 3980 if (exec->inputStackNr <= 0) { 3981 xmlFARegExecSaveInputString(exec, value, data); 3982 } 3983 xmlFARegExecSave(exec); 3984 exec->transno = transno; 3985 exec->state = state; 3986 } 3987 ret = xmlStrEqual(value, atom->valuep); 3988 exec->transcount++; 3989 } while (ret == 1); 3990 if (exec->transcount < atom->min) 3991 ret = 0; 3992 3993 /* 3994 * If the last check failed but one transition was found 3995 * possible, rollback 3996 */ 3997 if (ret < 0) 3998 ret = 0; 3999 if (ret == 0) { 4000 goto rollback; 4001 } 4002 } 4003 } 4004 if (ret == 1) { 4005 if ((exec->callback != NULL) && (atom != NULL) && 4006 (data != NULL)) { 4007 exec->callback(exec->data, atom->valuep, 4008 atom->data, data); 4009 } 4010 if (exec->state->nbTrans > exec->transno + 1) { 4011 if (exec->inputStackNr <= 0) { 4012 xmlFARegExecSaveInputString(exec, value, data); 4013 } 4014 xmlFARegExecSave(exec); 4015 } 4016 if (trans->counter >= 0) { 4017 #ifdef DEBUG_PUSH 4018 printf("Increasing count %d\n", trans->counter); 4019 #endif 4020 exec->counts[trans->counter]++; 4021 } 4022 if ((trans->count >= 0) && 4023 (trans->count < REGEXP_ALL_COUNTER)) { 4024 #ifdef DEBUG_REGEXP_EXEC 4025 printf("resetting count %d on transition\n", 4026 trans->count); 4027 #endif 4028 exec->counts[trans->count] = 0; 4029 } 4030 #ifdef DEBUG_PUSH 4031 printf("entering state %d\n", trans->to); 4032 #endif 4033 if ((exec->comp->states[trans->to] != NULL) && 4034 (exec->comp->states[trans->to]->type == 4035 XML_REGEXP_SINK_STATE)) { 4036 /* 4037 * entering a sink state, save the current state as error 4038 * state. 4039 */ 4040 if (exec->errString != NULL) 4041 xmlFree(exec->errString); 4042 exec->errString = xmlStrdup(value); 4043 exec->errState = exec->state; 4044 memcpy(exec->errCounts, exec->counts, 4045 exec->comp->nbCounters * sizeof(int)); 4046 } 4047 exec->state = exec->comp->states[trans->to]; 4048 exec->transno = 0; 4049 if (trans->atom != NULL) { 4050 if (exec->inputStack != NULL) { 4051 exec->index++; 4052 if (exec->index < exec->inputStackNr) { 4053 value = exec->inputStack[exec->index].value; 4054 data = exec->inputStack[exec->index].data; 4055 #ifdef DEBUG_PUSH 4056 printf("value loaded: %s\n", value); 4057 #endif 4058 } else { 4059 value = NULL; 4060 data = NULL; 4061 #ifdef DEBUG_PUSH 4062 printf("end of input\n"); 4063 #endif 4064 } 4065 } else { 4066 value = NULL; 4067 data = NULL; 4068 #ifdef DEBUG_PUSH 4069 printf("end of input\n"); 4070 #endif 4071 } 4072 } 4073 goto progress; 4074 } else if (ret < 0) { 4075 exec->status = -4; 4076 break; 4077 } 4078 } 4079 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4080 rollback: 4081 /* 4082 * if we didn't yet rollback on the current input 4083 * store the current state as the error state. 4084 */ 4085 if ((progress) && (exec->state != NULL) && 4086 (exec->state->type != XML_REGEXP_SINK_STATE)) { 4087 progress = 0; 4088 if (exec->errString != NULL) 4089 xmlFree(exec->errString); 4090 exec->errString = xmlStrdup(value); 4091 exec->errState = exec->state; 4092 memcpy(exec->errCounts, exec->counts, 4093 exec->comp->nbCounters * sizeof(int)); 4094 } 4095 4096 /* 4097 * Failed to find a way out 4098 */ 4099 exec->determinist = 0; 4100 xmlFARegExecRollBack(exec); 4101 if ((exec->inputStack != NULL ) && (exec->status == 0)) { 4102 value = exec->inputStack[exec->index].value; 4103 data = exec->inputStack[exec->index].data; 4104 #ifdef DEBUG_PUSH 4105 printf("value loaded: %s\n", value); 4106 #endif 4107 } 4108 } 4109 continue; 4110 progress: 4111 progress = 1; 4112 continue; 4113 } 4114 if (exec->status == 0) { 4115 return(exec->state->type == XML_REGEXP_FINAL_STATE); 4116 } 4117 #ifdef DEBUG_ERR 4118 if (exec->status < 0) { 4119 testerr(exec); 4120 } 4121 #endif 4122 return(exec->status); 4123 } 4124 4125 /** 4126 * xmlRegExecPushString: 4127 * @exec: a regexp execution context or NULL to indicate the end 4128 * @value: a string token input 4129 * @data: data associated to the token to reuse in callbacks 4130 * 4131 * Push one input token in the execution context 4132 * 4133 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 4134 * a negative value in case of error. 4135 */ 4136 int 4137 xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, 4138 void *data) { 4139 return(xmlRegExecPushStringInternal(exec, value, data, 0)); 4140 } 4141 4142 /** 4143 * xmlRegExecPushString2: 4144 * @exec: a regexp execution context or NULL to indicate the end 4145 * @value: the first string token input 4146 * @value2: the second string token input 4147 * @data: data associated to the token to reuse in callbacks 4148 * 4149 * Push one input token in the execution context 4150 * 4151 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 4152 * a negative value in case of error. 4153 */ 4154 int 4155 xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value, 4156 const xmlChar *value2, void *data) { 4157 xmlChar buf[150]; 4158 int lenn, lenp, ret; 4159 xmlChar *str; 4160 4161 if (exec == NULL) 4162 return(-1); 4163 if (exec->comp == NULL) 4164 return(-1); 4165 if (exec->status != 0) 4166 return(exec->status); 4167 4168 if (value2 == NULL) 4169 return(xmlRegExecPushString(exec, value, data)); 4170 4171 lenn = strlen((char *) value2); 4172 lenp = strlen((char *) value); 4173 4174 if (150 < lenn + lenp + 2) { 4175 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 4176 if (str == NULL) { 4177 exec->status = -1; 4178 return(-1); 4179 } 4180 } else { 4181 str = buf; 4182 } 4183 memcpy(&str[0], value, lenp); 4184 str[lenp] = XML_REG_STRING_SEPARATOR; 4185 memcpy(&str[lenp + 1], value2, lenn); 4186 str[lenn + lenp + 1] = 0; 4187 4188 if (exec->comp->compact != NULL) 4189 ret = xmlRegCompactPushString(exec, exec->comp, str, data); 4190 else 4191 ret = xmlRegExecPushStringInternal(exec, str, data, 1); 4192 4193 if (str != buf) 4194 xmlFree(str); 4195 return(ret); 4196 } 4197 4198 /** 4199 * xmlRegExecGetValues: 4200 * @exec: a regexp execution context 4201 * @err: error extraction or normal one 4202 * @nbval: pointer to the number of accepted values IN/OUT 4203 * @nbneg: return number of negative transitions 4204 * @values: pointer to the array of acceptable values 4205 * @terminal: return value if this was a terminal state 4206 * 4207 * Extract informations from the regexp execution, internal routine to 4208 * implement xmlRegExecNextValues() and xmlRegExecErrInfo() 4209 * 4210 * Returns: 0 in case of success or -1 in case of error. 4211 */ 4212 static int 4213 xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err, 4214 int *nbval, int *nbneg, 4215 xmlChar **values, int *terminal) { 4216 int maxval; 4217 int nb = 0; 4218 4219 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) || 4220 (values == NULL) || (*nbval <= 0)) 4221 return(-1); 4222 4223 maxval = *nbval; 4224 *nbval = 0; 4225 *nbneg = 0; 4226 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) { 4227 xmlRegexpPtr comp; 4228 int target, i, state; 4229 4230 comp = exec->comp; 4231 4232 if (err) { 4233 if (exec->errStateNo == -1) return(-1); 4234 state = exec->errStateNo; 4235 } else { 4236 state = exec->index; 4237 } 4238 if (terminal != NULL) { 4239 if (comp->compact[state * (comp->nbstrings + 1)] == 4240 XML_REGEXP_FINAL_STATE) 4241 *terminal = 1; 4242 else 4243 *terminal = 0; 4244 } 4245 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4246 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4247 if ((target > 0) && (target <= comp->nbstates) && 4248 (comp->compact[(target - 1) * (comp->nbstrings + 1)] != 4249 XML_REGEXP_SINK_STATE)) { 4250 values[nb++] = comp->stringMap[i]; 4251 (*nbval)++; 4252 } 4253 } 4254 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4255 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4256 if ((target > 0) && (target <= comp->nbstates) && 4257 (comp->compact[(target - 1) * (comp->nbstrings + 1)] == 4258 XML_REGEXP_SINK_STATE)) { 4259 values[nb++] = comp->stringMap[i]; 4260 (*nbneg)++; 4261 } 4262 } 4263 } else { 4264 int transno; 4265 xmlRegTransPtr trans; 4266 xmlRegAtomPtr atom; 4267 xmlRegStatePtr state; 4268 4269 if (terminal != NULL) { 4270 if (exec->state->type == XML_REGEXP_FINAL_STATE) 4271 *terminal = 1; 4272 else 4273 *terminal = 0; 4274 } 4275 4276 if (err) { 4277 if (exec->errState == NULL) return(-1); 4278 state = exec->errState; 4279 } else { 4280 if (exec->state == NULL) return(-1); 4281 state = exec->state; 4282 } 4283 for (transno = 0; 4284 (transno < state->nbTrans) && (nb < maxval); 4285 transno++) { 4286 trans = &state->trans[transno]; 4287 if (trans->to < 0) 4288 continue; 4289 atom = trans->atom; 4290 if ((atom == NULL) || (atom->valuep == NULL)) 4291 continue; 4292 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4293 /* this should not be reached but ... */ 4294 TODO; 4295 } else if (trans->count == REGEXP_ALL_COUNTER) { 4296 /* this should not be reached but ... */ 4297 TODO; 4298 } else if (trans->counter >= 0) { 4299 xmlRegCounterPtr counter = NULL; 4300 int count; 4301 4302 if (err) 4303 count = exec->errCounts[trans->counter]; 4304 else 4305 count = exec->counts[trans->counter]; 4306 if (exec->comp != NULL) 4307 counter = &exec->comp->counters[trans->counter]; 4308 if ((counter == NULL) || (count < counter->max)) { 4309 if (atom->neg) 4310 values[nb++] = (xmlChar *) atom->valuep2; 4311 else 4312 values[nb++] = (xmlChar *) atom->valuep; 4313 (*nbval)++; 4314 } 4315 } else { 4316 if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) && 4317 (exec->comp->states[trans->to]->type != 4318 XML_REGEXP_SINK_STATE)) { 4319 if (atom->neg) 4320 values[nb++] = (xmlChar *) atom->valuep2; 4321 else 4322 values[nb++] = (xmlChar *) atom->valuep; 4323 (*nbval)++; 4324 } 4325 } 4326 } 4327 for (transno = 0; 4328 (transno < state->nbTrans) && (nb < maxval); 4329 transno++) { 4330 trans = &state->trans[transno]; 4331 if (trans->to < 0) 4332 continue; 4333 atom = trans->atom; 4334 if ((atom == NULL) || (atom->valuep == NULL)) 4335 continue; 4336 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4337 continue; 4338 } else if (trans->count == REGEXP_ALL_COUNTER) { 4339 continue; 4340 } else if (trans->counter >= 0) { 4341 continue; 4342 } else { 4343 if ((exec->comp->states[trans->to] != NULL) && 4344 (exec->comp->states[trans->to]->type == 4345 XML_REGEXP_SINK_STATE)) { 4346 if (atom->neg) 4347 values[nb++] = (xmlChar *) atom->valuep2; 4348 else 4349 values[nb++] = (xmlChar *) atom->valuep; 4350 (*nbneg)++; 4351 } 4352 } 4353 } 4354 } 4355 return(0); 4356 } 4357 4358 /** 4359 * xmlRegExecNextValues: 4360 * @exec: a regexp execution context 4361 * @nbval: pointer to the number of accepted values IN/OUT 4362 * @nbneg: return number of negative transitions 4363 * @values: pointer to the array of acceptable values 4364 * @terminal: return value if this was a terminal state 4365 * 4366 * Extract informations from the regexp execution, 4367 * the parameter @values must point to an array of @nbval string pointers 4368 * on return nbval will contain the number of possible strings in that 4369 * state and the @values array will be updated with them. The string values 4370 * returned will be freed with the @exec context and don't need to be 4371 * deallocated. 4372 * 4373 * Returns: 0 in case of success or -1 in case of error. 4374 */ 4375 int 4376 xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg, 4377 xmlChar **values, int *terminal) { 4378 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal)); 4379 } 4380 4381 /** 4382 * xmlRegExecErrInfo: 4383 * @exec: a regexp execution context generating an error 4384 * @string: return value for the error string 4385 * @nbval: pointer to the number of accepted values IN/OUT 4386 * @nbneg: return number of negative transitions 4387 * @values: pointer to the array of acceptable values 4388 * @terminal: return value if this was a terminal state 4389 * 4390 * Extract error informations from the regexp execution, the parameter 4391 * @string will be updated with the value pushed and not accepted, 4392 * the parameter @values must point to an array of @nbval string pointers 4393 * on return nbval will contain the number of possible strings in that 4394 * state and the @values array will be updated with them. The string values 4395 * returned will be freed with the @exec context and don't need to be 4396 * deallocated. 4397 * 4398 * Returns: 0 in case of success or -1 in case of error. 4399 */ 4400 int 4401 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string, 4402 int *nbval, int *nbneg, xmlChar **values, int *terminal) { 4403 if (exec == NULL) 4404 return(-1); 4405 if (string != NULL) { 4406 if (exec->status != 0) 4407 *string = exec->errString; 4408 else 4409 *string = NULL; 4410 } 4411 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal)); 4412 } 4413 4414 #ifdef DEBUG_ERR 4415 static void testerr(xmlRegExecCtxtPtr exec) { 4416 const xmlChar *string; 4417 xmlChar *values[5]; 4418 int nb = 5; 4419 int nbneg; 4420 int terminal; 4421 xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal); 4422 } 4423 #endif 4424 4425 #if 0 4426 static int 4427 xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) { 4428 xmlRegTransPtr trans; 4429 xmlRegAtomPtr atom; 4430 int ret; 4431 int codepoint, len; 4432 4433 if (exec == NULL) 4434 return(-1); 4435 if (exec->status != 0) 4436 return(exec->status); 4437 4438 while ((exec->status == 0) && 4439 ((exec->inputString[exec->index] != 0) || 4440 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 4441 4442 /* 4443 * End of input on non-terminal state, rollback, however we may 4444 * still have epsilon like transition for counted transitions 4445 * on counters, in that case don't break too early. 4446 */ 4447 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 4448 goto rollback; 4449 4450 exec->transcount = 0; 4451 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 4452 trans = &exec->state->trans[exec->transno]; 4453 if (trans->to < 0) 4454 continue; 4455 atom = trans->atom; 4456 ret = 0; 4457 if (trans->count >= 0) { 4458 int count; 4459 xmlRegCounterPtr counter; 4460 4461 /* 4462 * A counted transition. 4463 */ 4464 4465 count = exec->counts[trans->count]; 4466 counter = &exec->comp->counters[trans->count]; 4467 #ifdef DEBUG_REGEXP_EXEC 4468 printf("testing count %d: val %d, min %d, max %d\n", 4469 trans->count, count, counter->min, counter->max); 4470 #endif 4471 ret = ((count >= counter->min) && (count <= counter->max)); 4472 } else if (atom == NULL) { 4473 fprintf(stderr, "epsilon transition left at runtime\n"); 4474 exec->status = -2; 4475 break; 4476 } else if (exec->inputString[exec->index] != 0) { 4477 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 4478 ret = xmlRegCheckCharacter(atom, codepoint); 4479 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 4480 xmlRegStatePtr to = exec->comp->states[trans->to]; 4481 4482 /* 4483 * this is a multiple input sequence 4484 */ 4485 if (exec->state->nbTrans > exec->transno + 1) { 4486 xmlFARegExecSave(exec); 4487 } 4488 exec->transcount = 1; 4489 do { 4490 /* 4491 * Try to progress as much as possible on the input 4492 */ 4493 if (exec->transcount == atom->max) { 4494 break; 4495 } 4496 exec->index += len; 4497 /* 4498 * End of input: stop here 4499 */ 4500 if (exec->inputString[exec->index] == 0) { 4501 exec->index -= len; 4502 break; 4503 } 4504 if (exec->transcount >= atom->min) { 4505 int transno = exec->transno; 4506 xmlRegStatePtr state = exec->state; 4507 4508 /* 4509 * The transition is acceptable save it 4510 */ 4511 exec->transno = -1; /* trick */ 4512 exec->state = to; 4513 xmlFARegExecSave(exec); 4514 exec->transno = transno; 4515 exec->state = state; 4516 } 4517 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 4518 len); 4519 ret = xmlRegCheckCharacter(atom, codepoint); 4520 exec->transcount++; 4521 } while (ret == 1); 4522 if (exec->transcount < atom->min) 4523 ret = 0; 4524 4525 /* 4526 * If the last check failed but one transition was found 4527 * possible, rollback 4528 */ 4529 if (ret < 0) 4530 ret = 0; 4531 if (ret == 0) { 4532 goto rollback; 4533 } 4534 } 4535 } 4536 if (ret == 1) { 4537 if (exec->state->nbTrans > exec->transno + 1) { 4538 xmlFARegExecSave(exec); 4539 } 4540 /* 4541 * restart count for expressions like this ((abc){2})* 4542 */ 4543 if (trans->count >= 0) { 4544 #ifdef DEBUG_REGEXP_EXEC 4545 printf("Reset count %d\n", trans->count); 4546 #endif 4547 exec->counts[trans->count] = 0; 4548 } 4549 if (trans->counter >= 0) { 4550 #ifdef DEBUG_REGEXP_EXEC 4551 printf("Increasing count %d\n", trans->counter); 4552 #endif 4553 exec->counts[trans->counter]++; 4554 } 4555 #ifdef DEBUG_REGEXP_EXEC 4556 printf("entering state %d\n", trans->to); 4557 #endif 4558 exec->state = exec->comp->states[trans->to]; 4559 exec->transno = 0; 4560 if (trans->atom != NULL) { 4561 exec->index += len; 4562 } 4563 goto progress; 4564 } else if (ret < 0) { 4565 exec->status = -4; 4566 break; 4567 } 4568 } 4569 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4570 rollback: 4571 /* 4572 * Failed to find a way out 4573 */ 4574 exec->determinist = 0; 4575 xmlFARegExecRollBack(exec); 4576 } 4577 progress: 4578 continue; 4579 } 4580 } 4581 #endif 4582 /************************************************************************ 4583 * * 4584 * Parser for the Schemas Datatype Regular Expressions * 4585 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs * 4586 * * 4587 ************************************************************************/ 4588 4589 /** 4590 * xmlFAIsChar: 4591 * @ctxt: a regexp parser context 4592 * 4593 * [10] Char ::= [^.\?*+()|#x5B#x5D] 4594 */ 4595 static int 4596 xmlFAIsChar(xmlRegParserCtxtPtr ctxt) { 4597 int cur; 4598 int len; 4599 4600 cur = CUR_SCHAR(ctxt->cur, len); 4601 if ((cur == '.') || (cur == '\\') || (cur == '?') || 4602 (cur == '*') || (cur == '+') || (cur == '(') || 4603 (cur == ')') || (cur == '|') || (cur == 0x5B) || 4604 (cur == 0x5D) || (cur == 0)) 4605 return(-1); 4606 return(cur); 4607 } 4608 4609 /** 4610 * xmlFAParseCharProp: 4611 * @ctxt: a regexp parser context 4612 * 4613 * [27] charProp ::= IsCategory | IsBlock 4614 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation | 4615 * Separators | Symbols | Others 4616 * [29] Letters ::= 'L' [ultmo]? 4617 * [30] Marks ::= 'M' [nce]? 4618 * [31] Numbers ::= 'N' [dlo]? 4619 * [32] Punctuation ::= 'P' [cdseifo]? 4620 * [33] Separators ::= 'Z' [slp]? 4621 * [34] Symbols ::= 'S' [mcko]? 4622 * [35] Others ::= 'C' [cfon]? 4623 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+ 4624 */ 4625 static void 4626 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { 4627 int cur; 4628 xmlRegAtomType type = (xmlRegAtomType) 0; 4629 xmlChar *blockName = NULL; 4630 4631 cur = CUR; 4632 if (cur == 'L') { 4633 NEXT; 4634 cur = CUR; 4635 if (cur == 'u') { 4636 NEXT; 4637 type = XML_REGEXP_LETTER_UPPERCASE; 4638 } else if (cur == 'l') { 4639 NEXT; 4640 type = XML_REGEXP_LETTER_LOWERCASE; 4641 } else if (cur == 't') { 4642 NEXT; 4643 type = XML_REGEXP_LETTER_TITLECASE; 4644 } else if (cur == 'm') { 4645 NEXT; 4646 type = XML_REGEXP_LETTER_MODIFIER; 4647 } else if (cur == 'o') { 4648 NEXT; 4649 type = XML_REGEXP_LETTER_OTHERS; 4650 } else { 4651 type = XML_REGEXP_LETTER; 4652 } 4653 } else if (cur == 'M') { 4654 NEXT; 4655 cur = CUR; 4656 if (cur == 'n') { 4657 NEXT; 4658 /* nonspacing */ 4659 type = XML_REGEXP_MARK_NONSPACING; 4660 } else if (cur == 'c') { 4661 NEXT; 4662 /* spacing combining */ 4663 type = XML_REGEXP_MARK_SPACECOMBINING; 4664 } else if (cur == 'e') { 4665 NEXT; 4666 /* enclosing */ 4667 type = XML_REGEXP_MARK_ENCLOSING; 4668 } else { 4669 /* all marks */ 4670 type = XML_REGEXP_MARK; 4671 } 4672 } else if (cur == 'N') { 4673 NEXT; 4674 cur = CUR; 4675 if (cur == 'd') { 4676 NEXT; 4677 /* digital */ 4678 type = XML_REGEXP_NUMBER_DECIMAL; 4679 } else if (cur == 'l') { 4680 NEXT; 4681 /* letter */ 4682 type = XML_REGEXP_NUMBER_LETTER; 4683 } else if (cur == 'o') { 4684 NEXT; 4685 /* other */ 4686 type = XML_REGEXP_NUMBER_OTHERS; 4687 } else { 4688 /* all numbers */ 4689 type = XML_REGEXP_NUMBER; 4690 } 4691 } else if (cur == 'P') { 4692 NEXT; 4693 cur = CUR; 4694 if (cur == 'c') { 4695 NEXT; 4696 /* connector */ 4697 type = XML_REGEXP_PUNCT_CONNECTOR; 4698 } else if (cur == 'd') { 4699 NEXT; 4700 /* dash */ 4701 type = XML_REGEXP_PUNCT_DASH; 4702 } else if (cur == 's') { 4703 NEXT; 4704 /* open */ 4705 type = XML_REGEXP_PUNCT_OPEN; 4706 } else if (cur == 'e') { 4707 NEXT; 4708 /* close */ 4709 type = XML_REGEXP_PUNCT_CLOSE; 4710 } else if (cur == 'i') { 4711 NEXT; 4712 /* initial quote */ 4713 type = XML_REGEXP_PUNCT_INITQUOTE; 4714 } else if (cur == 'f') { 4715 NEXT; 4716 /* final quote */ 4717 type = XML_REGEXP_PUNCT_FINQUOTE; 4718 } else if (cur == 'o') { 4719 NEXT; 4720 /* other */ 4721 type = XML_REGEXP_PUNCT_OTHERS; 4722 } else { 4723 /* all punctuation */ 4724 type = XML_REGEXP_PUNCT; 4725 } 4726 } else if (cur == 'Z') { 4727 NEXT; 4728 cur = CUR; 4729 if (cur == 's') { 4730 NEXT; 4731 /* space */ 4732 type = XML_REGEXP_SEPAR_SPACE; 4733 } else if (cur == 'l') { 4734 NEXT; 4735 /* line */ 4736 type = XML_REGEXP_SEPAR_LINE; 4737 } else if (cur == 'p') { 4738 NEXT; 4739 /* paragraph */ 4740 type = XML_REGEXP_SEPAR_PARA; 4741 } else { 4742 /* all separators */ 4743 type = XML_REGEXP_SEPAR; 4744 } 4745 } else if (cur == 'S') { 4746 NEXT; 4747 cur = CUR; 4748 if (cur == 'm') { 4749 NEXT; 4750 type = XML_REGEXP_SYMBOL_MATH; 4751 /* math */ 4752 } else if (cur == 'c') { 4753 NEXT; 4754 type = XML_REGEXP_SYMBOL_CURRENCY; 4755 /* currency */ 4756 } else if (cur == 'k') { 4757 NEXT; 4758 type = XML_REGEXP_SYMBOL_MODIFIER; 4759 /* modifiers */ 4760 } else if (cur == 'o') { 4761 NEXT; 4762 type = XML_REGEXP_SYMBOL_OTHERS; 4763 /* other */ 4764 } else { 4765 /* all symbols */ 4766 type = XML_REGEXP_SYMBOL; 4767 } 4768 } else if (cur == 'C') { 4769 NEXT; 4770 cur = CUR; 4771 if (cur == 'c') { 4772 NEXT; 4773 /* control */ 4774 type = XML_REGEXP_OTHER_CONTROL; 4775 } else if (cur == 'f') { 4776 NEXT; 4777 /* format */ 4778 type = XML_REGEXP_OTHER_FORMAT; 4779 } else if (cur == 'o') { 4780 NEXT; 4781 /* private use */ 4782 type = XML_REGEXP_OTHER_PRIVATE; 4783 } else if (cur == 'n') { 4784 NEXT; 4785 /* not assigned */ 4786 type = XML_REGEXP_OTHER_NA; 4787 } else { 4788 /* all others */ 4789 type = XML_REGEXP_OTHER; 4790 } 4791 } else if (cur == 'I') { 4792 const xmlChar *start; 4793 NEXT; 4794 cur = CUR; 4795 if (cur != 's') { 4796 ERROR("IsXXXX expected"); 4797 return; 4798 } 4799 NEXT; 4800 start = ctxt->cur; 4801 cur = CUR; 4802 if (((cur >= 'a') && (cur <= 'z')) || 4803 ((cur >= 'A') && (cur <= 'Z')) || 4804 ((cur >= '0') && (cur <= '9')) || 4805 (cur == 0x2D)) { 4806 NEXT; 4807 cur = CUR; 4808 while (((cur >= 'a') && (cur <= 'z')) || 4809 ((cur >= 'A') && (cur <= 'Z')) || 4810 ((cur >= '0') && (cur <= '9')) || 4811 (cur == 0x2D)) { 4812 NEXT; 4813 cur = CUR; 4814 } 4815 } 4816 type = XML_REGEXP_BLOCK_NAME; 4817 blockName = xmlStrndup(start, ctxt->cur - start); 4818 } else { 4819 ERROR("Unknown char property"); 4820 return; 4821 } 4822 if (ctxt->atom == NULL) { 4823 ctxt->atom = xmlRegNewAtom(ctxt, type); 4824 if (ctxt->atom != NULL) 4825 ctxt->atom->valuep = blockName; 4826 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4827 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4828 type, 0, 0, blockName); 4829 } 4830 } 4831 4832 /** 4833 * xmlFAParseCharClassEsc: 4834 * @ctxt: a regexp parser context 4835 * 4836 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc ) 4837 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] 4838 * [25] catEsc ::= '\p{' charProp '}' 4839 * [26] complEsc ::= '\P{' charProp '}' 4840 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW]) 4841 */ 4842 static void 4843 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { 4844 int cur; 4845 4846 if (CUR == '.') { 4847 if (ctxt->atom == NULL) { 4848 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR); 4849 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4850 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4851 XML_REGEXP_ANYCHAR, 0, 0, NULL); 4852 } 4853 NEXT; 4854 return; 4855 } 4856 if (CUR != '\\') { 4857 ERROR("Escaped sequence: expecting \\"); 4858 return; 4859 } 4860 NEXT; 4861 cur = CUR; 4862 if (cur == 'p') { 4863 NEXT; 4864 if (CUR != '{') { 4865 ERROR("Expecting '{'"); 4866 return; 4867 } 4868 NEXT; 4869 xmlFAParseCharProp(ctxt); 4870 if (CUR != '}') { 4871 ERROR("Expecting '}'"); 4872 return; 4873 } 4874 NEXT; 4875 } else if (cur == 'P') { 4876 NEXT; 4877 if (CUR != '{') { 4878 ERROR("Expecting '{'"); 4879 return; 4880 } 4881 NEXT; 4882 xmlFAParseCharProp(ctxt); 4883 ctxt->atom->neg = 1; 4884 if (CUR != '}') { 4885 ERROR("Expecting '}'"); 4886 return; 4887 } 4888 NEXT; 4889 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') || 4890 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') || 4891 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') || 4892 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) || 4893 (cur == 0x5E)) { 4894 if (ctxt->atom == NULL) { 4895 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 4896 if (ctxt->atom != NULL) { 4897 switch (cur) { 4898 case 'n': 4899 ctxt->atom->codepoint = '\n'; 4900 break; 4901 case 'r': 4902 ctxt->atom->codepoint = '\r'; 4903 break; 4904 case 't': 4905 ctxt->atom->codepoint = '\t'; 4906 break; 4907 default: 4908 ctxt->atom->codepoint = cur; 4909 } 4910 } 4911 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4912 switch (cur) { 4913 case 'n': 4914 cur = '\n'; 4915 break; 4916 case 'r': 4917 cur = '\r'; 4918 break; 4919 case 't': 4920 cur = '\t'; 4921 break; 4922 } 4923 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4924 XML_REGEXP_CHARVAL, cur, cur, NULL); 4925 } 4926 NEXT; 4927 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') || 4928 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') || 4929 (cur == 'w') || (cur == 'W')) { 4930 xmlRegAtomType type = XML_REGEXP_ANYSPACE; 4931 4932 switch (cur) { 4933 case 's': 4934 type = XML_REGEXP_ANYSPACE; 4935 break; 4936 case 'S': 4937 type = XML_REGEXP_NOTSPACE; 4938 break; 4939 case 'i': 4940 type = XML_REGEXP_INITNAME; 4941 break; 4942 case 'I': 4943 type = XML_REGEXP_NOTINITNAME; 4944 break; 4945 case 'c': 4946 type = XML_REGEXP_NAMECHAR; 4947 break; 4948 case 'C': 4949 type = XML_REGEXP_NOTNAMECHAR; 4950 break; 4951 case 'd': 4952 type = XML_REGEXP_DECIMAL; 4953 break; 4954 case 'D': 4955 type = XML_REGEXP_NOTDECIMAL; 4956 break; 4957 case 'w': 4958 type = XML_REGEXP_REALCHAR; 4959 break; 4960 case 'W': 4961 type = XML_REGEXP_NOTREALCHAR; 4962 break; 4963 } 4964 NEXT; 4965 if (ctxt->atom == NULL) { 4966 ctxt->atom = xmlRegNewAtom(ctxt, type); 4967 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4968 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4969 type, 0, 0, NULL); 4970 } 4971 } else { 4972 ERROR("Wrong escape sequence, misuse of character '\\'"); 4973 } 4974 } 4975 4976 /** 4977 * xmlFAParseCharRange: 4978 * @ctxt: a regexp parser context 4979 * 4980 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash 4981 * [18] seRange ::= charOrEsc '-' charOrEsc 4982 * [20] charOrEsc ::= XmlChar | SingleCharEsc 4983 * [21] XmlChar ::= [^\#x2D#x5B#x5D] 4984 * [22] XmlCharIncDash ::= [^\#x5B#x5D] 4985 */ 4986 static void 4987 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { 4988 int cur, len; 4989 int start = -1; 4990 int end = -1; 4991 4992 if (CUR == '\0') { 4993 ERROR("Expecting ']'"); 4994 return; 4995 } 4996 4997 cur = CUR; 4998 if (cur == '\\') { 4999 NEXT; 5000 cur = CUR; 5001 switch (cur) { 5002 case 'n': start = 0xA; break; 5003 case 'r': start = 0xD; break; 5004 case 't': start = 0x9; break; 5005 case '\\': case '|': case '.': case '-': case '^': case '?': 5006 case '*': case '+': case '{': case '}': case '(': case ')': 5007 case '[': case ']': 5008 start = cur; break; 5009 default: 5010 ERROR("Invalid escape value"); 5011 return; 5012 } 5013 end = start; 5014 len = 1; 5015 } else if ((cur != 0x5B) && (cur != 0x5D)) { 5016 end = start = CUR_SCHAR(ctxt->cur, len); 5017 } else { 5018 ERROR("Expecting a char range"); 5019 return; 5020 } 5021 /* 5022 * Since we are "inside" a range, we can assume ctxt->cur is past 5023 * the start of ctxt->string, and PREV should be safe 5024 */ 5025 if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) { 5026 NEXTL(len); 5027 return; 5028 } 5029 NEXTL(len); 5030 cur = CUR; 5031 if ((cur != '-') || (NXT(1) == ']')) { 5032 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 5033 XML_REGEXP_CHARVAL, start, end, NULL); 5034 return; 5035 } 5036 NEXT; 5037 cur = CUR; 5038 if (cur == '\\') { 5039 NEXT; 5040 cur = CUR; 5041 switch (cur) { 5042 case 'n': end = 0xA; break; 5043 case 'r': end = 0xD; break; 5044 case 't': end = 0x9; break; 5045 case '\\': case '|': case '.': case '-': case '^': case '?': 5046 case '*': case '+': case '{': case '}': case '(': case ')': 5047 case '[': case ']': 5048 end = cur; break; 5049 default: 5050 ERROR("Invalid escape value"); 5051 return; 5052 } 5053 len = 1; 5054 } else if ((cur != 0x5B) && (cur != 0x5D)) { 5055 end = CUR_SCHAR(ctxt->cur, len); 5056 } else { 5057 ERROR("Expecting the end of a char range"); 5058 return; 5059 } 5060 5061 /* TODO check that the values are acceptable character ranges for XML */ 5062 if (end < start) { 5063 ERROR("End of range is before start of range"); 5064 } else { 5065 NEXTL(len); 5066 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 5067 XML_REGEXP_CHARVAL, start, end, NULL); 5068 } 5069 return; 5070 } 5071 5072 /** 5073 * xmlFAParsePosCharGroup: 5074 * @ctxt: a regexp parser context 5075 * 5076 * [14] posCharGroup ::= ( charRange | charClassEsc )+ 5077 */ 5078 static void 5079 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) { 5080 do { 5081 if (CUR == '\\') { 5082 xmlFAParseCharClassEsc(ctxt); 5083 } else { 5084 xmlFAParseCharRange(ctxt); 5085 } 5086 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') && 5087 (CUR != 0) && (ctxt->error == 0)); 5088 } 5089 5090 /** 5091 * xmlFAParseCharGroup: 5092 * @ctxt: a regexp parser context 5093 * 5094 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub 5095 * [15] negCharGroup ::= '^' posCharGroup 5096 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr 5097 * [12] charClassExpr ::= '[' charGroup ']' 5098 */ 5099 static void 5100 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) { 5101 int n = ctxt->neg; 5102 while ((CUR != ']') && (ctxt->error == 0)) { 5103 if (CUR == '^') { 5104 int neg = ctxt->neg; 5105 5106 NEXT; 5107 ctxt->neg = !ctxt->neg; 5108 xmlFAParsePosCharGroup(ctxt); 5109 ctxt->neg = neg; 5110 } else if ((CUR == '-') && (NXT(1) == '[')) { 5111 int neg = ctxt->neg; 5112 ctxt->neg = 2; 5113 NEXT; /* eat the '-' */ 5114 NEXT; /* eat the '[' */ 5115 xmlFAParseCharGroup(ctxt); 5116 if (CUR == ']') { 5117 NEXT; 5118 } else { 5119 ERROR("charClassExpr: ']' expected"); 5120 break; 5121 } 5122 ctxt->neg = neg; 5123 break; 5124 } else if (CUR != ']') { 5125 xmlFAParsePosCharGroup(ctxt); 5126 } 5127 } 5128 ctxt->neg = n; 5129 } 5130 5131 /** 5132 * xmlFAParseCharClass: 5133 * @ctxt: a regexp parser context 5134 * 5135 * [11] charClass ::= charClassEsc | charClassExpr 5136 * [12] charClassExpr ::= '[' charGroup ']' 5137 */ 5138 static void 5139 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) { 5140 if (CUR == '[') { 5141 NEXT; 5142 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES); 5143 if (ctxt->atom == NULL) 5144 return; 5145 xmlFAParseCharGroup(ctxt); 5146 if (CUR == ']') { 5147 NEXT; 5148 } else { 5149 ERROR("xmlFAParseCharClass: ']' expected"); 5150 } 5151 } else { 5152 xmlFAParseCharClassEsc(ctxt); 5153 } 5154 } 5155 5156 /** 5157 * xmlFAParseQuantExact: 5158 * @ctxt: a regexp parser context 5159 * 5160 * [8] QuantExact ::= [0-9]+ 5161 * 5162 * Returns 0 if success or -1 in case of error 5163 */ 5164 static int 5165 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) { 5166 int ret = 0; 5167 int ok = 0; 5168 5169 while ((CUR >= '0') && (CUR <= '9')) { 5170 ret = ret * 10 + (CUR - '0'); 5171 ok = 1; 5172 NEXT; 5173 } 5174 if (ok != 1) { 5175 return(-1); 5176 } 5177 return(ret); 5178 } 5179 5180 /** 5181 * xmlFAParseQuantifier: 5182 * @ctxt: a regexp parser context 5183 * 5184 * [4] quantifier ::= [?*+] | ( '{' quantity '}' ) 5185 * [5] quantity ::= quantRange | quantMin | QuantExact 5186 * [6] quantRange ::= QuantExact ',' QuantExact 5187 * [7] quantMin ::= QuantExact ',' 5188 * [8] QuantExact ::= [0-9]+ 5189 */ 5190 static int 5191 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) { 5192 int cur; 5193 5194 cur = CUR; 5195 if ((cur == '?') || (cur == '*') || (cur == '+')) { 5196 if (ctxt->atom != NULL) { 5197 if (cur == '?') 5198 ctxt->atom->quant = XML_REGEXP_QUANT_OPT; 5199 else if (cur == '*') 5200 ctxt->atom->quant = XML_REGEXP_QUANT_MULT; 5201 else if (cur == '+') 5202 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS; 5203 } 5204 NEXT; 5205 return(1); 5206 } 5207 if (cur == '{') { 5208 int min = 0, max = 0; 5209 5210 NEXT; 5211 cur = xmlFAParseQuantExact(ctxt); 5212 if (cur >= 0) 5213 min = cur; 5214 if (CUR == ',') { 5215 NEXT; 5216 if (CUR == '}') 5217 max = INT_MAX; 5218 else { 5219 cur = xmlFAParseQuantExact(ctxt); 5220 if (cur >= 0) 5221 max = cur; 5222 else { 5223 ERROR("Improper quantifier"); 5224 } 5225 } 5226 } 5227 if (CUR == '}') { 5228 NEXT; 5229 } else { 5230 ERROR("Unterminated quantifier"); 5231 } 5232 if (max == 0) 5233 max = min; 5234 if (ctxt->atom != NULL) { 5235 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE; 5236 ctxt->atom->min = min; 5237 ctxt->atom->max = max; 5238 } 5239 return(1); 5240 } 5241 return(0); 5242 } 5243 5244 /** 5245 * xmlFAParseAtom: 5246 * @ctxt: a regexp parser context 5247 * 5248 * [9] atom ::= Char | charClass | ( '(' regExp ')' ) 5249 */ 5250 static int 5251 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) { 5252 int codepoint, len; 5253 5254 codepoint = xmlFAIsChar(ctxt); 5255 if (codepoint > 0) { 5256 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 5257 if (ctxt->atom == NULL) 5258 return(-1); 5259 codepoint = CUR_SCHAR(ctxt->cur, len); 5260 ctxt->atom->codepoint = codepoint; 5261 NEXTL(len); 5262 return(1); 5263 } else if (CUR == '|') { 5264 return(0); 5265 } else if (CUR == 0) { 5266 return(0); 5267 } else if (CUR == ')') { 5268 return(0); 5269 } else if (CUR == '(') { 5270 xmlRegStatePtr start, oldend, start0; 5271 5272 NEXT; 5273 /* 5274 * this extra Epsilon transition is needed if we count with 0 allowed 5275 * unfortunately this can't be known at that point 5276 */ 5277 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5278 start0 = ctxt->state; 5279 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5280 start = ctxt->state; 5281 oldend = ctxt->end; 5282 ctxt->end = NULL; 5283 ctxt->atom = NULL; 5284 xmlFAParseRegExp(ctxt, 0); 5285 if (CUR == ')') { 5286 NEXT; 5287 } else { 5288 ERROR("xmlFAParseAtom: expecting ')'"); 5289 } 5290 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG); 5291 if (ctxt->atom == NULL) 5292 return(-1); 5293 ctxt->atom->start = start; 5294 ctxt->atom->start0 = start0; 5295 ctxt->atom->stop = ctxt->state; 5296 ctxt->end = oldend; 5297 return(1); 5298 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) { 5299 xmlFAParseCharClass(ctxt); 5300 return(1); 5301 } 5302 return(0); 5303 } 5304 5305 /** 5306 * xmlFAParsePiece: 5307 * @ctxt: a regexp parser context 5308 * 5309 * [3] piece ::= atom quantifier? 5310 */ 5311 static int 5312 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) { 5313 int ret; 5314 5315 ctxt->atom = NULL; 5316 ret = xmlFAParseAtom(ctxt); 5317 if (ret == 0) 5318 return(0); 5319 if (ctxt->atom == NULL) { 5320 ERROR("internal: no atom generated"); 5321 } 5322 xmlFAParseQuantifier(ctxt); 5323 return(1); 5324 } 5325 5326 /** 5327 * xmlFAParseBranch: 5328 * @ctxt: a regexp parser context 5329 * @to: optional target to the end of the branch 5330 * 5331 * @to is used to optimize by removing duplicate path in automata 5332 * in expressions like (a|b)(c|d) 5333 * 5334 * [2] branch ::= piece* 5335 */ 5336 static int 5337 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) { 5338 xmlRegStatePtr previous; 5339 int ret; 5340 5341 previous = ctxt->state; 5342 ret = xmlFAParsePiece(ctxt); 5343 if (ret != 0) { 5344 if (xmlFAGenerateTransitions(ctxt, previous, 5345 (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) 5346 return(-1); 5347 previous = ctxt->state; 5348 ctxt->atom = NULL; 5349 } 5350 while ((ret != 0) && (ctxt->error == 0)) { 5351 ret = xmlFAParsePiece(ctxt); 5352 if (ret != 0) { 5353 if (xmlFAGenerateTransitions(ctxt, previous, 5354 (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) 5355 return(-1); 5356 previous = ctxt->state; 5357 ctxt->atom = NULL; 5358 } 5359 } 5360 return(0); 5361 } 5362 5363 /** 5364 * xmlFAParseRegExp: 5365 * @ctxt: a regexp parser context 5366 * @top: is this the top-level expression ? 5367 * 5368 * [1] regExp ::= branch ( '|' branch )* 5369 */ 5370 static void 5371 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { 5372 xmlRegStatePtr start, end; 5373 5374 /* if not top start should have been generated by an epsilon trans */ 5375 start = ctxt->state; 5376 ctxt->end = NULL; 5377 xmlFAParseBranch(ctxt, NULL); 5378 if (top) { 5379 #ifdef DEBUG_REGEXP_GRAPH 5380 printf("State %d is final\n", ctxt->state->no); 5381 #endif 5382 ctxt->state->type = XML_REGEXP_FINAL_STATE; 5383 } 5384 if (CUR != '|') { 5385 ctxt->end = ctxt->state; 5386 return; 5387 } 5388 end = ctxt->state; 5389 while ((CUR == '|') && (ctxt->error == 0)) { 5390 NEXT; 5391 if (CUR == 0) { 5392 ERROR("expecting a branch after |") 5393 return; 5394 } 5395 ctxt->state = start; 5396 ctxt->end = NULL; 5397 xmlFAParseBranch(ctxt, end); 5398 } 5399 if (!top) { 5400 ctxt->state = end; 5401 ctxt->end = end; 5402 } 5403 } 5404 5405 /************************************************************************ 5406 * * 5407 * The basic API * 5408 * * 5409 ************************************************************************/ 5410 5411 /** 5412 * xmlRegexpPrint: 5413 * @output: the file for the output debug 5414 * @regexp: the compiled regexp 5415 * 5416 * Print the content of the compiled regular expression 5417 */ 5418 void 5419 xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) { 5420 int i; 5421 5422 if (output == NULL) 5423 return; 5424 fprintf(output, " regexp: "); 5425 if (regexp == NULL) { 5426 fprintf(output, "NULL\n"); 5427 return; 5428 } 5429 fprintf(output, "'%s' ", regexp->string); 5430 fprintf(output, "\n"); 5431 fprintf(output, "%d atoms:\n", regexp->nbAtoms); 5432 for (i = 0;i < regexp->nbAtoms; i++) { 5433 fprintf(output, " %02d ", i); 5434 xmlRegPrintAtom(output, regexp->atoms[i]); 5435 } 5436 fprintf(output, "%d states:", regexp->nbStates); 5437 fprintf(output, "\n"); 5438 for (i = 0;i < regexp->nbStates; i++) { 5439 xmlRegPrintState(output, regexp->states[i]); 5440 } 5441 fprintf(output, "%d counters:\n", regexp->nbCounters); 5442 for (i = 0;i < regexp->nbCounters; i++) { 5443 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min, 5444 regexp->counters[i].max); 5445 } 5446 } 5447 5448 /** 5449 * xmlRegexpCompile: 5450 * @regexp: a regular expression string 5451 * 5452 * Parses a regular expression conforming to XML Schemas Part 2 Datatype 5453 * Appendix F and builds an automata suitable for testing strings against 5454 * that regular expression 5455 * 5456 * Returns the compiled expression or NULL in case of error 5457 */ 5458 xmlRegexpPtr 5459 xmlRegexpCompile(const xmlChar *regexp) { 5460 xmlRegexpPtr ret; 5461 xmlRegParserCtxtPtr ctxt; 5462 5463 ctxt = xmlRegNewParserCtxt(regexp); 5464 if (ctxt == NULL) 5465 return(NULL); 5466 5467 /* initialize the parser */ 5468 ctxt->end = NULL; 5469 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5470 xmlRegStatePush(ctxt, ctxt->start); 5471 5472 /* parse the expression building an automata */ 5473 xmlFAParseRegExp(ctxt, 1); 5474 if (CUR != 0) { 5475 ERROR("xmlFAParseRegExp: extra characters"); 5476 } 5477 if (ctxt->error != 0) { 5478 xmlRegFreeParserCtxt(ctxt); 5479 return(NULL); 5480 } 5481 ctxt->end = ctxt->state; 5482 ctxt->start->type = XML_REGEXP_START_STATE; 5483 ctxt->end->type = XML_REGEXP_FINAL_STATE; 5484 5485 /* remove the Epsilon except for counted transitions */ 5486 xmlFAEliminateEpsilonTransitions(ctxt); 5487 5488 5489 if (ctxt->error != 0) { 5490 xmlRegFreeParserCtxt(ctxt); 5491 return(NULL); 5492 } 5493 ret = xmlRegEpxFromParse(ctxt); 5494 xmlRegFreeParserCtxt(ctxt); 5495 return(ret); 5496 } 5497 5498 /** 5499 * xmlRegexpExec: 5500 * @comp: the compiled regular expression 5501 * @content: the value to check against the regular expression 5502 * 5503 * Check if the regular expression generates the value 5504 * 5505 * Returns 1 if it matches, 0 if not and a negative value in case of error 5506 */ 5507 int 5508 xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) { 5509 if ((comp == NULL) || (content == NULL)) 5510 return(-1); 5511 return(xmlFARegExec(comp, content)); 5512 } 5513 5514 /** 5515 * xmlRegexpIsDeterminist: 5516 * @comp: the compiled regular expression 5517 * 5518 * Check if the regular expression is determinist 5519 * 5520 * Returns 1 if it yes, 0 if not and a negative value in case of error 5521 */ 5522 int 5523 xmlRegexpIsDeterminist(xmlRegexpPtr comp) { 5524 xmlAutomataPtr am; 5525 int ret; 5526 5527 if (comp == NULL) 5528 return(-1); 5529 if (comp->determinist != -1) 5530 return(comp->determinist); 5531 5532 am = xmlNewAutomata(); 5533 if (am->states != NULL) { 5534 int i; 5535 5536 for (i = 0;i < am->nbStates;i++) 5537 xmlRegFreeState(am->states[i]); 5538 xmlFree(am->states); 5539 } 5540 am->nbAtoms = comp->nbAtoms; 5541 am->atoms = comp->atoms; 5542 am->nbStates = comp->nbStates; 5543 am->states = comp->states; 5544 am->determinist = -1; 5545 am->flags = comp->flags; 5546 ret = xmlFAComputesDeterminism(am); 5547 am->atoms = NULL; 5548 am->states = NULL; 5549 xmlFreeAutomata(am); 5550 comp->determinist = ret; 5551 return(ret); 5552 } 5553 5554 /** 5555 * xmlRegFreeRegexp: 5556 * @regexp: the regexp 5557 * 5558 * Free a regexp 5559 */ 5560 void 5561 xmlRegFreeRegexp(xmlRegexpPtr regexp) { 5562 int i; 5563 if (regexp == NULL) 5564 return; 5565 5566 if (regexp->string != NULL) 5567 xmlFree(regexp->string); 5568 if (regexp->states != NULL) { 5569 for (i = 0;i < regexp->nbStates;i++) 5570 xmlRegFreeState(regexp->states[i]); 5571 xmlFree(regexp->states); 5572 } 5573 if (regexp->atoms != NULL) { 5574 for (i = 0;i < regexp->nbAtoms;i++) 5575 xmlRegFreeAtom(regexp->atoms[i]); 5576 xmlFree(regexp->atoms); 5577 } 5578 if (regexp->counters != NULL) 5579 xmlFree(regexp->counters); 5580 if (regexp->compact != NULL) 5581 xmlFree(regexp->compact); 5582 if (regexp->transdata != NULL) 5583 xmlFree(regexp->transdata); 5584 if (regexp->stringMap != NULL) { 5585 for (i = 0; i < regexp->nbstrings;i++) 5586 xmlFree(regexp->stringMap[i]); 5587 xmlFree(regexp->stringMap); 5588 } 5589 5590 xmlFree(regexp); 5591 } 5592 5593 #ifdef LIBXML_AUTOMATA_ENABLED 5594 /************************************************************************ 5595 * * 5596 * The Automata interface * 5597 * * 5598 ************************************************************************/ 5599 5600 /** 5601 * xmlNewAutomata: 5602 * 5603 * Create a new automata 5604 * 5605 * Returns the new object or NULL in case of failure 5606 */ 5607 xmlAutomataPtr 5608 xmlNewAutomata(void) { 5609 xmlAutomataPtr ctxt; 5610 5611 ctxt = xmlRegNewParserCtxt(NULL); 5612 if (ctxt == NULL) 5613 return(NULL); 5614 5615 /* initialize the parser */ 5616 ctxt->end = NULL; 5617 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5618 if (ctxt->start == NULL) { 5619 xmlFreeAutomata(ctxt); 5620 return(NULL); 5621 } 5622 ctxt->start->type = XML_REGEXP_START_STATE; 5623 if (xmlRegStatePush(ctxt, ctxt->start) < 0) { 5624 xmlRegFreeState(ctxt->start); 5625 xmlFreeAutomata(ctxt); 5626 return(NULL); 5627 } 5628 ctxt->flags = 0; 5629 5630 return(ctxt); 5631 } 5632 5633 /** 5634 * xmlFreeAutomata: 5635 * @am: an automata 5636 * 5637 * Free an automata 5638 */ 5639 void 5640 xmlFreeAutomata(xmlAutomataPtr am) { 5641 if (am == NULL) 5642 return; 5643 xmlRegFreeParserCtxt(am); 5644 } 5645 5646 /** 5647 * xmlAutomataSetFlags: 5648 * @am: an automata 5649 * @flags: a set of internal flags 5650 * 5651 * Set some flags on the automata 5652 */ 5653 void 5654 xmlAutomataSetFlags(xmlAutomataPtr am, int flags) { 5655 if (am == NULL) 5656 return; 5657 am->flags |= flags; 5658 } 5659 5660 /** 5661 * xmlAutomataGetInitState: 5662 * @am: an automata 5663 * 5664 * Initial state lookup 5665 * 5666 * Returns the initial state of the automata 5667 */ 5668 xmlAutomataStatePtr 5669 xmlAutomataGetInitState(xmlAutomataPtr am) { 5670 if (am == NULL) 5671 return(NULL); 5672 return(am->start); 5673 } 5674 5675 /** 5676 * xmlAutomataSetFinalState: 5677 * @am: an automata 5678 * @state: a state in this automata 5679 * 5680 * Makes that state a final state 5681 * 5682 * Returns 0 or -1 in case of error 5683 */ 5684 int 5685 xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) { 5686 if ((am == NULL) || (state == NULL)) 5687 return(-1); 5688 state->type = XML_REGEXP_FINAL_STATE; 5689 return(0); 5690 } 5691 5692 /** 5693 * xmlAutomataNewTransition: 5694 * @am: an automata 5695 * @from: the starting point of the transition 5696 * @to: the target point of the transition or NULL 5697 * @token: the input string associated to that transition 5698 * @data: data passed to the callback function if the transition is activated 5699 * 5700 * If @to is NULL, this creates first a new target state in the automata 5701 * and then adds a transition from the @from state to the target state 5702 * activated by the value of @token 5703 * 5704 * Returns the target state or NULL in case of error 5705 */ 5706 xmlAutomataStatePtr 5707 xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from, 5708 xmlAutomataStatePtr to, const xmlChar *token, 5709 void *data) { 5710 xmlRegAtomPtr atom; 5711 5712 if ((am == NULL) || (from == NULL) || (token == NULL)) 5713 return(NULL); 5714 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5715 if (atom == NULL) 5716 return(NULL); 5717 atom->data = data; 5718 atom->valuep = xmlStrdup(token); 5719 5720 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5721 xmlRegFreeAtom(atom); 5722 return(NULL); 5723 } 5724 if (to == NULL) 5725 return(am->state); 5726 return(to); 5727 } 5728 5729 /** 5730 * xmlAutomataNewTransition2: 5731 * @am: an automata 5732 * @from: the starting point of the transition 5733 * @to: the target point of the transition or NULL 5734 * @token: the first input string associated to that transition 5735 * @token2: the second input string associated to that transition 5736 * @data: data passed to the callback function if the transition is activated 5737 * 5738 * If @to is NULL, this creates first a new target state in the automata 5739 * and then adds a transition from the @from state to the target state 5740 * activated by the value of @token 5741 * 5742 * Returns the target state or NULL in case of error 5743 */ 5744 xmlAutomataStatePtr 5745 xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5746 xmlAutomataStatePtr to, const xmlChar *token, 5747 const xmlChar *token2, void *data) { 5748 xmlRegAtomPtr atom; 5749 5750 if ((am == NULL) || (from == NULL) || (token == NULL)) 5751 return(NULL); 5752 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5753 if (atom == NULL) 5754 return(NULL); 5755 atom->data = data; 5756 if ((token2 == NULL) || (*token2 == 0)) { 5757 atom->valuep = xmlStrdup(token); 5758 } else { 5759 int lenn, lenp; 5760 xmlChar *str; 5761 5762 lenn = strlen((char *) token2); 5763 lenp = strlen((char *) token); 5764 5765 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5766 if (str == NULL) { 5767 xmlRegFreeAtom(atom); 5768 return(NULL); 5769 } 5770 memcpy(&str[0], token, lenp); 5771 str[lenp] = '|'; 5772 memcpy(&str[lenp + 1], token2, lenn); 5773 str[lenn + lenp + 1] = 0; 5774 5775 atom->valuep = str; 5776 } 5777 5778 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5779 xmlRegFreeAtom(atom); 5780 return(NULL); 5781 } 5782 if (to == NULL) 5783 return(am->state); 5784 return(to); 5785 } 5786 5787 /** 5788 * xmlAutomataNewNegTrans: 5789 * @am: an automata 5790 * @from: the starting point of the transition 5791 * @to: the target point of the transition or NULL 5792 * @token: the first input string associated to that transition 5793 * @token2: the second input string associated to that transition 5794 * @data: data passed to the callback function if the transition is activated 5795 * 5796 * If @to is NULL, this creates first a new target state in the automata 5797 * and then adds a transition from the @from state to the target state 5798 * activated by any value except (@token,@token2) 5799 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow 5800 # the semantic of XSD ##other 5801 * 5802 * Returns the target state or NULL in case of error 5803 */ 5804 xmlAutomataStatePtr 5805 xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5806 xmlAutomataStatePtr to, const xmlChar *token, 5807 const xmlChar *token2, void *data) { 5808 xmlRegAtomPtr atom; 5809 xmlChar err_msg[200]; 5810 5811 if ((am == NULL) || (from == NULL) || (token == NULL)) 5812 return(NULL); 5813 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5814 if (atom == NULL) 5815 return(NULL); 5816 atom->data = data; 5817 atom->neg = 1; 5818 if ((token2 == NULL) || (*token2 == 0)) { 5819 atom->valuep = xmlStrdup(token); 5820 } else { 5821 int lenn, lenp; 5822 xmlChar *str; 5823 5824 lenn = strlen((char *) token2); 5825 lenp = strlen((char *) token); 5826 5827 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5828 if (str == NULL) { 5829 xmlRegFreeAtom(atom); 5830 return(NULL); 5831 } 5832 memcpy(&str[0], token, lenp); 5833 str[lenp] = '|'; 5834 memcpy(&str[lenp + 1], token2, lenn); 5835 str[lenn + lenp + 1] = 0; 5836 5837 atom->valuep = str; 5838 } 5839 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep); 5840 err_msg[199] = 0; 5841 atom->valuep2 = xmlStrdup(err_msg); 5842 5843 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5844 xm