1 /* 2 * regexp.c: generic and extensible Regular Expression engine 3 * 4 * Basically designed with the purpose of compiling regexps for 5 * the variety of validation/shemas mechanisms now available in 6 * XML related specifications these include: 7 * - XML-1.0 DTD validation 8 * - XML Schemas structure part 1 9 * - XML Schemas Datatypes part 2 especially Appendix F 10 * - RELAX-NG/TREX i.e. the counter proposal 11 * 12 * See Copyright for the status of this software. 13 * 14 * Daniel Veillard <veillard (at) redhat.com> 15 */ 16 17 #define IN_LIBXML 18 #include "libxml.h" 19 20 #ifdef LIBXML_REGEXP_ENABLED 21 22 /* #define DEBUG_ERR */ 23 24 #include <stdio.h> 25 #include <string.h> 26 #ifdef HAVE_LIMITS_H 27 #include <limits.h> 28 #endif 29 30 #include <libxml/tree.h> 31 #include <libxml/parserInternals.h> 32 #include <libxml/xmlregexp.h> 33 #include <libxml/xmlautomata.h> 34 #include <libxml/xmlunicode.h> 35 36 #ifndef INT_MAX 37 #define INT_MAX 123456789 /* easy to flag and big enough for our needs */ 38 #endif 39 40 /* #define DEBUG_REGEXP_GRAPH */ 41 /* #define DEBUG_REGEXP_EXEC */ 42 /* #define DEBUG_PUSH */ 43 /* #define DEBUG_COMPACTION */ 44 45 #define MAX_PUSH 10000000 46 47 #ifdef ERROR 48 #undef ERROR 49 #endif 50 #define ERROR(str) \ 51 ctxt->error = XML_REGEXP_COMPILE_ERROR; \ 52 xmlRegexpErrCompile(ctxt, str); 53 #define NEXT ctxt->cur++ 54 #define CUR (*(ctxt->cur)) 55 #define NXT(index) (ctxt->cur[index]) 56 57 #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l) 58 #define NEXTL(l) ctxt->cur += l; 59 #define XML_REG_STRING_SEPARATOR '|' 60 /* 61 * Need PREV to check on a '-' within a Character Group. May only be used 62 * when it's guaranteed that cur is not at the beginning of ctxt->string! 63 */ 64 #define PREV (ctxt->cur[-1]) 65 66 /** 67 * TODO: 68 * 69 * macro to flag unimplemented blocks 70 */ 71 #define TODO \ 72 xmlGenericError(xmlGenericErrorContext, \ 73 "Unimplemented block at %s:%d\n", \ 74 __FILE__, __LINE__); 75 76 /************************************************************************ 77 * * 78 * Datatypes and structures * 79 * * 80 ************************************************************************/ 81 82 /* 83 * Note: the order of the enums below is significant, do not shuffle 84 */ 85 typedef enum { 86 XML_REGEXP_EPSILON = 1, 87 XML_REGEXP_CHARVAL, 88 XML_REGEXP_RANGES, 89 XML_REGEXP_SUBREG, /* used for () sub regexps */ 90 XML_REGEXP_STRING, 91 XML_REGEXP_ANYCHAR, /* . */ 92 XML_REGEXP_ANYSPACE, /* \s */ 93 XML_REGEXP_NOTSPACE, /* \S */ 94 XML_REGEXP_INITNAME, /* \l */ 95 XML_REGEXP_NOTINITNAME, /* \L */ 96 XML_REGEXP_NAMECHAR, /* \c */ 97 XML_REGEXP_NOTNAMECHAR, /* \C */ 98 XML_REGEXP_DECIMAL, /* \d */ 99 XML_REGEXP_NOTDECIMAL, /* \D */ 100 XML_REGEXP_REALCHAR, /* \w */ 101 XML_REGEXP_NOTREALCHAR, /* \W */ 102 XML_REGEXP_LETTER = 100, 103 XML_REGEXP_LETTER_UPPERCASE, 104 XML_REGEXP_LETTER_LOWERCASE, 105 XML_REGEXP_LETTER_TITLECASE, 106 XML_REGEXP_LETTER_MODIFIER, 107 XML_REGEXP_LETTER_OTHERS, 108 XML_REGEXP_MARK, 109 XML_REGEXP_MARK_NONSPACING, 110 XML_REGEXP_MARK_SPACECOMBINING, 111 XML_REGEXP_MARK_ENCLOSING, 112 XML_REGEXP_NUMBER, 113 XML_REGEXP_NUMBER_DECIMAL, 114 XML_REGEXP_NUMBER_LETTER, 115 XML_REGEXP_NUMBER_OTHERS, 116 XML_REGEXP_PUNCT, 117 XML_REGEXP_PUNCT_CONNECTOR, 118 XML_REGEXP_PUNCT_DASH, 119 XML_REGEXP_PUNCT_OPEN, 120 XML_REGEXP_PUNCT_CLOSE, 121 XML_REGEXP_PUNCT_INITQUOTE, 122 XML_REGEXP_PUNCT_FINQUOTE, 123 XML_REGEXP_PUNCT_OTHERS, 124 XML_REGEXP_SEPAR, 125 XML_REGEXP_SEPAR_SPACE, 126 XML_REGEXP_SEPAR_LINE, 127 XML_REGEXP_SEPAR_PARA, 128 XML_REGEXP_SYMBOL, 129 XML_REGEXP_SYMBOL_MATH, 130 XML_REGEXP_SYMBOL_CURRENCY, 131 XML_REGEXP_SYMBOL_MODIFIER, 132 XML_REGEXP_SYMBOL_OTHERS, 133 XML_REGEXP_OTHER, 134 XML_REGEXP_OTHER_CONTROL, 135 XML_REGEXP_OTHER_FORMAT, 136 XML_REGEXP_OTHER_PRIVATE, 137 XML_REGEXP_OTHER_NA, 138 XML_REGEXP_BLOCK_NAME 139 } xmlRegAtomType; 140 141 typedef enum { 142 XML_REGEXP_QUANT_EPSILON = 1, 143 XML_REGEXP_QUANT_ONCE, 144 XML_REGEXP_QUANT_OPT, 145 XML_REGEXP_QUANT_MULT, 146 XML_REGEXP_QUANT_PLUS, 147 XML_REGEXP_QUANT_ONCEONLY, 148 XML_REGEXP_QUANT_ALL, 149 XML_REGEXP_QUANT_RANGE 150 } xmlRegQuantType; 151 152 typedef enum { 153 XML_REGEXP_START_STATE = 1, 154 XML_REGEXP_FINAL_STATE, 155 XML_REGEXP_TRANS_STATE, 156 XML_REGEXP_SINK_STATE, 157 XML_REGEXP_UNREACH_STATE 158 } xmlRegStateType; 159 160 typedef enum { 161 XML_REGEXP_MARK_NORMAL = 0, 162 XML_REGEXP_MARK_START, 163 XML_REGEXP_MARK_VISITED 164 } xmlRegMarkedType; 165 166 typedef struct _xmlRegRange xmlRegRange; 167 typedef xmlRegRange *xmlRegRangePtr; 168 169 struct _xmlRegRange { 170 int neg; /* 0 normal, 1 not, 2 exclude */ 171 xmlRegAtomType type; 172 int start; 173 int end; 174 xmlChar *blockName; 175 }; 176 177 typedef struct _xmlRegAtom xmlRegAtom; 178 typedef xmlRegAtom *xmlRegAtomPtr; 179 180 typedef struct _xmlAutomataState xmlRegState; 181 typedef xmlRegState *xmlRegStatePtr; 182 183 struct _xmlRegAtom { 184 int no; 185 xmlRegAtomType type; 186 xmlRegQuantType quant; 187 int min; 188 int max; 189 190 void *valuep; 191 void *valuep2; 192 int neg; 193 int codepoint; 194 xmlRegStatePtr start; 195 xmlRegStatePtr start0; 196 xmlRegStatePtr stop; 197 int maxRanges; 198 int nbRanges; 199 xmlRegRangePtr *ranges; 200 void *data; 201 }; 202 203 typedef struct _xmlRegCounter xmlRegCounter; 204 typedef xmlRegCounter *xmlRegCounterPtr; 205 206 struct _xmlRegCounter { 207 int min; 208 int max; 209 }; 210 211 typedef struct _xmlRegTrans xmlRegTrans; 212 typedef xmlRegTrans *xmlRegTransPtr; 213 214 struct _xmlRegTrans { 215 xmlRegAtomPtr atom; 216 int to; 217 int counter; 218 int count; 219 int nd; 220 }; 221 222 struct _xmlAutomataState { 223 xmlRegStateType type; 224 xmlRegMarkedType mark; 225 xmlRegMarkedType markd; 226 xmlRegMarkedType reached; 227 int no; 228 int maxTrans; 229 int nbTrans; 230 xmlRegTrans *trans; 231 /* knowing states ponting to us can speed things up */ 232 int maxTransTo; 233 int nbTransTo; 234 int *transTo; 235 }; 236 237 typedef struct _xmlAutomata xmlRegParserCtxt; 238 typedef xmlRegParserCtxt *xmlRegParserCtxtPtr; 239 240 #define AM_AUTOMATA_RNG 1 241 242 struct _xmlAutomata { 243 xmlChar *string; 244 xmlChar *cur; 245 246 int error; 247 int neg; 248 249 xmlRegStatePtr start; 250 xmlRegStatePtr end; 251 xmlRegStatePtr state; 252 253 xmlRegAtomPtr atom; 254 255 int maxAtoms; 256 int nbAtoms; 257 xmlRegAtomPtr *atoms; 258 259 int maxStates; 260 int nbStates; 261 xmlRegStatePtr *states; 262 263 int maxCounters; 264 int nbCounters; 265 xmlRegCounter *counters; 266 267 int determinist; 268 int negs; 269 int flags; 270 }; 271 272 struct _xmlRegexp { 273 xmlChar *string; 274 int nbStates; 275 xmlRegStatePtr *states; 276 int nbAtoms; 277 xmlRegAtomPtr *atoms; 278 int nbCounters; 279 xmlRegCounter *counters; 280 int determinist; 281 int flags; 282 /* 283 * That's the compact form for determinists automatas 284 */ 285 int nbstates; 286 int *compact; 287 void **transdata; 288 int nbstrings; 289 xmlChar **stringMap; 290 }; 291 292 typedef struct _xmlRegExecRollback xmlRegExecRollback; 293 typedef xmlRegExecRollback *xmlRegExecRollbackPtr; 294 295 struct _xmlRegExecRollback { 296 xmlRegStatePtr state;/* the current state */ 297 int index; /* the index in the input stack */ 298 int nextbranch; /* the next transition to explore in that state */ 299 int *counts; /* save the automata state if it has some */ 300 }; 301 302 typedef struct _xmlRegInputToken xmlRegInputToken; 303 typedef xmlRegInputToken *xmlRegInputTokenPtr; 304 305 struct _xmlRegInputToken { 306 xmlChar *value; 307 void *data; 308 }; 309 310 struct _xmlRegExecCtxt { 311 int status; /* execution status != 0 indicate an error */ 312 int determinist; /* did we find an indeterministic behaviour */ 313 xmlRegexpPtr comp; /* the compiled regexp */ 314 xmlRegExecCallbacks callback; 315 void *data; 316 317 xmlRegStatePtr state;/* the current state */ 318 int transno; /* the current transition on that state */ 319 int transcount; /* the number of chars in char counted transitions */ 320 321 /* 322 * A stack of rollback states 323 */ 324 int maxRollbacks; 325 int nbRollbacks; 326 xmlRegExecRollback *rollbacks; 327 328 /* 329 * The state of the automata if any 330 */ 331 int *counts; 332 333 /* 334 * The input stack 335 */ 336 int inputStackMax; 337 int inputStackNr; 338 int index; 339 int *charStack; 340 const xmlChar *inputString; /* when operating on characters */ 341 xmlRegInputTokenPtr inputStack;/* when operating on strings */ 342 343 /* 344 * error handling 345 */ 346 int errStateNo; /* the error state number */ 347 xmlRegStatePtr errState; /* the error state */ 348 xmlChar *errString; /* the string raising the error */ 349 int *errCounts; /* counters at the error state */ 350 int nbPush; 351 }; 352 353 #define REGEXP_ALL_COUNTER 0x123456 354 #define REGEXP_ALL_LAX_COUNTER 0x123457 355 356 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); 357 static void xmlRegFreeState(xmlRegStatePtr state); 358 static void xmlRegFreeAtom(xmlRegAtomPtr atom); 359 static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr); 360 static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint); 361 static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, 362 int neg, int start, int end, const xmlChar *blockName); 363 364 void xmlAutomataSetFlags(xmlAutomataPtr am, int flags); 365 366 /************************************************************************ 367 * * 368 * Regexp memory error handler * 369 * * 370 ************************************************************************/ 371 /** 372 * xmlRegexpErrMemory: 373 * @extra: extra information 374 * 375 * Handle an out of memory condition 376 */ 377 static void 378 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra) 379 { 380 const char *regexp = NULL; 381 if (ctxt != NULL) { 382 regexp = (const char *) ctxt->string; 383 ctxt->error = XML_ERR_NO_MEMORY; 384 } 385 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 386 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 387 regexp, NULL, 0, 0, 388 "Memory allocation failed : %s\n", extra); 389 } 390 391 /** 392 * xmlRegexpErrCompile: 393 * @extra: extra information 394 * 395 * Handle a compilation failure 396 */ 397 static void 398 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra) 399 { 400 const char *regexp = NULL; 401 int idx = 0; 402 403 if (ctxt != NULL) { 404 regexp = (const char *) ctxt->string; 405 idx = ctxt->cur - ctxt->string; 406 ctxt->error = XML_REGEXP_COMPILE_ERROR; 407 } 408 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 409 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra, 410 regexp, NULL, idx, 0, 411 "failed to compile: %s\n", extra); 412 } 413 414 /************************************************************************ 415 * * 416 * Allocation/Deallocation * 417 * * 418 ************************************************************************/ 419 420 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt); 421 /** 422 * xmlRegEpxFromParse: 423 * @ctxt: the parser context used to build it 424 * 425 * Allocate a new regexp and fill it with the result from the parser 426 * 427 * Returns the new regexp or NULL in case of error 428 */ 429 static xmlRegexpPtr 430 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { 431 xmlRegexpPtr ret; 432 433 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp)); 434 if (ret == NULL) { 435 xmlRegexpErrMemory(ctxt, "compiling regexp"); 436 return(NULL); 437 } 438 memset(ret, 0, sizeof(xmlRegexp)); 439 ret->string = ctxt->string; 440 ret->nbStates = ctxt->nbStates; 441 ret->states = ctxt->states; 442 ret->nbAtoms = ctxt->nbAtoms; 443 ret->atoms = ctxt->atoms; 444 ret->nbCounters = ctxt->nbCounters; 445 ret->counters = ctxt->counters; 446 ret->determinist = ctxt->determinist; 447 ret->flags = ctxt->flags; 448 if (ret->determinist == -1) { 449 xmlRegexpIsDeterminist(ret); 450 } 451 452 if ((ret->determinist != 0) && 453 (ret->nbCounters == 0) && 454 (ctxt->negs == 0) && 455 (ret->atoms != NULL) && 456 (ret->atoms[0] != NULL) && 457 (ret->atoms[0]->type == XML_REGEXP_STRING)) { 458 int i, j, nbstates = 0, nbatoms = 0; 459 int *stateRemap; 460 int *stringRemap; 461 int *transitions; 462 void **transdata; 463 xmlChar **stringMap; 464 xmlChar *value; 465 466 /* 467 * Switch to a compact representation 468 * 1/ counting the effective number of states left 469 * 2/ counting the unique number of atoms, and check that 470 * they are all of the string type 471 * 3/ build a table state x atom for the transitions 472 */ 473 474 stateRemap = xmlMalloc(ret->nbStates * sizeof(int)); 475 if (stateRemap == NULL) { 476 xmlRegexpErrMemory(ctxt, "compiling regexp"); 477 xmlFree(ret); 478 return(NULL); 479 } 480 for (i = 0;i < ret->nbStates;i++) { 481 if (ret->states[i] != NULL) { 482 stateRemap[i] = nbstates; 483 nbstates++; 484 } else { 485 stateRemap[i] = -1; 486 } 487 } 488 #ifdef DEBUG_COMPACTION 489 printf("Final: %d states\n", nbstates); 490 #endif 491 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *)); 492 if (stringMap == NULL) { 493 xmlRegexpErrMemory(ctxt, "compiling regexp"); 494 xmlFree(stateRemap); 495 xmlFree(ret); 496 return(NULL); 497 } 498 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int)); 499 if (stringRemap == NULL) { 500 xmlRegexpErrMemory(ctxt, "compiling regexp"); 501 xmlFree(stringMap); 502 xmlFree(stateRemap); 503 xmlFree(ret); 504 return(NULL); 505 } 506 for (i = 0;i < ret->nbAtoms;i++) { 507 if ((ret->atoms[i]->type == XML_REGEXP_STRING) && 508 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) { 509 value = ret->atoms[i]->valuep; 510 for (j = 0;j < nbatoms;j++) { 511 if (xmlStrEqual(stringMap[j], value)) { 512 stringRemap[i] = j; 513 break; 514 } 515 } 516 if (j >= nbatoms) { 517 stringRemap[i] = nbatoms; 518 stringMap[nbatoms] = xmlStrdup(value); 519 if (stringMap[nbatoms] == NULL) { 520 for (i = 0;i < nbatoms;i++) 521 xmlFree(stringMap[i]); 522 xmlFree(stringRemap); 523 xmlFree(stringMap); 524 xmlFree(stateRemap); 525 xmlFree(ret); 526 return(NULL); 527 } 528 nbatoms++; 529 } 530 } else { 531 xmlFree(stateRemap); 532 xmlFree(stringRemap); 533 for (i = 0;i < nbatoms;i++) 534 xmlFree(stringMap[i]); 535 xmlFree(stringMap); 536 xmlFree(ret); 537 return(NULL); 538 } 539 } 540 #ifdef DEBUG_COMPACTION 541 printf("Final: %d atoms\n", nbatoms); 542 #endif 543 transitions = (int *) xmlMalloc((nbstates + 1) * 544 (nbatoms + 1) * sizeof(int)); 545 if (transitions == NULL) { 546 xmlFree(stateRemap); 547 xmlFree(stringRemap); 548 xmlFree(stringMap); 549 xmlFree(ret); 550 return(NULL); 551 } 552 memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int)); 553 554 /* 555 * Allocate the transition table. The first entry for each 556 * state corresponds to the state type. 557 */ 558 transdata = NULL; 559 560 for (i = 0;i < ret->nbStates;i++) { 561 int stateno, atomno, targetno, prev; 562 xmlRegStatePtr state; 563 xmlRegTransPtr trans; 564 565 stateno = stateRemap[i]; 566 if (stateno == -1) 567 continue; 568 state = ret->states[i]; 569 570 transitions[stateno * (nbatoms + 1)] = state->type; 571 572 for (j = 0;j < state->nbTrans;j++) { 573 trans = &(state->trans[j]); 574 if ((trans->to == -1) || (trans->atom == NULL)) 575 continue; 576 atomno = stringRemap[trans->atom->no]; 577 if ((trans->atom->data != NULL) && (transdata == NULL)) { 578 transdata = (void **) xmlMalloc(nbstates * nbatoms * 579 sizeof(void *)); 580 if (transdata != NULL) 581 memset(transdata, 0, 582 nbstates * nbatoms * sizeof(void *)); 583 else { 584 xmlRegexpErrMemory(ctxt, "compiling regexp"); 585 break; 586 } 587 } 588 targetno = stateRemap[trans->to]; 589 /* 590 * if the same atom can generate transitions to 2 different 591 * states then it means the automata is not determinist and 592 * the compact form can't be used ! 593 */ 594 prev = transitions[stateno * (nbatoms + 1) + atomno + 1]; 595 if (prev != 0) { 596 if (prev != targetno + 1) { 597 ret->determinist = 0; 598 #ifdef DEBUG_COMPACTION 599 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n", 600 i, j, trans->atom->no, trans->to, atomno, targetno); 601 printf(" previous to is %d\n", prev); 602 #endif 603 if (transdata != NULL) 604 xmlFree(transdata); 605 xmlFree(transitions); 606 xmlFree(stateRemap); 607 xmlFree(stringRemap); 608 for (i = 0;i < nbatoms;i++) 609 xmlFree(stringMap[i]); 610 xmlFree(stringMap); 611 goto not_determ; 612 } 613 } else { 614 #if 0 615 printf("State %d trans %d: atom %d to %d : %d to %d\n", 616 i, j, trans->atom->no, trans->to, atomno, targetno); 617 #endif 618 transitions[stateno * (nbatoms + 1) + atomno + 1] = 619 targetno + 1; /* to avoid 0 */ 620 if (transdata != NULL) 621 transdata[stateno * nbatoms + atomno] = 622 trans->atom->data; 623 } 624 } 625 } 626 ret->determinist = 1; 627 #ifdef DEBUG_COMPACTION 628 /* 629 * Debug 630 */ 631 for (i = 0;i < nbstates;i++) { 632 for (j = 0;j < nbatoms + 1;j++) { 633 printf("%02d ", transitions[i * (nbatoms + 1) + j]); 634 } 635 printf("\n"); 636 } 637 printf("\n"); 638 #endif 639 /* 640 * Cleanup of the old data 641 */ 642 if (ret->states != NULL) { 643 for (i = 0;i < ret->nbStates;i++) 644 xmlRegFreeState(ret->states[i]); 645 xmlFree(ret->states); 646 } 647 ret->states = NULL; 648 ret->nbStates = 0; 649 if (ret->atoms != NULL) { 650 for (i = 0;i < ret->nbAtoms;i++) 651 xmlRegFreeAtom(ret->atoms[i]); 652 xmlFree(ret->atoms); 653 } 654 ret->atoms = NULL; 655 ret->nbAtoms = 0; 656 657 ret->compact = transitions; 658 ret->transdata = transdata; 659 ret->stringMap = stringMap; 660 ret->nbstrings = nbatoms; 661 ret->nbstates = nbstates; 662 xmlFree(stateRemap); 663 xmlFree(stringRemap); 664 } 665 not_determ: 666 ctxt->string = NULL; 667 ctxt->nbStates = 0; 668 ctxt->states = NULL; 669 ctxt->nbAtoms = 0; 670 ctxt->atoms = NULL; 671 ctxt->nbCounters = 0; 672 ctxt->counters = NULL; 673 return(ret); 674 } 675 676 /** 677 * xmlRegNewParserCtxt: 678 * @string: the string to parse 679 * 680 * Allocate a new regexp parser context 681 * 682 * Returns the new context or NULL in case of error 683 */ 684 static xmlRegParserCtxtPtr 685 xmlRegNewParserCtxt(const xmlChar *string) { 686 xmlRegParserCtxtPtr ret; 687 688 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt)); 689 if (ret == NULL) 690 return(NULL); 691 memset(ret, 0, sizeof(xmlRegParserCtxt)); 692 if (string != NULL) 693 ret->string = xmlStrdup(string); 694 ret->cur = ret->string; 695 ret->neg = 0; 696 ret->negs = 0; 697 ret->error = 0; 698 ret->determinist = -1; 699 return(ret); 700 } 701 702 /** 703 * xmlRegNewRange: 704 * @ctxt: the regexp parser context 705 * @neg: is that negative 706 * @type: the type of range 707 * @start: the start codepoint 708 * @end: the end codepoint 709 * 710 * Allocate a new regexp range 711 * 712 * Returns the new range or NULL in case of error 713 */ 714 static xmlRegRangePtr 715 xmlRegNewRange(xmlRegParserCtxtPtr ctxt, 716 int neg, xmlRegAtomType type, int start, int end) { 717 xmlRegRangePtr ret; 718 719 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange)); 720 if (ret == NULL) { 721 xmlRegexpErrMemory(ctxt, "allocating range"); 722 return(NULL); 723 } 724 ret->neg = neg; 725 ret->type = type; 726 ret->start = start; 727 ret->end = end; 728 return(ret); 729 } 730 731 /** 732 * xmlRegFreeRange: 733 * @range: the regexp range 734 * 735 * Free a regexp range 736 */ 737 static void 738 xmlRegFreeRange(xmlRegRangePtr range) { 739 if (range == NULL) 740 return; 741 742 if (range->blockName != NULL) 743 xmlFree(range->blockName); 744 xmlFree(range); 745 } 746 747 /** 748 * xmlRegCopyRange: 749 * @range: the regexp range 750 * 751 * Copy a regexp range 752 * 753 * Returns the new copy or NULL in case of error. 754 */ 755 static xmlRegRangePtr 756 xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) { 757 xmlRegRangePtr ret; 758 759 if (range == NULL) 760 return(NULL); 761 762 ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start, 763 range->end); 764 if (ret == NULL) 765 return(NULL); 766 if (range->blockName != NULL) { 767 ret->blockName = xmlStrdup(range->blockName); 768 if (ret->blockName == NULL) { 769 xmlRegexpErrMemory(ctxt, "allocating range"); 770 xmlRegFreeRange(ret); 771 return(NULL); 772 } 773 } 774 return(ret); 775 } 776 777 /** 778 * xmlRegNewAtom: 779 * @ctxt: the regexp parser context 780 * @type: the type of atom 781 * 782 * Allocate a new atom 783 * 784 * Returns the new atom or NULL in case of error 785 */ 786 static xmlRegAtomPtr 787 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) { 788 xmlRegAtomPtr ret; 789 790 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 791 if (ret == NULL) { 792 xmlRegexpErrMemory(ctxt, "allocating atom"); 793 return(NULL); 794 } 795 memset(ret, 0, sizeof(xmlRegAtom)); 796 ret->type = type; 797 ret->quant = XML_REGEXP_QUANT_ONCE; 798 ret->min = 0; 799 ret->max = 0; 800 return(ret); 801 } 802 803 /** 804 * xmlRegFreeAtom: 805 * @atom: the regexp atom 806 * 807 * Free a regexp atom 808 */ 809 static void 810 xmlRegFreeAtom(xmlRegAtomPtr atom) { 811 int i; 812 813 if (atom == NULL) 814 return; 815 816 for (i = 0;i < atom->nbRanges;i++) 817 xmlRegFreeRange(atom->ranges[i]); 818 if (atom->ranges != NULL) 819 xmlFree(atom->ranges); 820 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL)) 821 xmlFree(atom->valuep); 822 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL)) 823 xmlFree(atom->valuep2); 824 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL)) 825 xmlFree(atom->valuep); 826 xmlFree(atom); 827 } 828 829 /** 830 * xmlRegCopyAtom: 831 * @ctxt: the regexp parser context 832 * @atom: the oiginal atom 833 * 834 * Allocate a new regexp range 835 * 836 * Returns the new atom or NULL in case of error 837 */ 838 static xmlRegAtomPtr 839 xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 840 xmlRegAtomPtr ret; 841 842 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 843 if (ret == NULL) { 844 xmlRegexpErrMemory(ctxt, "copying atom"); 845 return(NULL); 846 } 847 memset(ret, 0, sizeof(xmlRegAtom)); 848 ret->type = atom->type; 849 ret->quant = atom->quant; 850 ret->min = atom->min; 851 ret->max = atom->max; 852 if (atom->nbRanges > 0) { 853 int i; 854 855 ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) * 856 atom->nbRanges); 857 if (ret->ranges == NULL) { 858 xmlRegexpErrMemory(ctxt, "copying atom"); 859 goto error; 860 } 861 for (i = 0;i < atom->nbRanges;i++) { 862 ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]); 863 if (ret->ranges[i] == NULL) 864 goto error; 865 ret->nbRanges = i + 1; 866 } 867 } 868 return(ret); 869 870 error: 871 xmlRegFreeAtom(ret); 872 return(NULL); 873 } 874 875 static xmlRegStatePtr 876 xmlRegNewState(xmlRegParserCtxtPtr ctxt) { 877 xmlRegStatePtr ret; 878 879 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState)); 880 if (ret == NULL) { 881 xmlRegexpErrMemory(ctxt, "allocating state"); 882 return(NULL); 883 } 884 memset(ret, 0, sizeof(xmlRegState)); 885 ret->type = XML_REGEXP_TRANS_STATE; 886 ret->mark = XML_REGEXP_MARK_NORMAL; 887 return(ret); 888 } 889 890 /** 891 * xmlRegFreeState: 892 * @state: the regexp state 893 * 894 * Free a regexp state 895 */ 896 static void 897 xmlRegFreeState(xmlRegStatePtr state) { 898 if (state == NULL) 899 return; 900 901 if (state->trans != NULL) 902 xmlFree(state->trans); 903 if (state->transTo != NULL) 904 xmlFree(state->transTo); 905 xmlFree(state); 906 } 907 908 /** 909 * xmlRegFreeParserCtxt: 910 * @ctxt: the regexp parser context 911 * 912 * Free a regexp parser context 913 */ 914 static void 915 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) { 916 int i; 917 if (ctxt == NULL) 918 return; 919 920 if (ctxt->string != NULL) 921 xmlFree(ctxt->string); 922 if (ctxt->states != NULL) { 923 for (i = 0;i < ctxt->nbStates;i++) 924 xmlRegFreeState(ctxt->states[i]); 925 xmlFree(ctxt->states); 926 } 927 if (ctxt->atoms != NULL) { 928 for (i = 0;i < ctxt->nbAtoms;i++) 929 xmlRegFreeAtom(ctxt->atoms[i]); 930 xmlFree(ctxt->atoms); 931 } 932 if (ctxt->counters != NULL) 933 xmlFree(ctxt->counters); 934 xmlFree(ctxt); 935 } 936 937 /************************************************************************ 938 * * 939 * Display of Data structures * 940 * * 941 ************************************************************************/ 942 943 static void 944 xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) { 945 switch (type) { 946 case XML_REGEXP_EPSILON: 947 fprintf(output, "epsilon "); break; 948 case XML_REGEXP_CHARVAL: 949 fprintf(output, "charval "); break; 950 case XML_REGEXP_RANGES: 951 fprintf(output, "ranges "); break; 952 case XML_REGEXP_SUBREG: 953 fprintf(output, "subexpr "); break; 954 case XML_REGEXP_STRING: 955 fprintf(output, "string "); break; 956 case XML_REGEXP_ANYCHAR: 957 fprintf(output, "anychar "); break; 958 case XML_REGEXP_ANYSPACE: 959 fprintf(output, "anyspace "); break; 960 case XML_REGEXP_NOTSPACE: 961 fprintf(output, "notspace "); break; 962 case XML_REGEXP_INITNAME: 963 fprintf(output, "initname "); break; 964 case XML_REGEXP_NOTINITNAME: 965 fprintf(output, "notinitname "); break; 966 case XML_REGEXP_NAMECHAR: 967 fprintf(output, "namechar "); break; 968 case XML_REGEXP_NOTNAMECHAR: 969 fprintf(output, "notnamechar "); break; 970 case XML_REGEXP_DECIMAL: 971 fprintf(output, "decimal "); break; 972 case XML_REGEXP_NOTDECIMAL: 973 fprintf(output, "notdecimal "); break; 974 case XML_REGEXP_REALCHAR: 975 fprintf(output, "realchar "); break; 976 case XML_REGEXP_NOTREALCHAR: 977 fprintf(output, "notrealchar "); break; 978 case XML_REGEXP_LETTER: 979 fprintf(output, "LETTER "); break; 980 case XML_REGEXP_LETTER_UPPERCASE: 981 fprintf(output, "LETTER_UPPERCASE "); break; 982 case XML_REGEXP_LETTER_LOWERCASE: 983 fprintf(output, "LETTER_LOWERCASE "); break; 984 case XML_REGEXP_LETTER_TITLECASE: 985 fprintf(output, "LETTER_TITLECASE "); break; 986 case XML_REGEXP_LETTER_MODIFIER: 987 fprintf(output, "LETTER_MODIFIER "); break; 988 case XML_REGEXP_LETTER_OTHERS: 989 fprintf(output, "LETTER_OTHERS "); break; 990 case XML_REGEXP_MARK: 991 fprintf(output, "MARK "); break; 992 case XML_REGEXP_MARK_NONSPACING: 993 fprintf(output, "MARK_NONSPACING "); break; 994 case XML_REGEXP_MARK_SPACECOMBINING: 995 fprintf(output, "MARK_SPACECOMBINING "); break; 996 case XML_REGEXP_MARK_ENCLOSING: 997 fprintf(output, "MARK_ENCLOSING "); break; 998 case XML_REGEXP_NUMBER: 999 fprintf(output, "NUMBER "); break; 1000 case XML_REGEXP_NUMBER_DECIMAL: 1001 fprintf(output, "NUMBER_DECIMAL "); break; 1002 case XML_REGEXP_NUMBER_LETTER: 1003 fprintf(output, "NUMBER_LETTER "); break; 1004 case XML_REGEXP_NUMBER_OTHERS: 1005 fprintf(output, "NUMBER_OTHERS "); break; 1006 case XML_REGEXP_PUNCT: 1007 fprintf(output, "PUNCT "); break; 1008 case XML_REGEXP_PUNCT_CONNECTOR: 1009 fprintf(output, "PUNCT_CONNECTOR "); break; 1010 case XML_REGEXP_PUNCT_DASH: 1011 fprintf(output, "PUNCT_DASH "); break; 1012 case XML_REGEXP_PUNCT_OPEN: 1013 fprintf(output, "PUNCT_OPEN "); break; 1014 case XML_REGEXP_PUNCT_CLOSE: 1015 fprintf(output, "PUNCT_CLOSE "); break; 1016 case XML_REGEXP_PUNCT_INITQUOTE: 1017 fprintf(output, "PUNCT_INITQUOTE "); break; 1018 case XML_REGEXP_PUNCT_FINQUOTE: 1019 fprintf(output, "PUNCT_FINQUOTE "); break; 1020 case XML_REGEXP_PUNCT_OTHERS: 1021 fprintf(output, "PUNCT_OTHERS "); break; 1022 case XML_REGEXP_SEPAR: 1023 fprintf(output, "SEPAR "); break; 1024 case XML_REGEXP_SEPAR_SPACE: 1025 fprintf(output, "SEPAR_SPACE "); break; 1026 case XML_REGEXP_SEPAR_LINE: 1027 fprintf(output, "SEPAR_LINE "); break; 1028 case XML_REGEXP_SEPAR_PARA: 1029 fprintf(output, "SEPAR_PARA "); break; 1030 case XML_REGEXP_SYMBOL: 1031 fprintf(output, "SYMBOL "); break; 1032 case XML_REGEXP_SYMBOL_MATH: 1033 fprintf(output, "SYMBOL_MATH "); break; 1034 case XML_REGEXP_SYMBOL_CURRENCY: 1035 fprintf(output, "SYMBOL_CURRENCY "); break; 1036 case XML_REGEXP_SYMBOL_MODIFIER: 1037 fprintf(output, "SYMBOL_MODIFIER "); break; 1038 case XML_REGEXP_SYMBOL_OTHERS: 1039 fprintf(output, "SYMBOL_OTHERS "); break; 1040 case XML_REGEXP_OTHER: 1041 fprintf(output, "OTHER "); break; 1042 case XML_REGEXP_OTHER_CONTROL: 1043 fprintf(output, "OTHER_CONTROL "); break; 1044 case XML_REGEXP_OTHER_FORMAT: 1045 fprintf(output, "OTHER_FORMAT "); break; 1046 case XML_REGEXP_OTHER_PRIVATE: 1047 fprintf(output, "OTHER_PRIVATE "); break; 1048 case XML_REGEXP_OTHER_NA: 1049 fprintf(output, "OTHER_NA "); break; 1050 case XML_REGEXP_BLOCK_NAME: 1051 fprintf(output, "BLOCK "); break; 1052 } 1053 } 1054 1055 static void 1056 xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) { 1057 switch (type) { 1058 case XML_REGEXP_QUANT_EPSILON: 1059 fprintf(output, "epsilon "); break; 1060 case XML_REGEXP_QUANT_ONCE: 1061 fprintf(output, "once "); break; 1062 case XML_REGEXP_QUANT_OPT: 1063 fprintf(output, "? "); break; 1064 case XML_REGEXP_QUANT_MULT: 1065 fprintf(output, "* "); break; 1066 case XML_REGEXP_QUANT_PLUS: 1067 fprintf(output, "+ "); break; 1068 case XML_REGEXP_QUANT_RANGE: 1069 fprintf(output, "range "); break; 1070 case XML_REGEXP_QUANT_ONCEONLY: 1071 fprintf(output, "onceonly "); break; 1072 case XML_REGEXP_QUANT_ALL: 1073 fprintf(output, "all "); break; 1074 } 1075 } 1076 static void 1077 xmlRegPrintRange(FILE *output, xmlRegRangePtr range) { 1078 fprintf(output, " range: "); 1079 if (range->neg) 1080 fprintf(output, "negative "); 1081 xmlRegPrintAtomType(output, range->type); 1082 fprintf(output, "%c - %c\n", range->start, range->end); 1083 } 1084 1085 static void 1086 xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) { 1087 fprintf(output, " atom: "); 1088 if (atom == NULL) { 1089 fprintf(output, "NULL\n"); 1090 return; 1091 } 1092 if (atom->neg) 1093 fprintf(output, "not "); 1094 xmlRegPrintAtomType(output, atom->type); 1095 xmlRegPrintQuantType(output, atom->quant); 1096 if (atom->quant == XML_REGEXP_QUANT_RANGE) 1097 fprintf(output, "%d-%d ", atom->min, atom->max); 1098 if (atom->type == XML_REGEXP_STRING) 1099 fprintf(output, "'%s' ", (char *) atom->valuep); 1100 if (atom->type == XML_REGEXP_CHARVAL) 1101 fprintf(output, "char %c\n", atom->codepoint); 1102 else if (atom->type == XML_REGEXP_RANGES) { 1103 int i; 1104 fprintf(output, "%d entries\n", atom->nbRanges); 1105 for (i = 0; i < atom->nbRanges;i++) 1106 xmlRegPrintRange(output, atom->ranges[i]); 1107 } else if (atom->type == XML_REGEXP_SUBREG) { 1108 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no); 1109 } else { 1110 fprintf(output, "\n"); 1111 } 1112 } 1113 1114 static void 1115 xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) { 1116 fprintf(output, " trans: "); 1117 if (trans == NULL) { 1118 fprintf(output, "NULL\n"); 1119 return; 1120 } 1121 if (trans->to < 0) { 1122 fprintf(output, "removed\n"); 1123 return; 1124 } 1125 if (trans->nd != 0) { 1126 if (trans->nd == 2) 1127 fprintf(output, "last not determinist, "); 1128 else 1129 fprintf(output, "not determinist, "); 1130 } 1131 if (trans->counter >= 0) { 1132 fprintf(output, "counted %d, ", trans->counter); 1133 } 1134 if (trans->count == REGEXP_ALL_COUNTER) { 1135 fprintf(output, "all transition, "); 1136 } else if (trans->count >= 0) { 1137 fprintf(output, "count based %d, ", trans->count); 1138 } 1139 if (trans->atom == NULL) { 1140 fprintf(output, "epsilon to %d\n", trans->to); 1141 return; 1142 } 1143 if (trans->atom->type == XML_REGEXP_CHARVAL) 1144 fprintf(output, "char %c ", trans->atom->codepoint); 1145 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to); 1146 } 1147 1148 static void 1149 xmlRegPrintState(FILE *output, xmlRegStatePtr state) { 1150 int i; 1151 1152 fprintf(output, " state: "); 1153 if (state == NULL) { 1154 fprintf(output, "NULL\n"); 1155 return; 1156 } 1157 if (state->type == XML_REGEXP_START_STATE) 1158 fprintf(output, "START "); 1159 if (state->type == XML_REGEXP_FINAL_STATE) 1160 fprintf(output, "FINAL "); 1161 1162 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans); 1163 for (i = 0;i < state->nbTrans; i++) { 1164 xmlRegPrintTrans(output, &(state->trans[i])); 1165 } 1166 } 1167 1168 #ifdef DEBUG_REGEXP_GRAPH 1169 static void 1170 xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { 1171 int i; 1172 1173 fprintf(output, " ctxt: "); 1174 if (ctxt == NULL) { 1175 fprintf(output, "NULL\n"); 1176 return; 1177 } 1178 fprintf(output, "'%s' ", ctxt->string); 1179 if (ctxt->error) 1180 fprintf(output, "error "); 1181 if (ctxt->neg) 1182 fprintf(output, "neg "); 1183 fprintf(output, "\n"); 1184 fprintf(output, "%d atoms:\n", ctxt->nbAtoms); 1185 for (i = 0;i < ctxt->nbAtoms; i++) { 1186 fprintf(output, " %02d ", i); 1187 xmlRegPrintAtom(output, ctxt->atoms[i]); 1188 } 1189 if (ctxt->atom != NULL) { 1190 fprintf(output, "current atom:\n"); 1191 xmlRegPrintAtom(output, ctxt->atom); 1192 } 1193 fprintf(output, "%d states:", ctxt->nbStates); 1194 if (ctxt->start != NULL) 1195 fprintf(output, " start: %d", ctxt->start->no); 1196 if (ctxt->end != NULL) 1197 fprintf(output, " end: %d", ctxt->end->no); 1198 fprintf(output, "\n"); 1199 for (i = 0;i < ctxt->nbStates; i++) { 1200 xmlRegPrintState(output, ctxt->states[i]); 1201 } 1202 fprintf(output, "%d counters:\n", ctxt->nbCounters); 1203 for (i = 0;i < ctxt->nbCounters; i++) { 1204 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min, 1205 ctxt->counters[i].max); 1206 } 1207 } 1208 #endif 1209 1210 /************************************************************************ 1211 * * 1212 * Finite Automata structures manipulations * 1213 * * 1214 ************************************************************************/ 1215 1216 static void 1217 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom, 1218 int neg, xmlRegAtomType type, int start, int end, 1219 xmlChar *blockName) { 1220 xmlRegRangePtr range; 1221 1222 if (atom == NULL) { 1223 ERROR("add range: atom is NULL"); 1224 return; 1225 } 1226 if (atom->type != XML_REGEXP_RANGES) { 1227 ERROR("add range: atom is not ranges"); 1228 return; 1229 } 1230 if (atom->maxRanges == 0) { 1231 atom->maxRanges = 4; 1232 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges * 1233 sizeof(xmlRegRangePtr)); 1234 if (atom->ranges == NULL) { 1235 xmlRegexpErrMemory(ctxt, "adding ranges"); 1236 atom->maxRanges = 0; 1237 return; 1238 } 1239 } else if (atom->nbRanges >= atom->maxRanges) { 1240 xmlRegRangePtr *tmp; 1241 atom->maxRanges *= 2; 1242 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges * 1243 sizeof(xmlRegRangePtr)); 1244 if (tmp == NULL) { 1245 xmlRegexpErrMemory(ctxt, "adding ranges"); 1246 atom->maxRanges /= 2; 1247 return; 1248 } 1249 atom->ranges = tmp; 1250 } 1251 range = xmlRegNewRange(ctxt, neg, type, start, end); 1252 if (range == NULL) 1253 return; 1254 range->blockName = blockName; 1255 atom->ranges[atom->nbRanges++] = range; 1256 1257 } 1258 1259 static int 1260 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) { 1261 if (ctxt->maxCounters == 0) { 1262 ctxt->maxCounters = 4; 1263 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters * 1264 sizeof(xmlRegCounter)); 1265 if (ctxt->counters == NULL) { 1266 xmlRegexpErrMemory(ctxt, "allocating counter"); 1267 ctxt->maxCounters = 0; 1268 return(-1); 1269 } 1270 } else if (ctxt->nbCounters >= ctxt->maxCounters) { 1271 xmlRegCounter *tmp; 1272 ctxt->maxCounters *= 2; 1273 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters * 1274 sizeof(xmlRegCounter)); 1275 if (tmp == NULL) { 1276 xmlRegexpErrMemory(ctxt, "allocating counter"); 1277 ctxt->maxCounters /= 2; 1278 return(-1); 1279 } 1280 ctxt->counters = tmp; 1281 } 1282 ctxt->counters[ctxt->nbCounters].min = -1; 1283 ctxt->counters[ctxt->nbCounters].max = -1; 1284 return(ctxt->nbCounters++); 1285 } 1286 1287 static int 1288 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 1289 if (atom == NULL) { 1290 ERROR("atom push: atom is NULL"); 1291 return(-1); 1292 } 1293 if (ctxt->maxAtoms == 0) { 1294 ctxt->maxAtoms = 4; 1295 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms * 1296 sizeof(xmlRegAtomPtr)); 1297 if (ctxt->atoms == NULL) { 1298 xmlRegexpErrMemory(ctxt, "pushing atom"); 1299 ctxt->maxAtoms = 0; 1300 return(-1); 1301 } 1302 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) { 1303 xmlRegAtomPtr *tmp; 1304 ctxt->maxAtoms *= 2; 1305 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms * 1306 sizeof(xmlRegAtomPtr)); 1307 if (tmp == NULL) { 1308 xmlRegexpErrMemory(ctxt, "allocating counter"); 1309 ctxt->maxAtoms /= 2; 1310 return(-1); 1311 } 1312 ctxt->atoms = tmp; 1313 } 1314 atom->no = ctxt->nbAtoms; 1315 ctxt->atoms[ctxt->nbAtoms++] = atom; 1316 return(0); 1317 } 1318 1319 static void 1320 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target, 1321 int from) { 1322 if (target->maxTransTo == 0) { 1323 target->maxTransTo = 8; 1324 target->transTo = (int *) xmlMalloc(target->maxTransTo * 1325 sizeof(int)); 1326 if (target->transTo == NULL) { 1327 xmlRegexpErrMemory(ctxt, "adding transition"); 1328 target->maxTransTo = 0; 1329 return; 1330 } 1331 } else if (target->nbTransTo >= target->maxTransTo) { 1332 int *tmp; 1333 target->maxTransTo *= 2; 1334 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo * 1335 sizeof(int)); 1336 if (tmp == NULL) { 1337 xmlRegexpErrMemory(ctxt, "adding transition"); 1338 target->maxTransTo /= 2; 1339 return; 1340 } 1341 target->transTo = tmp; 1342 } 1343 target->transTo[target->nbTransTo] = from; 1344 target->nbTransTo++; 1345 } 1346 1347 static void 1348 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 1349 xmlRegAtomPtr atom, xmlRegStatePtr target, 1350 int counter, int count) { 1351 1352 int nrtrans; 1353 1354 if (state == NULL) { 1355 ERROR("add state: state is NULL"); 1356 return; 1357 } 1358 if (target == NULL) { 1359 ERROR("add state: target is NULL"); 1360 return; 1361 } 1362 /* 1363 * Other routines follow the philosophy 'When in doubt, add a transition' 1364 * so we check here whether such a transition is already present and, if 1365 * so, silently ignore this request. 1366 */ 1367 1368 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) { 1369 xmlRegTransPtr trans = &(state->trans[nrtrans]); 1370 if ((trans->atom == atom) && 1371 (trans->to == target->no) && 1372 (trans->counter == counter) && 1373 (trans->count == count)) { 1374 #ifdef DEBUG_REGEXP_GRAPH 1375 printf("Ignoring duplicate transition from %d to %d\n", 1376 state->no, target->no); 1377 #endif 1378 return; 1379 } 1380 } 1381 1382 if (state->maxTrans == 0) { 1383 state->maxTrans = 8; 1384 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans * 1385 sizeof(xmlRegTrans)); 1386 if (state->trans == NULL) { 1387 xmlRegexpErrMemory(ctxt, "adding transition"); 1388 state->maxTrans = 0; 1389 return; 1390 } 1391 } else if (state->nbTrans >= state->maxTrans) { 1392 xmlRegTrans *tmp; 1393 state->maxTrans *= 2; 1394 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans * 1395 sizeof(xmlRegTrans)); 1396 if (tmp == NULL) { 1397 xmlRegexpErrMemory(ctxt, "adding transition"); 1398 state->maxTrans /= 2; 1399 return; 1400 } 1401 state->trans = tmp; 1402 } 1403 #ifdef DEBUG_REGEXP_GRAPH 1404 printf("Add trans from %d to %d ", state->no, target->no); 1405 if (count == REGEXP_ALL_COUNTER) 1406 printf("all transition\n"); 1407 else if (count >= 0) 1408 printf("count based %d\n", count); 1409 else if (counter >= 0) 1410 printf("counted %d\n", counter); 1411 else if (atom == NULL) 1412 printf("epsilon transition\n"); 1413 else if (atom != NULL) 1414 xmlRegPrintAtom(stdout, atom); 1415 #endif 1416 1417 state->trans[state->nbTrans].atom = atom; 1418 state->trans[state->nbTrans].to = target->no; 1419 state->trans[state->nbTrans].counter = counter; 1420 state->trans[state->nbTrans].count = count; 1421 state->trans[state->nbTrans].nd = 0; 1422 state->nbTrans++; 1423 xmlRegStateAddTransTo(ctxt, target, state->no); 1424 } 1425 1426 static int 1427 xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { 1428 if (state == NULL) return(-1); 1429 if (ctxt->maxStates == 0) { 1430 ctxt->maxStates = 4; 1431 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates * 1432 sizeof(xmlRegStatePtr)); 1433 if (ctxt->states == NULL) { 1434 xmlRegexpErrMemory(ctxt, "adding state"); 1435 ctxt->maxStates = 0; 1436 return(-1); 1437 } 1438 } else if (ctxt->nbStates >= ctxt->maxStates) { 1439 xmlRegStatePtr *tmp; 1440 ctxt->maxStates *= 2; 1441 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates * 1442 sizeof(xmlRegStatePtr)); 1443 if (tmp == NULL) { 1444 xmlRegexpErrMemory(ctxt, "adding state"); 1445 ctxt->maxStates /= 2; 1446 return(-1); 1447 } 1448 ctxt->states = tmp; 1449 } 1450 state->no = ctxt->nbStates; 1451 ctxt->states[ctxt->nbStates++] = state; 1452 return(0); 1453 } 1454 1455 /** 1456 * xmlFAGenerateAllTransition: 1457 * @ctxt: a regexp parser context 1458 * @from: the from state 1459 * @to: the target state or NULL for building a new one 1460 * @lax: 1461 * 1462 */ 1463 static void 1464 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, 1465 xmlRegStatePtr from, xmlRegStatePtr to, 1466 int lax) { 1467 if (to == NULL) { 1468 to = xmlRegNewState(ctxt); 1469 xmlRegStatePush(ctxt, to); 1470 ctxt->state = to; 1471 } 1472 if (lax) 1473 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER); 1474 else 1475 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER); 1476 } 1477 1478 /** 1479 * xmlFAGenerateEpsilonTransition: 1480 * @ctxt: a regexp parser context 1481 * @from: the from state 1482 * @to: the target state or NULL for building a new one 1483 * 1484 */ 1485 static void 1486 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1487 xmlRegStatePtr from, xmlRegStatePtr to) { 1488 if (to == NULL) { 1489 to = xmlRegNewState(ctxt); 1490 xmlRegStatePush(ctxt, to); 1491 ctxt->state = to; 1492 } 1493 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1); 1494 } 1495 1496 /** 1497 * xmlFAGenerateCountedEpsilonTransition: 1498 * @ctxt: a regexp parser context 1499 * @from: the from state 1500 * @to: the target state or NULL for building a new one 1501 * counter: the counter for that transition 1502 * 1503 */ 1504 static void 1505 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1506 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1507 if (to == NULL) { 1508 to = xmlRegNewState(ctxt); 1509 xmlRegStatePush(ctxt, to); 1510 ctxt->state = to; 1511 } 1512 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1); 1513 } 1514 1515 /** 1516 * xmlFAGenerateCountedTransition: 1517 * @ctxt: a regexp parser context 1518 * @from: the from state 1519 * @to: the target state or NULL for building a new one 1520 * counter: the counter for that transition 1521 * 1522 */ 1523 static void 1524 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, 1525 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1526 if (to == NULL) { 1527 to = xmlRegNewState(ctxt); 1528 xmlRegStatePush(ctxt, to); 1529 ctxt->state = to; 1530 } 1531 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter); 1532 } 1533 1534 /** 1535 * xmlFAGenerateTransitions: 1536 * @ctxt: a regexp parser context 1537 * @from: the from state 1538 * @to: the target state or NULL for building a new one 1539 * @atom: the atom generating the transition 1540 * 1541 * Returns 0 if success and -1 in case of error. 1542 */ 1543 static int 1544 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, 1545 xmlRegStatePtr to, xmlRegAtomPtr atom) { 1546 xmlRegStatePtr end; 1547 1548 if (atom == NULL) { 1549 ERROR("genrate transition: atom == NULL"); 1550 return(-1); 1551 } 1552 if (atom->type == XML_REGEXP_SUBREG) { 1553 /* 1554 * this is a subexpression handling one should not need to 1555 * create a new node except for XML_REGEXP_QUANT_RANGE. 1556 */ 1557 if (xmlRegAtomPush(ctxt, atom) < 0) { 1558 return(-1); 1559 } 1560 if ((to != NULL) && (atom->stop != to) && 1561 (atom->quant != XML_REGEXP_QUANT_RANGE)) { 1562 /* 1563 * Generate an epsilon transition to link to the target 1564 */ 1565 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1566 #ifdef DV 1567 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) && 1568 (atom->quant != XML_REGEXP_QUANT_ONCE)) { 1569 to = xmlRegNewState(ctxt); 1570 xmlRegStatePush(ctxt, to); 1571 ctxt->state = to; 1572 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1573 #endif 1574 } 1575 switch (atom->quant) { 1576 case XML_REGEXP_QUANT_OPT: 1577 atom->quant = XML_REGEXP_QUANT_ONCE; 1578 /* 1579 * transition done to the state after end of atom. 1580 * 1. set transition from atom start to new state 1581 * 2. set transition from atom end to this state. 1582 */ 1583 if (to == NULL) { 1584 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0); 1585 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, 1586 ctxt->state); 1587 } else { 1588 xmlFAGenerateEpsilonTransition(ctxt, atom->start, to); 1589 } 1590 break; 1591 case XML_REGEXP_QUANT_MULT: 1592 atom->quant = XML_REGEXP_QUANT_ONCE; 1593 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1594 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1595 break; 1596 case XML_REGEXP_QUANT_PLUS: 1597 atom->quant = XML_REGEXP_QUANT_ONCE; 1598 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1599 break; 1600 case XML_REGEXP_QUANT_RANGE: { 1601 int counter; 1602 xmlRegStatePtr inter, newstate; 1603 1604 /* 1605 * create the final state now if needed 1606 */ 1607 if (to != NULL) { 1608 newstate = to; 1609 } else { 1610 newstate = xmlRegNewState(ctxt); 1611 xmlRegStatePush(ctxt, newstate); 1612 } 1613 1614 /* 1615 * The principle here is to use counted transition 1616 * to avoid explosion in the number of states in the 1617 * graph. This is clearly more complex but should not 1618 * be exploitable at runtime. 1619 */ 1620 if ((atom->min == 0) && (atom->start0 == NULL)) { 1621 xmlRegAtomPtr copy; 1622 /* 1623 * duplicate a transition based on atom to count next 1624 * occurences after 1. We cannot loop to atom->start 1625 * directly because we need an epsilon transition to 1626 * newstate. 1627 */ 1628 /* ???? For some reason it seems we never reach that 1629 case, I suppose this got optimized out before when 1630 building the automata */ 1631 copy = xmlRegCopyAtom(ctxt, atom); 1632 if (copy == NULL) 1633 return(-1); 1634 copy->quant = XML_REGEXP_QUANT_ONCE; 1635 copy->min = 0; 1636 copy->max = 0; 1637 1638 if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy) 1639 < 0) 1640 return(-1); 1641 inter = ctxt->state; 1642 counter = xmlRegGetCounter(ctxt); 1643 ctxt->counters[counter].min = atom->min - 1; 1644 ctxt->counters[counter].max = atom->max - 1; 1645 /* count the number of times we see it again */ 1646 xmlFAGenerateCountedEpsilonTransition(ctxt, inter, 1647 atom->stop, counter); 1648 /* allow a way out based on the count */ 1649 xmlFAGenerateCountedTransition(ctxt, inter, 1650 newstate, counter); 1651 /* and also allow a direct exit for 0 */ 1652 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 1653 newstate); 1654 } else { 1655 /* 1656 * either we need the atom at least once or there 1657 * is an atom->start0 allowing to easilly plug the 1658 * epsilon transition. 1659 */ 1660 counter = xmlRegGetCounter(ctxt); 1661 ctxt->counters[counter].min = atom->min - 1; 1662 ctxt->counters[counter].max = atom->max - 1; 1663 /* count the number of times we see it again */ 1664 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, 1665 atom->start, counter); 1666 /* allow a way out based on the count */ 1667 xmlFAGenerateCountedTransition(ctxt, atom->stop, 1668 newstate, counter); 1669 /* and if needed allow a direct exit for 0 */ 1670 if (atom->min == 0) 1671 xmlFAGenerateEpsilonTransition(ctxt, atom->start0, 1672 newstate); 1673 1674 } 1675 atom->min = 0; 1676 atom->max = 0; 1677 atom->quant = XML_REGEXP_QUANT_ONCE; 1678 ctxt->state = newstate; 1679 } 1680 default: 1681 break; 1682 } 1683 return(0); 1684 } 1685 if ((atom->min == 0) && (atom->max == 0) && 1686 (atom->quant == XML_REGEXP_QUANT_RANGE)) { 1687 /* 1688 * we can discard the atom and generate an epsilon transition instead 1689 */ 1690 if (to == NULL) { 1691 to = xmlRegNewState(ctxt); 1692 if (to != NULL) 1693 xmlRegStatePush(ctxt, to); 1694 else { 1695 return(-1); 1696 } 1697 } 1698 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1699 ctxt->state = to; 1700 xmlRegFreeAtom(atom); 1701 return(0); 1702 } 1703 if (to == NULL) { 1704 to = xmlRegNewState(ctxt); 1705 if (to != NULL) 1706 xmlRegStatePush(ctxt, to); 1707 else { 1708 return(-1); 1709 } 1710 } 1711 end = to; 1712 if ((atom->quant == XML_REGEXP_QUANT_MULT) || 1713 (atom->quant == XML_REGEXP_QUANT_PLUS)) { 1714 /* 1715 * Do not pollute the target state by adding transitions from 1716 * it as it is likely to be the shared target of multiple branches. 1717 * So isolate with an epsilon transition. 1718 */ 1719 xmlRegStatePtr tmp; 1720 1721 tmp = xmlRegNewState(ctxt); 1722 if (tmp != NULL) 1723 xmlRegStatePush(ctxt, tmp); 1724 else { 1725 return(-1); 1726 } 1727 xmlFAGenerateEpsilonTransition(ctxt, tmp, to); 1728 to = tmp; 1729 } 1730 if (xmlRegAtomPush(ctxt, atom) < 0) { 1731 return(-1); 1732 } 1733 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1); 1734 ctxt->state = end; 1735 switch (atom->quant) { 1736 case XML_REGEXP_QUANT_OPT: 1737 atom->quant = XML_REGEXP_QUANT_ONCE; 1738 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1739 break; 1740 case XML_REGEXP_QUANT_MULT: 1741 atom->quant = XML_REGEXP_QUANT_ONCE; 1742 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1743 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1744 break; 1745 case XML_REGEXP_QUANT_PLUS: 1746 atom->quant = XML_REGEXP_QUANT_ONCE; 1747 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1748 break; 1749 case XML_REGEXP_QUANT_RANGE: 1750 #if DV_test 1751 if (atom->min == 0) { 1752 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1753 } 1754 #endif 1755 break; 1756 default: 1757 break; 1758 } 1759 return(0); 1760 } 1761 1762 /** 1763 * xmlFAReduceEpsilonTransitions: 1764 * @ctxt: a regexp parser context 1765 * @fromnr: the from state 1766 * @tonr: the to state 1767 * @counter: should that transition be associated to a counted 1768 * 1769 */ 1770 static void 1771 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, 1772 int tonr, int counter) { 1773 int transnr; 1774 xmlRegStatePtr from; 1775 xmlRegStatePtr to; 1776 1777 #ifdef DEBUG_REGEXP_GRAPH 1778 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr); 1779 #endif 1780 from = ctxt->states[fromnr]; 1781 if (from == NULL) 1782 return; 1783 to = ctxt->states[tonr]; 1784 if (to == NULL) 1785 return; 1786 if ((to->mark == XML_REGEXP_MARK_START) || 1787 (to->mark == XML_REGEXP_MARK_VISITED)) 1788 return; 1789 1790 to->mark = XML_REGEXP_MARK_VISITED; 1791 if (to->type == XML_REGEXP_FINAL_STATE) { 1792 #ifdef DEBUG_REGEXP_GRAPH 1793 printf("State %d is final, so %d becomes final\n", tonr, fromnr); 1794 #endif 1795 from->type = XML_REGEXP_FINAL_STATE; 1796 } 1797 for (transnr = 0;transnr < to->nbTrans;transnr++) { 1798 if (to->trans[transnr].to < 0) 1799 continue; 1800 if (to->trans[transnr].atom == NULL) { 1801 /* 1802 * Don't remove counted transitions 1803 * Don't loop either 1804 */ 1805 if (to->trans[transnr].to != fromnr) { 1806 if (to->trans[transnr].count >= 0) { 1807 int newto = to->trans[transnr].to; 1808 1809 xmlRegStateAddTrans(ctxt, from, NULL, 1810 ctxt->states[newto], 1811 -1, to->trans[transnr].count); 1812 } else { 1813 #ifdef DEBUG_REGEXP_GRAPH 1814 printf("Found epsilon trans %d from %d to %d\n", 1815 transnr, tonr, to->trans[transnr].to); 1816 #endif 1817 if (to->trans[transnr].counter >= 0) { 1818 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1819 to->trans[transnr].to, 1820 to->trans[transnr].counter); 1821 } else { 1822 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1823 to->trans[transnr].to, 1824 counter); 1825 } 1826 } 1827 } 1828 } else { 1829 int newto = to->trans[transnr].to; 1830 1831 if (to->trans[transnr].counter >= 0) { 1832 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1833 ctxt->states[newto], 1834 to->trans[transnr].counter, -1); 1835 } else { 1836 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1837 ctxt->states[newto], counter, -1); 1838 } 1839 } 1840 } 1841 to->mark = XML_REGEXP_MARK_NORMAL; 1842 } 1843 1844 /** 1845 * xmlFAEliminateSimpleEpsilonTransitions: 1846 * @ctxt: a regexp parser context 1847 * 1848 * Eliminating general epsilon transitions can get costly in the general 1849 * algorithm due to the large amount of generated new transitions and 1850 * associated comparisons. However for simple epsilon transition used just 1851 * to separate building blocks when generating the automata this can be 1852 * reduced to state elimination: 1853 * - if there exists an epsilon from X to Y 1854 * - if there is no other transition from X 1855 * then X and Y are semantically equivalent and X can be eliminated 1856 * If X is the start state then make Y the start state, else replace the 1857 * target of all transitions to X by transitions to Y. 1858 */ 1859 static void 1860 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1861 int statenr, i, j, newto; 1862 xmlRegStatePtr state, tmp; 1863 1864 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1865 state = ctxt->states[statenr]; 1866 if (state == NULL) 1867 continue; 1868 if (state->nbTrans != 1) 1869 continue; 1870 if (state->type == XML_REGEXP_UNREACH_STATE) 1871 continue; 1872 /* is the only transition out a basic transition */ 1873 if ((state->trans[0].atom == NULL) && 1874 (state->trans[0].to >= 0) && 1875 (state->trans[0].to != statenr) && 1876 (state->trans[0].counter < 0) && 1877 (state->trans[0].count < 0)) { 1878 newto = state->trans[0].to; 1879 1880 if (state->type == XML_REGEXP_START_STATE) { 1881 #ifdef DEBUG_REGEXP_GRAPH 1882 printf("Found simple epsilon trans from start %d to %d\n", 1883 statenr, newto); 1884 #endif 1885 } else { 1886 #ifdef DEBUG_REGEXP_GRAPH 1887 printf("Found simple epsilon trans from %d to %d\n", 1888 statenr, newto); 1889 #endif 1890 for (i = 0;i < state->nbTransTo;i++) { 1891 tmp = ctxt->states[state->transTo[i]]; 1892 for (j = 0;j < tmp->nbTrans;j++) { 1893 if (tmp->trans[j].to == statenr) { 1894 #ifdef DEBUG_REGEXP_GRAPH 1895 printf("Changed transition %d on %d to go to %d\n", 1896 j, tmp->no, newto); 1897 #endif 1898 tmp->trans[j].to = -1; 1899 xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom, 1900 ctxt->states[newto], 1901 tmp->trans[j].counter, 1902 tmp->trans[j].count); 1903 } 1904 } 1905 } 1906 if (state->type == XML_REGEXP_FINAL_STATE) 1907 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE; 1908 /* eliminate the transition completely */ 1909 state->nbTrans = 0; 1910 1911 state->type = XML_REGEXP_UNREACH_STATE; 1912 1913 } 1914 1915 } 1916 } 1917 } 1918 /** 1919 * xmlFAEliminateEpsilonTransitions: 1920 * @ctxt: a regexp parser context 1921 * 1922 */ 1923 static void 1924 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1925 int statenr, transnr; 1926 xmlRegStatePtr state; 1927 int has_epsilon; 1928 1929 if (ctxt->states == NULL) return; 1930 1931 /* 1932 * Eliminate simple epsilon transition and the associated unreachable 1933 * states. 1934 */ 1935 xmlFAEliminateSimpleEpsilonTransitions(ctxt); 1936 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1937 state = ctxt->states[statenr]; 1938 if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) { 1939 #ifdef DEBUG_REGEXP_GRAPH 1940 printf("Removed unreachable state %d\n", statenr); 1941 #endif 1942 xmlRegFreeState(state); 1943 ctxt->states[statenr] = NULL; 1944 } 1945 } 1946 1947 has_epsilon = 0; 1948 1949 /* 1950 * Build the completed transitions bypassing the epsilons 1951 * Use a marking algorithm to avoid loops 1952 * Mark sink states too. 1953 * Process from the latests states backward to the start when 1954 * there is long cascading epsilon chains this minimize the 1955 * recursions and transition compares when adding the new ones 1956 */ 1957 for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) { 1958 state = ctxt->states[statenr]; 1959 if (state == NULL) 1960 continue; 1961 if ((state->nbTrans == 0) && 1962 (state->type != XML_REGEXP_FINAL_STATE)) { 1963 state->type = XML_REGEXP_SINK_STATE; 1964 } 1965 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1966 if ((state->trans[transnr].atom == NULL) && 1967 (state->trans[transnr].to >= 0)) { 1968 if (state->trans[transnr].to == statenr) { 1969 state->trans[transnr].to = -1; 1970 #ifdef DEBUG_REGEXP_GRAPH 1971 printf("Removed loopback epsilon trans %d on %d\n", 1972 transnr, statenr); 1973 #endif 1974 } else if (state->trans[transnr].count < 0) { 1975 int newto = state->trans[transnr].to; 1976 1977 #ifdef DEBUG_REGEXP_GRAPH 1978 printf("Found epsilon trans %d from %d to %d\n", 1979 transnr, statenr, newto); 1980 #endif 1981 has_epsilon = 1; 1982 state->trans[transnr].to = -2; 1983 state->mark = XML_REGEXP_MARK_START; 1984 xmlFAReduceEpsilonTransitions(ctxt, statenr, 1985 newto, state->trans[transnr].counter); 1986 state->mark = XML_REGEXP_MARK_NORMAL; 1987 #ifdef DEBUG_REGEXP_GRAPH 1988 } else { 1989 printf("Found counted transition %d on %d\n", 1990 transnr, statenr); 1991 #endif 1992 } 1993 } 1994 } 1995 } 1996 /* 1997 * Eliminate the epsilon transitions 1998 */ 1999 if (has_epsilon) { 2000 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2001 state = ctxt->states[statenr]; 2002 if (state == NULL) 2003 continue; 2004 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2005 xmlRegTransPtr trans = &(state->trans[transnr]); 2006 if ((trans->atom == NULL) && 2007 (trans->count < 0) && 2008 (trans->to >= 0)) { 2009 trans->to = -1; 2010 } 2011 } 2012 } 2013 } 2014 2015 /* 2016 * Use this pass to detect unreachable states too 2017 */ 2018 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2019 state = ctxt->states[statenr]; 2020 if (state != NULL) 2021 state->reached = XML_REGEXP_MARK_NORMAL; 2022 } 2023 state = ctxt->states[0]; 2024 if (state != NULL) 2025 state->reached = XML_REGEXP_MARK_START; 2026 while (state != NULL) { 2027 xmlRegStatePtr target = NULL; 2028 state->reached = XML_REGEXP_MARK_VISITED; 2029 /* 2030 * Mark all states reachable from the current reachable state 2031 */ 2032 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2033 if ((state->trans[transnr].to >= 0) && 2034 ((state->trans[transnr].atom != NULL) || 2035 (state->trans[transnr].count >= 0))) { 2036 int newto = state->trans[transnr].to; 2037 2038 if (ctxt->states[newto] == NULL) 2039 continue; 2040 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) { 2041 ctxt->states[newto]->reached = XML_REGEXP_MARK_START; 2042 target = ctxt->states[newto]; 2043 } 2044 } 2045 } 2046 2047 /* 2048 * find the next accessible state not explored 2049 */ 2050 if (target == NULL) { 2051 for (statenr = 1;statenr < ctxt->nbStates;statenr++) { 2052 state = ctxt->states[statenr]; 2053 if ((state != NULL) && (state->reached == 2054 XML_REGEXP_MARK_START)) { 2055 target = state; 2056 break; 2057 } 2058 } 2059 } 2060 state = target; 2061 } 2062 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2063 state = ctxt->states[statenr]; 2064 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) { 2065 #ifdef DEBUG_REGEXP_GRAPH 2066 printf("Removed unreachable state %d\n", statenr); 2067 #endif 2068 xmlRegFreeState(state); 2069 ctxt->states[statenr] = NULL; 2070 } 2071 } 2072 2073 } 2074 2075 static int 2076 xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) { 2077 int ret = 0; 2078 2079 if ((range1->type == XML_REGEXP_RANGES) || 2080 (range2->type == XML_REGEXP_RANGES) || 2081 (range2->type == XML_REGEXP_SUBREG) || 2082 (range1->type == XML_REGEXP_SUBREG) || 2083 (range1->type == XML_REGEXP_STRING) || 2084 (range2->type == XML_REGEXP_STRING)) 2085 return(-1); 2086 2087 /* put them in order */ 2088 if (range1->type > range2->type) { 2089 xmlRegRangePtr tmp; 2090 2091 tmp = range1; 2092 range1 = range2; 2093 range2 = tmp; 2094 } 2095 if ((range1->type == XML_REGEXP_ANYCHAR) || 2096 (range2->type == XML_REGEXP_ANYCHAR)) { 2097 ret = 1; 2098 } else if ((range1->type == XML_REGEXP_EPSILON) || 2099 (range2->type == XML_REGEXP_EPSILON)) { 2100 return(0); 2101 } else if (range1->type == range2->type) { 2102 if (range1->type != XML_REGEXP_CHARVAL) 2103 ret = 1; 2104 else if ((range1->end < range2->start) || 2105 (range2->end < range1->start)) 2106 ret = 0; 2107 else 2108 ret = 1; 2109 } else if (range1->type == XML_REGEXP_CHARVAL) { 2110 int codepoint; 2111 int neg = 0; 2112 2113 /* 2114 * just check all codepoints in the range for acceptance, 2115 * this is usually way cheaper since done only once at 2116 * compilation than testing over and over at runtime or 2117 * pushing too many states when evaluating. 2118 */ 2119 if (((range1->neg == 0) && (range2->neg != 0)) || 2120 ((range1->neg != 0) && (range2->neg == 0))) 2121 neg = 1; 2122 2123 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) { 2124 ret = xmlRegCheckCharacterRange(range2->type, codepoint, 2125 0, range2->start, range2->end, 2126 range2->blockName); 2127 if (ret < 0) 2128 return(-1); 2129 if (((neg == 1) && (ret == 0)) || 2130 ((neg == 0) && (ret == 1))) 2131 return(1); 2132 } 2133 return(0); 2134 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) || 2135 (range2->type == XML_REGEXP_BLOCK_NAME)) { 2136 if (range1->type == range2->type) { 2137 ret = xmlStrEqual(range1->blockName, range2->blockName); 2138 } else { 2139 /* 2140 * comparing a block range with anything else is way 2141 * too costly, and maintining the table is like too much 2142 * memory too, so let's force the automata to save state 2143 * here. 2144 */ 2145 return(1); 2146 } 2147 } else if ((range1->type < XML_REGEXP_LETTER) || 2148 (range2->type < XML_REGEXP_LETTER)) { 2149 if ((range1->type == XML_REGEXP_ANYSPACE) && 2150 (range2->type == XML_REGEXP_NOTSPACE)) 2151 ret = 0; 2152 else if ((range1->type == XML_REGEXP_INITNAME) && 2153 (range2->type == XML_REGEXP_NOTINITNAME)) 2154 ret = 0; 2155 else if ((range1->type == XML_REGEXP_NAMECHAR) && 2156 (range2->type == XML_REGEXP_NOTNAMECHAR)) 2157 ret = 0; 2158 else if ((range1->type == XML_REGEXP_DECIMAL) && 2159 (range2->type == XML_REGEXP_NOTDECIMAL)) 2160 ret = 0; 2161 else if ((range1->type == XML_REGEXP_REALCHAR) && 2162 (range2->type == XML_REGEXP_NOTREALCHAR)) 2163 ret = 0; 2164 else { 2165 /* same thing to limit complexity */ 2166 return(1); 2167 } 2168 } else { 2169 ret = 0; 2170 /* range1->type < range2->type here */ 2171 switch (range1->type) { 2172 case XML_REGEXP_LETTER: 2173 /* all disjoint except in the subgroups */ 2174 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) || 2175 (range2->type == XML_REGEXP_LETTER_LOWERCASE) || 2176 (range2->type == XML_REGEXP_LETTER_TITLECASE) || 2177 (range2->type == XML_REGEXP_LETTER_MODIFIER) || 2178 (range2->type == XML_REGEXP_LETTER_OTHERS)) 2179 ret = 1; 2180 break; 2181 case XML_REGEXP_MARK: 2182 if ((range2->type == XML_REGEXP_MARK_NONSPACING) || 2183 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) || 2184 (range2->type == XML_REGEXP_MARK_ENCLOSING)) 2185 ret = 1; 2186 break; 2187 case XML_REGEXP_NUMBER: 2188 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) || 2189 (range2->type == XML_REGEXP_NUMBER_LETTER) || 2190 (range2->type == XML_REGEXP_NUMBER_OTHERS)) 2191 ret = 1; 2192 break; 2193 case XML_REGEXP_PUNCT: 2194 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) || 2195 (range2->type == XML_REGEXP_PUNCT_DASH) || 2196 (range2->type == XML_REGEXP_PUNCT_OPEN) || 2197 (range2->type == XML_REGEXP_PUNCT_CLOSE) || 2198 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) || 2199 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) || 2200 (range2->type == XML_REGEXP_PUNCT_OTHERS)) 2201 ret = 1; 2202 break; 2203 case XML_REGEXP_SEPAR: 2204 if ((range2->type == XML_REGEXP_SEPAR_SPACE) || 2205 (range2->type == XML_REGEXP_SEPAR_LINE) || 2206 (range2->type == XML_REGEXP_SEPAR_PARA)) 2207 ret = 1; 2208 break; 2209 case XML_REGEXP_SYMBOL: 2210 if ((range2->type == XML_REGEXP_SYMBOL_MATH) || 2211 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) || 2212 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) || 2213 (range2->type == XML_REGEXP_SYMBOL_OTHERS)) 2214 ret = 1; 2215 break; 2216 case XML_REGEXP_OTHER: 2217 if ((range2->type == XML_REGEXP_OTHER_CONTROL) || 2218 (range2->type == XML_REGEXP_OTHER_FORMAT) || 2219 (range2->type == XML_REGEXP_OTHER_PRIVATE)) 2220 ret = 1; 2221 break; 2222 default: 2223 if ((range2->type >= XML_REGEXP_LETTER) && 2224 (range2->type < XML_REGEXP_BLOCK_NAME)) 2225 ret = 0; 2226 else { 2227 /* safety net ! */ 2228 return(1); 2229 } 2230 } 2231 } 2232 if (((range1->neg == 0) && (range2->neg != 0)) || 2233 ((range1->neg != 0) && (range2->neg == 0))) 2234 ret = !ret; 2235 return(ret); 2236 } 2237 2238 /** 2239 * xmlFACompareAtomTypes: 2240 * @type1: an atom type 2241 * @type2: an atom type 2242 * 2243 * Compares two atoms type to check whether they intersect in some ways, 2244 * this is used by xmlFACompareAtoms only 2245 * 2246 * Returns 1 if they may intersect and 0 otherwise 2247 */ 2248 static int 2249 xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) { 2250 if ((type1 == XML_REGEXP_EPSILON) || 2251 (type1 == XML_REGEXP_CHARVAL) || 2252 (type1 == XML_REGEXP_RANGES) || 2253 (type1 == XML_REGEXP_SUBREG) || 2254 (type1 == XML_REGEXP_STRING) || 2255 (type1 == XML_REGEXP_ANYCHAR)) 2256 return(1); 2257 if ((type2 == XML_REGEXP_EPSILON) || 2258 (type2 == XML_REGEXP_CHARVAL) || 2259 (type2 == XML_REGEXP_RANGES) || 2260 (type2 == XML_REGEXP_SUBREG) || 2261 (type2 == XML_REGEXP_STRING) || 2262 (type2 == XML_REGEXP_ANYCHAR)) 2263 return(1); 2264 2265 if (type1 == type2) return(1); 2266 2267 /* simplify subsequent compares by making sure type1 < type2 */ 2268 if (type1 > type2) { 2269 xmlRegAtomType tmp = type1; 2270 type1 = type2; 2271 type2 = tmp; 2272 } 2273 switch (type1) { 2274 case XML_REGEXP_ANYSPACE: /* \s */ 2275 /* can't be a letter, number, mark, pontuation, symbol */ 2276 if ((type2 == XML_REGEXP_NOTSPACE) || 2277 ((type2 >= XML_REGEXP_LETTER) && 2278 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2279 ((type2 >= XML_REGEXP_NUMBER) && 2280 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2281 ((type2 >= XML_REGEXP_MARK) && 2282 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2283 ((type2 >= XML_REGEXP_PUNCT) && 2284 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2285 ((type2 >= XML_REGEXP_SYMBOL) && 2286 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) 2287 ) return(0); 2288 break; 2289 case XML_REGEXP_NOTSPACE: /* \S */ 2290 break; 2291 case XML_REGEXP_INITNAME: /* \l */ 2292 /* can't be a number, mark, separator, pontuation, symbol or other */ 2293 if ((type2 == XML_REGEXP_NOTINITNAME) || 2294 ((type2 >= XML_REGEXP_NUMBER) && 2295 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2296 ((type2 >= XML_REGEXP_MARK) && 2297 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2298 ((type2 >= XML_REGEXP_SEPAR) && 2299 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2300 ((type2 >= XML_REGEXP_PUNCT) && 2301 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2302 ((type2 >= XML_REGEXP_SYMBOL) && 2303 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2304 ((type2 >= XML_REGEXP_OTHER) && 2305 (type2 <= XML_REGEXP_OTHER_NA)) 2306 ) return(0); 2307 break; 2308 case XML_REGEXP_NOTINITNAME: /* \L */ 2309 break; 2310 case XML_REGEXP_NAMECHAR: /* \c */ 2311 /* can't be a mark, separator, pontuation, symbol or other */ 2312 if ((type2 == XML_REGEXP_NOTNAMECHAR) || 2313 ((type2 >= XML_REGEXP_MARK) && 2314 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2315 ((type2 >= XML_REGEXP_PUNCT) && 2316 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2317 ((type2 >= XML_REGEXP_SEPAR) && 2318 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2319 ((type2 >= XML_REGEXP_SYMBOL) && 2320 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2321 ((type2 >= XML_REGEXP_OTHER) && 2322 (type2 <= XML_REGEXP_OTHER_NA)) 2323 ) return(0); 2324 break; 2325 case XML_REGEXP_NOTNAMECHAR: /* \C */ 2326 break; 2327 case XML_REGEXP_DECIMAL: /* \d */ 2328 /* can't be a letter, mark, separator, pontuation, symbol or other */ 2329 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2330 (type2 == XML_REGEXP_REALCHAR) || 2331 ((type2 >= XML_REGEXP_LETTER) && 2332 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2333 ((type2 >= XML_REGEXP_MARK) && 2334 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2335 ((type2 >= XML_REGEXP_PUNCT) && 2336 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2337 ((type2 >= XML_REGEXP_SEPAR) && 2338 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2339 ((type2 >= XML_REGEXP_SYMBOL) && 2340 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2341 ((type2 >= XML_REGEXP_OTHER) && 2342 (type2 <= XML_REGEXP_OTHER_NA)) 2343 )return(0); 2344 break; 2345 case XML_REGEXP_NOTDECIMAL: /* \D */ 2346 break; 2347 case XML_REGEXP_REALCHAR: /* \w */ 2348 /* can't be a mark, separator, pontuation, symbol or other */ 2349 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2350 ((type2 >= XML_REGEXP_MARK) && 2351 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2352 ((type2 >= XML_REGEXP_PUNCT) && 2353 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2354 ((type2 >= XML_REGEXP_SEPAR) && 2355 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2356 ((type2 >= XML_REGEXP_SYMBOL) && 2357 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2358 ((type2 >= XML_REGEXP_OTHER) && 2359 (type2 <= XML_REGEXP_OTHER_NA)) 2360 )return(0); 2361 break; 2362 case XML_REGEXP_NOTREALCHAR: /* \W */ 2363 break; 2364 /* 2365 * at that point we know both type 1 and type2 are from 2366 * character categories are ordered and are different, 2367 * it becomes simple because this is a partition 2368 */ 2369 case XML_REGEXP_LETTER: 2370 if (type2 <= XML_REGEXP_LETTER_OTHERS) 2371 return(1); 2372 return(0); 2373 case XML_REGEXP_LETTER_UPPERCASE: 2374 case XML_REGEXP_LETTER_LOWERCASE: 2375 case XML_REGEXP_LETTER_TITLECASE: 2376 case XML_REGEXP_LETTER_MODIFIER: 2377 case XML_REGEXP_LETTER_OTHERS: 2378 return(0); 2379 case XML_REGEXP_MARK: 2380 if (type2 <= XML_REGEXP_MARK_ENCLOSING) 2381 return(1); 2382 return(0); 2383 case XML_REGEXP_MARK_NONSPACING: 2384 case XML_REGEXP_MARK_SPACECOMBINING: 2385 case XML_REGEXP_MARK_ENCLOSING: 2386 return(0); 2387 case XML_REGEXP_NUMBER: 2388 if (type2 <= XML_REGEXP_NUMBER_OTHERS) 2389 return(1); 2390 return(0); 2391 case XML_REGEXP_NUMBER_DECIMAL: 2392 case XML_REGEXP_NUMBER_LETTER: 2393 case XML_REGEXP_NUMBER_OTHERS: 2394 return(0); 2395 case XML_REGEXP_PUNCT: 2396 if (type2 <= XML_REGEXP_PUNCT_OTHERS) 2397 return(1); 2398 return(0); 2399 case XML_REGEXP_PUNCT_CONNECTOR: 2400 case XML_REGEXP_PUNCT_DASH: 2401 case XML_REGEXP_PUNCT_OPEN: 2402 case XML_REGEXP_PUNCT_CLOSE: 2403 case XML_REGEXP_PUNCT_INITQUOTE: 2404 case XML_REGEXP_PUNCT_FINQUOTE: 2405 case XML_REGEXP_PUNCT_OTHERS: 2406 return(0); 2407 case XML_REGEXP_SEPAR: 2408 if (type2 <= XML_REGEXP_SEPAR_PARA) 2409 return(1); 2410 return(0); 2411 case XML_REGEXP_SEPAR_SPACE: 2412 case XML_REGEXP_SEPAR_LINE: 2413 case XML_REGEXP_SEPAR_PARA: 2414 return(0); 2415 case XML_REGEXP_SYMBOL: 2416 if (type2 <= XML_REGEXP_SYMBOL_OTHERS) 2417 return(1); 2418 return(0); 2419 case XML_REGEXP_SYMBOL_MATH: 2420 case XML_REGEXP_SYMBOL_CURRENCY: 2421 case XML_REGEXP_SYMBOL_MODIFIER: 2422 case XML_REGEXP_SYMBOL_OTHERS: 2423 return(0); 2424 case XML_REGEXP_OTHER: 2425 if (type2 <= XML_REGEXP_OTHER_NA) 2426 return(1); 2427 return(0); 2428 case XML_REGEXP_OTHER_CONTROL: 2429 case XML_REGEXP_OTHER_FORMAT: 2430 case XML_REGEXP_OTHER_PRIVATE: 2431 case XML_REGEXP_OTHER_NA: 2432 return(0); 2433 default: 2434 break; 2435 } 2436 return(1); 2437 } 2438 2439 /** 2440 * xmlFAEqualAtoms: 2441 * @atom1: an atom 2442 * @atom2: an atom 2443 * @deep: if not set only compare string pointers 2444 * 2445 * Compares two atoms to check whether they are the same exactly 2446 * this is used to remove equivalent transitions 2447 * 2448 * Returns 1 if same and 0 otherwise 2449 */ 2450 static int 2451 xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) { 2452 int ret = 0; 2453 2454 if (atom1 == atom2) 2455 return(1); 2456 if ((atom1 == NULL) || (atom2 == NULL)) 2457 return(0); 2458 2459 if (atom1->type != atom2->type) 2460 return(0); 2461 switch (atom1->type) { 2462 case XML_REGEXP_EPSILON: 2463 ret = 0; 2464 break; 2465 case XML_REGEXP_STRING: 2466 if (!deep) 2467 ret = (atom1->valuep == atom2->valuep); 2468 else 2469 ret = xmlStrEqual((xmlChar *)atom1->valuep, 2470 (xmlChar *)atom2->valuep); 2471 break; 2472 case XML_REGEXP_CHARVAL: 2473 ret = (atom1->codepoint == atom2->codepoint); 2474 break; 2475 case XML_REGEXP_RANGES: 2476 /* too hard to do in the general case */ 2477 ret = 0; 2478 default: 2479 break; 2480 } 2481 return(ret); 2482 } 2483 2484 /** 2485 * xmlFACompareAtoms: 2486 * @atom1: an atom 2487 * @atom2: an atom 2488 * @deep: if not set only compare string pointers 2489 * 2490 * Compares two atoms to check whether they intersect in some ways, 2491 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only 2492 * 2493 * Returns 1 if yes and 0 otherwise 2494 */ 2495 static int 2496 xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) { 2497 int ret = 1; 2498 2499 if (atom1 == atom2) 2500 return(1); 2501 if ((atom1 == NULL) || (atom2 == NULL)) 2502 return(0); 2503 2504 if ((atom1->type == XML_REGEXP_ANYCHAR) || 2505 (atom2->type == XML_REGEXP_ANYCHAR)) 2506 return(1); 2507 2508 if (atom1->type > atom2->type) { 2509 xmlRegAtomPtr tmp; 2510 tmp = atom1; 2511 atom1 = atom2; 2512 atom2 = tmp; 2513 } 2514 if (atom1->type != atom2->type) { 2515 ret = xmlFACompareAtomTypes(atom1->type, atom2->type); 2516 /* if they can't intersect at the type level break now */ 2517 if (ret == 0) 2518 return(0); 2519 } 2520 switch (atom1->type) { 2521 case XML_REGEXP_STRING: 2522 if (!deep) 2523 ret = (atom1->valuep != atom2->valuep); 2524 else 2525 ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep, 2526 (xmlChar *)atom2->valuep); 2527 break; 2528 case XML_REGEXP_EPSILON: 2529 goto not_determinist; 2530 case XML_REGEXP_CHARVAL: 2531 if (atom2->type == XML_REGEXP_CHARVAL) { 2532 ret = (atom1->codepoint == atom2->codepoint); 2533 } else { 2534 ret = xmlRegCheckCharacter(atom2, atom1->codepoint); 2535 if (ret < 0) 2536 ret = 1; 2537 } 2538 break; 2539 case XML_REGEXP_RANGES: 2540 if (atom2->type == XML_REGEXP_RANGES) { 2541 int i, j, res; 2542 xmlRegRangePtr r1, r2; 2543 2544 /* 2545 * need to check that none of the ranges eventually matches 2546 */ 2547 for (i = 0;i < atom1->nbRanges;i++) { 2548 for (j = 0;j < atom2->nbRanges;j++) { 2549 r1 = atom1->ranges[i]; 2550 r2 = atom2->ranges[j]; 2551 res = xmlFACompareRanges(r1, r2); 2552 if (res == 1) { 2553 ret = 1; 2554 goto done; 2555 } 2556 } 2557 } 2558 ret = 0; 2559 } 2560 break; 2561 default: 2562 goto not_determinist; 2563 } 2564 done: 2565 if (atom1->neg != atom2->neg) { 2566 ret = !ret; 2567 } 2568 if (ret == 0) 2569 return(0); 2570 not_determinist: 2571 return(1); 2572 } 2573 2574 /** 2575 * xmlFARecurseDeterminism: 2576 * @ctxt: a regexp parser context 2577 * 2578 * Check whether the associated regexp is determinist, 2579 * should be called after xmlFAEliminateEpsilonTransitions() 2580 * 2581 */ 2582 static int 2583 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 2584 int to, xmlRegAtomPtr atom) { 2585 int ret = 1; 2586 int res; 2587 int transnr, nbTrans; 2588 xmlRegTransPtr t1; 2589 int deep = 1; 2590 2591 if (state == NULL) 2592 return(ret); 2593 if (state->markd == XML_REGEXP_MARK_VISITED) 2594 return(ret); 2595 2596 if (ctxt->flags & AM_AUTOMATA_RNG) 2597 deep = 0; 2598 2599 /* 2600 * don't recurse on transitions potentially added in the course of 2601 * the elimination. 2602 */ 2603 nbTrans = state->nbTrans; 2604 for (transnr = 0;transnr < nbTrans;transnr++) { 2605 t1 = &(state->trans[transnr]); 2606 /* 2607 * check transitions conflicting with the one looked at 2608 */ 2609 if (t1->atom == NULL) { 2610 if (t1->to < 0) 2611 continue; 2612 state->markd = XML_REGEXP_MARK_VISITED; 2613 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2614 to, atom); 2615 state->markd = 0; 2616 if (res == 0) { 2617 ret = 0; 2618 /* t1->nd = 1; */ 2619 } 2620 continue; 2621 } 2622 if (t1->to != to) 2623 continue; 2624 if (xmlFACompareAtoms(t1->atom, atom, deep)) { 2625 ret = 0; 2626 /* mark the transition as non-deterministic */ 2627 t1->nd = 1; 2628 } 2629 } 2630 return(ret); 2631 } 2632 2633 /** 2634 * xmlFAComputesDeterminism: 2635 * @ctxt: a regexp parser context 2636 * 2637 * Check whether the associated regexp is determinist, 2638 * should be called after xmlFAEliminateEpsilonTransitions() 2639 * 2640 */ 2641 static int 2642 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { 2643 int statenr, transnr; 2644 xmlRegStatePtr state; 2645 xmlRegTransPtr t1, t2, last; 2646 int i; 2647 int ret = 1; 2648 int deep = 1; 2649 2650 #ifdef DEBUG_REGEXP_GRAPH 2651 printf("xmlFAComputesDeterminism\n"); 2652 xmlRegPrintCtxt(stdout, ctxt); 2653 #endif 2654 if (ctxt->determinist != -1) 2655 return(ctxt->determinist); 2656 2657 if (ctxt->flags & AM_AUTOMATA_RNG) 2658 deep = 0; 2659 2660 /* 2661 * First cleanup the automata removing cancelled transitions 2662 */ 2663 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2664 state = ctxt->states[statenr]; 2665 if (state == NULL) 2666 continue; 2667 if (state->nbTrans < 2) 2668 continue; 2669 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2670 t1 = &(state->trans[transnr]); 2671 /* 2672 * Determinism checks in case of counted or all transitions 2673 * will have to be handled separately 2674 */ 2675 if (t1->atom == NULL) { 2676 /* t1->nd = 1; */ 2677 continue; 2678 } 2679 if (t1->to == -1) /* eliminated */ 2680 continue; 2681 for (i = 0;i < transnr;i++) { 2682 t2 = &(state->trans[i]); 2683 if (t2->to == -1) /* eliminated */ 2684 continue; 2685 if (t2->atom != NULL) { 2686 if (t1->to == t2->to) { 2687 /* 2688 * Here we use deep because we want to keep the 2689 * transitions which indicate a conflict 2690 */ 2691 if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) && 2692 (t1->counter == t2->counter) && 2693 (t1->count == t2->count)) 2694 t2->to = -1; /* eliminated */ 2695 } 2696 } 2697 } 2698 } 2699 } 2700 2701 /* 2702 * Check for all states that there aren't 2 transitions 2703 * with the same atom and a different target. 2704 */ 2705 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2706 state = ctxt->states[statenr]; 2707 if (state == NULL) 2708 continue; 2709 if (state->nbTrans < 2) 2710 continue; 2711 last = NULL; 2712 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2713 t1 = &(state->trans[transnr]); 2714 /* 2715 * Determinism checks in case of counted or all transitions 2716 * will have to be handled separately 2717 */ 2718 if (t1->atom == NULL) { 2719 continue; 2720 } 2721 if (t1->to == -1) /* eliminated */ 2722 continue; 2723 for (i = 0;i < transnr;i++) { 2724 t2 = &(state->trans[i]); 2725 if (t2->to == -1) /* eliminated */ 2726 continue; 2727 if (t2->atom != NULL) { 2728 /* 2729 * But here we don't use deep because we want to 2730 * find transitions which indicate a conflict 2731 */ 2732 if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) { 2733 ret = 0; 2734 /* mark the transitions as non-deterministic ones */ 2735 t1->nd = 1; 2736 t2->nd = 1; 2737 last = t1; 2738 } 2739 } else if (t1->to != -1) { 2740 /* 2741 * do the closure in case of remaining specific 2742 * epsilon transitions like choices or all 2743 */ 2744 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2745 t2->to, t2->atom); 2746 /* don't shortcut the computation so all non deterministic 2747 transition get marked down 2748 if (ret == 0) 2749 return(0); 2750 */ 2751 if (ret == 0) { 2752 t1->nd = 1; 2753 /* t2->nd = 1; */ 2754 last = t1; 2755 } 2756 } 2757 } 2758 /* don't shortcut the computation so all non deterministic 2759 transition get marked down 2760 if (ret == 0) 2761 break; */ 2762 } 2763 2764 /* 2765 * mark specifically the last non-deterministic transition 2766 * from a state since there is no need to set-up rollback 2767 * from it 2768 */ 2769 if (last != NULL) { 2770 last->nd = 2; 2771 } 2772 2773 /* don't shortcut the computation so all non deterministic 2774 transition get marked down 2775 if (ret == 0) 2776 break; */ 2777 } 2778 2779 ctxt->determinist = ret; 2780 return(ret); 2781 } 2782 2783 /************************************************************************ 2784 * * 2785 * Routines to check input against transition atoms * 2786 * * 2787 ************************************************************************/ 2788 2789 static int 2790 xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg, 2791 int start, int end, const xmlChar *blockName) { 2792 int ret = 0; 2793 2794 switch (type) { 2795 case XML_REGEXP_STRING: 2796 case XML_REGEXP_SUBREG: 2797 case XML_REGEXP_RANGES: 2798 case XML_REGEXP_EPSILON: 2799 return(-1); 2800 case XML_REGEXP_ANYCHAR: 2801 ret = ((codepoint != '\n') && (codepoint != '\r')); 2802 break; 2803 case XML_REGEXP_CHARVAL: 2804 ret = ((codepoint >= start) && (codepoint <= end)); 2805 break; 2806 case XML_REGEXP_NOTSPACE: 2807 neg = !neg; 2808 case XML_REGEXP_ANYSPACE: 2809 ret = ((codepoint == '\n') || (codepoint == '\r') || 2810 (codepoint == '\t') || (codepoint == ' ')); 2811 break; 2812 case XML_REGEXP_NOTINITNAME: 2813 neg = !neg; 2814 case XML_REGEXP_INITNAME: 2815 ret = (IS_LETTER(codepoint) || 2816 (codepoint == '_') || (codepoint == ':')); 2817 break; 2818 case XML_REGEXP_NOTNAMECHAR: 2819 neg = !neg; 2820 case XML_REGEXP_NAMECHAR: 2821 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) || 2822 (codepoint == '.') || (codepoint == '-') || 2823 (codepoint == '_') || (codepoint == ':') || 2824 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint)); 2825 break; 2826 case XML_REGEXP_NOTDECIMAL: 2827 neg = !neg; 2828 case XML_REGEXP_DECIMAL: 2829 ret = xmlUCSIsCatNd(codepoint); 2830 break; 2831 case XML_REGEXP_REALCHAR: 2832 neg = !neg; 2833 case XML_REGEXP_NOTREALCHAR: 2834 ret = xmlUCSIsCatP(codepoint); 2835 if (ret == 0) 2836 ret = xmlUCSIsCatZ(codepoint); 2837 if (ret == 0) 2838 ret = xmlUCSIsCatC(codepoint); 2839 break; 2840 case XML_REGEXP_LETTER: 2841 ret = xmlUCSIsCatL(codepoint); 2842 break; 2843 case XML_REGEXP_LETTER_UPPERCASE: 2844 ret = xmlUCSIsCatLu(codepoint); 2845 break; 2846 case XML_REGEXP_LETTER_LOWERCASE: 2847 ret = xmlUCSIsCatLl(codepoint); 2848 break; 2849 case XML_REGEXP_LETTER_TITLECASE: 2850 ret = xmlUCSIsCatLt(codepoint); 2851 break; 2852 case XML_REGEXP_LETTER_MODIFIER: 2853 ret = xmlUCSIsCatLm(codepoint); 2854 break; 2855 case XML_REGEXP_LETTER_OTHERS: 2856 ret = xmlUCSIsCatLo(codepoint); 2857 break; 2858 case XML_REGEXP_MARK: 2859 ret = xmlUCSIsCatM(codepoint); 2860 break; 2861 case XML_REGEXP_MARK_NONSPACING: 2862 ret = xmlUCSIsCatMn(codepoint); 2863 break; 2864 case XML_REGEXP_MARK_SPACECOMBINING: 2865 ret = xmlUCSIsCatMc(codepoint); 2866 break; 2867 case XML_REGEXP_MARK_ENCLOSING: 2868 ret = xmlUCSIsCatMe(codepoint); 2869 break; 2870 case XML_REGEXP_NUMBER: 2871 ret = xmlUCSIsCatN(codepoint); 2872 break; 2873 case XML_REGEXP_NUMBER_DECIMAL: 2874 ret = xmlUCSIsCatNd(codepoint); 2875 break; 2876 case XML_REGEXP_NUMBER_LETTER: 2877 ret = xmlUCSIsCatNl(codepoint); 2878 break; 2879 case XML_REGEXP_NUMBER_OTHERS: 2880 ret = xmlUCSIsCatNo(codepoint); 2881 break; 2882 case XML_REGEXP_PUNCT: 2883 ret = xmlUCSIsCatP(codepoint); 2884 break; 2885 case XML_REGEXP_PUNCT_CONNECTOR: 2886 ret = xmlUCSIsCatPc(codepoint); 2887 break; 2888 case XML_REGEXP_PUNCT_DASH: 2889 ret = xmlUCSIsCatPd(codepoint); 2890 break; 2891 case XML_REGEXP_PUNCT_OPEN: 2892 ret = xmlUCSIsCatPs(codepoint); 2893 break; 2894 case XML_REGEXP_PUNCT_CLOSE: 2895 ret = xmlUCSIsCatPe(codepoint); 2896 break; 2897 case XML_REGEXP_PUNCT_INITQUOTE: 2898 ret = xmlUCSIsCatPi(codepoint); 2899 break; 2900 case XML_REGEXP_PUNCT_FINQUOTE: 2901 ret = xmlUCSIsCatPf(codepoint); 2902 break; 2903 case XML_REGEXP_PUNCT_OTHERS: 2904 ret = xmlUCSIsCatPo(codepoint); 2905 break; 2906 case XML_REGEXP_SEPAR: 2907 ret = xmlUCSIsCatZ(codepoint); 2908 break; 2909 case XML_REGEXP_SEPAR_SPACE: 2910 ret = xmlUCSIsCatZs(codepoint); 2911 break; 2912 case XML_REGEXP_SEPAR_LINE: 2913 ret = xmlUCSIsCatZl(codepoint); 2914 break; 2915 case XML_REGEXP_SEPAR_PARA: 2916 ret = xmlUCSIsCatZp(codepoint); 2917 break; 2918 case XML_REGEXP_SYMBOL: 2919 ret = xmlUCSIsCatS(codepoint); 2920 break; 2921 case XML_REGEXP_SYMBOL_MATH: 2922 ret = xmlUCSIsCatSm(codepoint); 2923 break; 2924 case XML_REGEXP_SYMBOL_CURRENCY: 2925 ret = xmlUCSIsCatSc(codepoint); 2926 break; 2927 case XML_REGEXP_SYMBOL_MODIFIER: 2928 ret = xmlUCSIsCatSk(codepoint); 2929 break; 2930 case XML_REGEXP_SYMBOL_OTHERS: 2931 ret = xmlUCSIsCatSo(codepoint); 2932 break; 2933 case XML_REGEXP_OTHER: 2934 ret = xmlUCSIsCatC(codepoint); 2935 break; 2936 case XML_REGEXP_OTHER_CONTROL: 2937 ret = xmlUCSIsCatCc(codepoint); 2938 break; 2939 case XML_REGEXP_OTHER_FORMAT: 2940 ret = xmlUCSIsCatCf(codepoint); 2941 break; 2942 case XML_REGEXP_OTHER_PRIVATE: 2943 ret = xmlUCSIsCatCo(codepoint); 2944 break; 2945 case XML_REGEXP_OTHER_NA: 2946 /* ret = xmlUCSIsCatCn(codepoint); */ 2947 /* Seems it doesn't exist anymore in recent Unicode releases */ 2948 ret = 0; 2949 break; 2950 case XML_REGEXP_BLOCK_NAME: 2951 ret = xmlUCSIsBlock(codepoint, (const char *) blockName); 2952 break; 2953 } 2954 if (neg) 2955 return(!ret); 2956 return(ret); 2957 } 2958 2959 static int 2960 xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) { 2961 int i, ret = 0; 2962 xmlRegRangePtr range; 2963 2964 if ((atom == NULL) || (!IS_CHAR(codepoint))) 2965 return(-1); 2966 2967 switch (atom->type) { 2968 case XML_REGEXP_SUBREG: 2969 case XML_REGEXP_EPSILON: 2970 return(-1); 2971 case XML_REGEXP_CHARVAL: 2972 return(codepoint == atom->codepoint); 2973 case XML_REGEXP_RANGES: { 2974 int accept = 0; 2975 2976 for (i = 0;i < atom->nbRanges;i++) { 2977 range = atom->ranges[i]; 2978 if (range->neg == 2) { 2979 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2980 0, range->start, range->end, 2981 range->blockName); 2982 if (ret != 0) 2983 return(0); /* excluded char */ 2984 } else if (range->neg) { 2985 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2986 0, range->start, range->end, 2987 range->blockName); 2988 if (ret == 0) 2989 accept = 1; 2990 else 2991 return(0); 2992 } else { 2993 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2994 0, range->start, range->end, 2995 range->blockName); 2996 if (ret != 0) 2997 accept = 1; /* might still be excluded */ 2998 } 2999 } 3000 return(accept); 3001 } 3002 case XML_REGEXP_STRING: 3003 printf("TODO: XML_REGEXP_STRING\n"); 3004 return(-1); 3005 case XML_REGEXP_ANYCHAR: 3006 case XML_REGEXP_ANYSPACE: 3007 case XML_REGEXP_NOTSPACE: 3008 case XML_REGEXP_INITNAME: 3009 case XML_REGEXP_NOTINITNAME: 3010 case XML_REGEXP_NAMECHAR: 3011 case XML_REGEXP_NOTNAMECHAR: 3012 case XML_REGEXP_DECIMAL: 3013 case XML_REGEXP_NOTDECIMAL: 3014 case XML_REGEXP_REALCHAR: 3015 case XML_REGEXP_NOTREALCHAR: 3016 case XML_REGEXP_LETTER: 3017 case XML_REGEXP_LETTER_UPPERCASE: 3018 case XML_REGEXP_LETTER_LOWERCASE: 3019 case XML_REGEXP_LETTER_TITLECASE: 3020 case XML_REGEXP_LETTER_MODIFIER: 3021 case XML_REGEXP_LETTER_OTHERS: 3022 case XML_REGEXP_MARK: 3023 case XML_REGEXP_MARK_NONSPACING: 3024 case XML_REGEXP_MARK_SPACECOMBINING: 3025 case XML_REGEXP_MARK_ENCLOSING: 3026 case XML_REGEXP_NUMBER: 3027 case XML_REGEXP_NUMBER_DECIMAL: 3028 case XML_REGEXP_NUMBER_LETTER: 3029 case XML_REGEXP_NUMBER_OTHERS: 3030 case XML_REGEXP_PUNCT: 3031 case XML_REGEXP_PUNCT_CONNECTOR: 3032 case XML_REGEXP_PUNCT_DASH: 3033 case XML_REGEXP_PUNCT_OPEN: 3034 case XML_REGEXP_PUNCT_CLOSE: 3035 case XML_REGEXP_PUNCT_INITQUOTE: 3036 case XML_REGEXP_PUNCT_FINQUOTE: 3037 case XML_REGEXP_PUNCT_OTHERS: 3038 case XML_REGEXP_SEPAR: 3039 case XML_REGEXP_SEPAR_SPACE: 3040 case XML_REGEXP_SEPAR_LINE: 3041 case XML_REGEXP_SEPAR_PARA: 3042 case XML_REGEXP_SYMBOL: 3043 case XML_REGEXP_SYMBOL_MATH: 3044 case XML_REGEXP_SYMBOL_CURRENCY: 3045 case XML_REGEXP_SYMBOL_MODIFIER: 3046 case XML_REGEXP_SYMBOL_OTHERS: 3047 case XML_REGEXP_OTHER: 3048 case XML_REGEXP_OTHER_CONTROL: 3049 case XML_REGEXP_OTHER_FORMAT: 3050 case XML_REGEXP_OTHER_PRIVATE: 3051 case XML_REGEXP_OTHER_NA: 3052 case XML_REGEXP_BLOCK_NAME: 3053 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0, 3054 (const xmlChar *)atom->valuep); 3055 if (atom->neg) 3056 ret = !ret; 3057 break; 3058 } 3059 return(ret); 3060 } 3061 3062 /************************************************************************ 3063 * * 3064 * Saving and restoring state of an execution context * 3065 * * 3066 ************************************************************************/ 3067 3068 #ifdef DEBUG_REGEXP_EXEC 3069 static void 3070 xmlFARegDebugExec(xmlRegExecCtxtPtr exec) { 3071 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index); 3072 if (exec->inputStack != NULL) { 3073 int i; 3074 printf(": "); 3075 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) 3076 printf("%s ", (const char *) 3077 exec->inputStack[exec->inputStackNr - (i + 1)].value); 3078 } else { 3079 printf(": %s", &(exec->inputString[exec->index])); 3080 } 3081 printf("\n"); 3082 } 3083 #endif 3084 3085 static void 3086 xmlFARegExecSave(xmlRegExecCtxtPtr exec) { 3087 #ifdef DEBUG_REGEXP_EXEC 3088 printf("saving "); 3089 exec->transno++; 3090 xmlFARegDebugExec(exec); 3091 exec->transno--; 3092 #endif 3093 #ifdef MAX_PUSH 3094 if (exec->nbPush > MAX_PUSH) { 3095 return; 3096 } 3097 exec->nbPush++; 3098 #endif 3099 3100 if (exec->maxRollbacks == 0) { 3101 exec->maxRollbacks = 4; 3102 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks * 3103 sizeof(xmlRegExecRollback)); 3104 if (exec->rollbacks == NULL) { 3105 xmlRegexpErrMemory(NULL, "saving regexp"); 3106 exec->maxRollbacks = 0; 3107 return; 3108 } 3109 memset(exec->rollbacks, 0, 3110 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 3111 } else if (exec->nbRollbacks >= exec->maxRollbacks) { 3112 xmlRegExecRollback *tmp; 3113 int len = exec->maxRollbacks; 3114 3115 exec->maxRollbacks *= 2; 3116 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks, 3117 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 3118 if (tmp == NULL) { 3119 xmlRegexpErrMemory(NULL, "saving regexp"); 3120 exec->maxRollbacks /= 2; 3121 return; 3122 } 3123 exec->rollbacks = tmp; 3124 tmp = &exec->rollbacks[len]; 3125 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback)); 3126 } 3127 exec->rollbacks[exec->nbRollbacks].state = exec->state; 3128 exec->rollbacks[exec->nbRollbacks].index = exec->index; 3129 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1; 3130 if (exec->comp->nbCounters > 0) { 3131 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3132 exec->rollbacks[exec->nbRollbacks].counts = (int *) 3133 xmlMalloc(exec->comp->nbCounters * sizeof(int)); 3134 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3135 xmlRegexpErrMemory(NULL, "saving regexp"); 3136 exec->status = -5; 3137 return; 3138 } 3139 } 3140 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts, 3141 exec->comp->nbCounters * sizeof(int)); 3142 } 3143 exec->nbRollbacks++; 3144 } 3145 3146 static void 3147 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) { 3148 if (exec->nbRollbacks <= 0) { 3149 exec->status = -1; 3150 #ifdef DEBUG_REGEXP_EXEC 3151 printf("rollback failed on empty stack\n"); 3152 #endif 3153 return; 3154 } 3155 exec->nbRollbacks--; 3156 exec->state = exec->rollbacks[exec->nbRollbacks].state; 3157 exec->index = exec->rollbacks[exec->nbRollbacks].index; 3158 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch; 3159 if (exec->comp->nbCounters > 0) { 3160 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3161 fprintf(stderr, "exec save: allocation failed"); 3162 exec->status = -6; 3163 return; 3164 } 3165 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts, 3166 exec->comp->nbCounters * sizeof(int)); 3167 } 3168 3169 #ifdef DEBUG_REGEXP_EXEC 3170 printf("restored "); 3171 xmlFARegDebugExec(exec); 3172 #endif 3173 } 3174 3175 /************************************************************************ 3176 * * 3177 * Verifier, running an input against a compiled regexp * 3178 * * 3179 ************************************************************************/ 3180 3181 static int 3182 xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { 3183 xmlRegExecCtxt execval; 3184 xmlRegExecCtxtPtr exec = &execval; 3185 int ret, codepoint = 0, len, deter; 3186 3187 exec->inputString = content; 3188 exec->index = 0; 3189 exec->nbPush = 0; 3190 exec->determinist = 1; 3191 exec->maxRollbacks = 0; 3192 exec->nbRollbacks = 0; 3193 exec->rollbacks = NULL; 3194 exec->status = 0; 3195 exec->comp = comp; 3196 exec->state = comp->states[0]; 3197 exec->transno = 0; 3198 exec->transcount = 0; 3199 exec->inputStack = NULL; 3200 exec->inputStackMax = 0; 3201 if (comp->nbCounters > 0) { 3202 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 3203 if (exec->counts == NULL) { 3204 xmlRegexpErrMemory(NULL, "running regexp"); 3205 return(-1); 3206 } 3207 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 3208 } else 3209 exec->counts = NULL; 3210 while ((exec->status == 0) && (exec->state != NULL) && 3211 ((exec->inputString[exec->index] != 0) || 3212 ((exec->state != NULL) && 3213 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3214 xmlRegTransPtr trans; 3215 xmlRegAtomPtr atom; 3216 3217 /* 3218 * If end of input on non-terminal state, rollback, however we may 3219 * still have epsilon like transition for counted transitions 3220 * on counters, in that case don't break too early. Additionally, 3221 * if we are working on a range like "AB{0,2}", where B is not present, 3222 * we don't want to break. 3223 */ 3224 len = 1; 3225 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) { 3226 /* 3227 * if there is a transition, we must check if 3228 * atom allows minOccurs of 0 3229 */ 3230 if (exec->transno < exec->state->nbTrans) { 3231 trans = &exec->state->trans[exec->transno]; 3232 if (trans->to >=0) { 3233 atom = trans->atom; 3234 if (!((atom->min == 0) && (atom->max > 0))) 3235 goto rollback; 3236 } 3237 } else 3238 goto rollback; 3239 } 3240 3241 exec->transcount = 0; 3242 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3243 trans = &exec->state->trans[exec->transno]; 3244 if (trans->to < 0) 3245 continue; 3246 atom = trans->atom; 3247 ret = 0; 3248 deter = 1; 3249 if (trans->count >= 0) { 3250 int count; 3251 xmlRegCounterPtr counter; 3252 3253 if (exec->counts == NULL) { 3254 exec->status = -1; 3255 goto error; 3256 } 3257 /* 3258 * A counted transition. 3259 */ 3260 3261 count = exec->counts[trans->count]; 3262 counter = &exec->comp->counters[trans->count]; 3263 #ifdef DEBUG_REGEXP_EXEC 3264 printf("testing count %d: val %d, min %d, max %d\n", 3265 trans->count, count, counter->min, counter->max); 3266 #endif 3267 ret = ((count >= counter->min) && (count <= counter->max)); 3268 if ((ret) && (counter->min != counter->max)) 3269 deter = 0; 3270 } else if (atom == NULL) { 3271 fprintf(stderr, "epsilon transition left at runtime\n"); 3272 exec->status = -2; 3273 break; 3274 } else if (exec->inputString[exec->index] != 0) { 3275 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 3276 ret = xmlRegCheckCharacter(atom, codepoint); 3277 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) { 3278 xmlRegStatePtr to = comp->states[trans->to]; 3279 3280 /* 3281 * this is a multiple input sequence 3282 * If there is a counter associated increment it now. 3283 * before potentially saving and rollback 3284 * do not increment if the counter is already over the 3285 * maximum limit in which case get to next transition 3286 */ 3287 if (trans->counter >= 0) { 3288 xmlRegCounterPtr counter; 3289 3290 if ((exec->counts == NULL) || 3291 (exec->comp == NULL) || 3292 (exec->comp->counters == NULL)) { 3293 exec->status = -1; 3294 goto error; 3295 } 3296 counter = &exec->comp->counters[trans->counter]; 3297 if (exec->counts[trans->counter] >= counter->max) 3298 continue; /* for loop on transitions */ 3299 3300 #ifdef DEBUG_REGEXP_EXEC 3301 printf("Increasing count %d\n", trans->counter); 3302 #endif 3303 exec->counts[trans->counter]++; 3304 } 3305 if (exec->state->nbTrans > exec->transno + 1) { 3306 xmlFARegExecSave(exec); 3307 } 3308 exec->transcount = 1; 3309 do { 3310 /* 3311 * Try to progress as much as possible on the input 3312 */ 3313 if (exec->transcount == atom->max) { 3314 break; 3315 } 3316 exec->index += len; 3317 /* 3318 * End of input: stop here 3319 */ 3320 if (exec->inputString[exec->index] == 0) { 3321 exec->index -= len; 3322 break; 3323 } 3324 if (exec->transcount >= atom->min) { 3325 int transno = exec->transno; 3326 xmlRegStatePtr state = exec->state; 3327 3328 /* 3329 * The transition is acceptable save it 3330 */ 3331 exec->transno = -1; /* trick */ 3332 exec->state = to; 3333 xmlFARegExecSave(exec); 3334 exec->transno = transno; 3335 exec->state = state; 3336 } 3337 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 3338 len); 3339 ret = xmlRegCheckCharacter(atom, codepoint); 3340 exec->transcount++; 3341 } while (ret == 1); 3342 if (exec->transcount < atom->min) 3343 ret = 0; 3344 3345 /* 3346 * If the last check failed but one transition was found 3347 * possible, rollback 3348 */ 3349 if (ret < 0) 3350 ret = 0; 3351 if (ret == 0) { 3352 goto rollback; 3353 } 3354 if (trans->counter >= 0) { 3355 if (exec->counts == NULL) { 3356 exec->status = -1; 3357 goto error; 3358 } 3359 #ifdef DEBUG_REGEXP_EXEC 3360 printf("Decreasing count %d\n", trans->counter); 3361 #endif 3362 exec->counts[trans->counter]--; 3363 } 3364 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) { 3365 /* 3366 * we don't match on the codepoint, but minOccurs of 0 3367 * says that's ok. Setting len to 0 inhibits stepping 3368 * over the codepoint. 3369 */ 3370 exec->transcount = 1; 3371 len = 0; 3372 ret = 1; 3373 } 3374 } else if ((atom->min == 0) && (atom->max > 0)) { 3375 /* another spot to match when minOccurs is 0 */ 3376 exec->transcount = 1; 3377 len = 0; 3378 ret = 1; 3379 } 3380 if (ret == 1) { 3381 if ((trans->nd == 1) || 3382 ((trans->count >= 0) && (deter == 0) && 3383 (exec->state->nbTrans > exec->transno + 1))) { 3384 #ifdef DEBUG_REGEXP_EXEC 3385 if (trans->nd == 1) 3386 printf("Saving on nd transition atom %d for %c at %d\n", 3387 trans->atom->no, codepoint, exec->index); 3388 else 3389 printf("Saving on counted transition count %d for %c at %d\n", 3390 trans->count, codepoint, exec->index); 3391 #endif 3392 xmlFARegExecSave(exec); 3393 } 3394 if (trans->counter >= 0) { 3395 xmlRegCounterPtr counter; 3396 3397 /* make sure we don't go over the counter maximum value */ 3398 if ((exec->counts == NULL) || 3399 (exec->comp == NULL) || 3400 (exec->comp->counters == NULL)) { 3401 exec->status = -1; 3402 goto error; 3403 } 3404 counter = &exec->comp->counters[trans->counter]; 3405 if (exec->counts[trans->counter] >= counter->max) 3406 continue; /* for loop on transitions */ 3407 #ifdef DEBUG_REGEXP_EXEC 3408 printf("Increasing count %d\n", trans->counter); 3409 #endif 3410 exec->counts[trans->counter]++; 3411 } 3412 if ((trans->count >= 0) && 3413 (trans->count < REGEXP_ALL_COUNTER)) { 3414 if (exec->counts == NULL) { 3415 exec->status = -1; 3416 goto error; 3417 } 3418 #ifdef DEBUG_REGEXP_EXEC 3419 printf("resetting count %d on transition\n", 3420 trans->count); 3421 #endif 3422 exec->counts[trans->count] = 0; 3423 } 3424 #ifdef DEBUG_REGEXP_EXEC 3425 printf("entering state %d\n", trans->to); 3426 #endif 3427 exec->state = comp->states[trans->to]; 3428 exec->transno = 0; 3429 if (trans->atom != NULL) { 3430 exec->index += len; 3431 } 3432 goto progress; 3433 } else if (ret < 0) { 3434 exec->status = -4; 3435 break; 3436 } 3437 } 3438 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 3439 rollback: 3440 /* 3441 * Failed to find a way out 3442 */ 3443 exec->determinist = 0; 3444 #ifdef DEBUG_REGEXP_EXEC 3445 printf("rollback from state %d on %d:%c\n", exec->state->no, 3446 codepoint,codepoint); 3447 #endif 3448 xmlFARegExecRollBack(exec); 3449 } 3450 progress: 3451 continue; 3452 } 3453 error: 3454 if (exec->rollbacks != NULL) { 3455 if (exec->counts != NULL) { 3456 int i; 3457 3458 for (i = 0;i < exec->maxRollbacks;i++) 3459 if (exec->rollbacks[i].counts != NULL) 3460 xmlFree(exec->rollbacks[i].counts); 3461 } 3462 xmlFree(exec->rollbacks); 3463 } 3464 if (exec->state == NULL) 3465 return(-1); 3466 if (exec->counts != NULL) 3467 xmlFree(exec->counts); 3468 if (exec->status == 0) 3469 return(1); 3470 if (exec->status == -1) { 3471 if (exec->nbPush > MAX_PUSH) 3472 return(-1); 3473 return(0); 3474 } 3475 return(exec->status); 3476 } 3477 3478 /************************************************************************ 3479 * * 3480 * Progressive interface to the verifier one atom at a time * 3481 * * 3482 ************************************************************************/ 3483 #ifdef DEBUG_ERR 3484 static void testerr(xmlRegExecCtxtPtr exec); 3485 #endif 3486 3487 /** 3488 * xmlRegNewExecCtxt: 3489 * @comp: a precompiled regular expression 3490 * @callback: a callback function used for handling progresses in the 3491 * automata matching phase 3492 * @data: the context data associated to the callback in this context 3493 * 3494 * Build a context used for progressive evaluation of a regexp. 3495 * 3496 * Returns the new context 3497 */ 3498 xmlRegExecCtxtPtr 3499 xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) { 3500 xmlRegExecCtxtPtr exec; 3501 3502 if (comp == NULL) 3503 return(NULL); 3504 if ((comp->compact == NULL) && (comp->states == NULL)) 3505 return(NULL); 3506 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt)); 3507 if (exec == NULL) { 3508 xmlRegexpErrMemory(NULL, "creating execution context"); 3509 return(NULL); 3510 } 3511 memset(exec, 0, sizeof(xmlRegExecCtxt)); 3512 exec->inputString = NULL; 3513 exec->index = 0; 3514 exec->determinist = 1; 3515 exec->maxRollbacks = 0; 3516 exec->nbRollbacks = 0; 3517 exec->rollbacks = NULL; 3518 exec->status = 0; 3519 exec->comp = comp; 3520 if (comp->compact == NULL) 3521 exec->state = comp->states[0]; 3522 exec->transno = 0; 3523 exec->transcount = 0; 3524 exec->callback = callback; 3525 exec->data = data; 3526 if (comp->nbCounters > 0) { 3527 /* 3528 * For error handling, exec->counts is allocated twice the size 3529 * the second half is used to store the data in case of rollback 3530 */ 3531 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int) 3532 * 2); 3533 if (exec->counts == NULL) { 3534 xmlRegexpErrMemory(NULL, "creating execution context"); 3535 xmlFree(exec); 3536 return(NULL); 3537 } 3538 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2); 3539 exec->errCounts = &exec->counts[comp->nbCounters]; 3540 } else { 3541 exec->counts = NULL; 3542 exec->errCounts = NULL; 3543 } 3544 exec->inputStackMax = 0; 3545 exec->inputStackNr = 0; 3546 exec->inputStack = NULL; 3547 exec->errStateNo = -1; 3548 exec->errString = NULL; 3549 exec->nbPush = 0; 3550 return(exec); 3551 } 3552 3553 /** 3554 * xmlRegFreeExecCtxt: 3555 * @exec: a regular expression evaulation context 3556 * 3557 * Free the structures associated to a regular expression evaulation context. 3558 */ 3559 void 3560 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) { 3561 if (exec == NULL) 3562 return; 3563 3564 if (exec->rollbacks != NULL) { 3565 if (exec->counts != NULL) { 3566 int i; 3567 3568 for (i = 0;i < exec->maxRollbacks;i++) 3569 if (exec->rollbacks[i].counts != NULL) 3570 xmlFree(exec->rollbacks[i].counts); 3571 } 3572 xmlFree(exec->rollbacks); 3573 } 3574 if (exec->counts != NULL) 3575 xmlFree(exec->counts); 3576 if (exec->inputStack != NULL) { 3577 int i; 3578 3579 for (i = 0;i < exec->inputStackNr;i++) { 3580 if (exec->inputStack[i].value != NULL) 3581 xmlFree(exec->inputStack[i].value); 3582 } 3583 xmlFree(exec->inputStack); 3584 } 3585 if (exec->errString != NULL) 3586 xmlFree(exec->errString); 3587 xmlFree(exec); 3588 } 3589 3590 static void 3591 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, 3592 void *data) { 3593 #ifdef DEBUG_PUSH 3594 printf("saving value: %d:%s\n", exec->inputStackNr, value); 3595 #endif 3596 if (exec->inputStackMax == 0) { 3597 exec->inputStackMax = 4; 3598 exec->inputStack = (xmlRegInputTokenPtr) 3599 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken)); 3600 if (exec->inputStack == NULL) { 3601 xmlRegexpErrMemory(NULL, "pushing input string"); 3602 exec->inputStackMax = 0; 3603 return; 3604 } 3605 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) { 3606 xmlRegInputTokenPtr tmp; 3607 3608 exec->inputStackMax *= 2; 3609 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack, 3610 exec->inputStackMax * sizeof(xmlRegInputToken)); 3611 if (tmp == NULL) { 3612 xmlRegexpErrMemory(NULL, "pushing input string"); 3613 exec->inputStackMax /= 2; 3614 return; 3615 } 3616 exec->inputStack = tmp; 3617 } 3618 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value); 3619 exec->inputStack[exec->inputStackNr].data = data; 3620 exec->inputStackNr++; 3621 exec->inputStack[exec->inputStackNr].value = NULL; 3622 exec->inputStack[exec->inputStackNr].data = NULL; 3623 } 3624 3625 /** 3626 * xmlRegStrEqualWildcard: 3627 * @expStr: the string to be evaluated 3628 * @valStr: the validation string 3629 * 3630 * Checks if both strings are equal or have the same content. "*" 3631 * can be used as a wildcard in @valStr; "|" is used as a seperator of 3632 * substrings in both @expStr and @valStr. 3633 * 3634 * Returns 1 if the comparison is satisfied and the number of substrings 3635 * is equal, 0 otherwise. 3636 */ 3637 3638 static int 3639 xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) { 3640 if (expStr == valStr) return(1); 3641 if (expStr == NULL) return(0); 3642 if (valStr == NULL) return(0); 3643 do { 3644 /* 3645 * Eval if we have a wildcard for the current item. 3646 */ 3647 if (*expStr != *valStr) { 3648 /* if one of them starts with a wildcard make valStr be it */ 3649 if (*valStr == '*') { 3650 const xmlChar *tmp; 3651 3652 tmp = valStr; 3653 valStr = expStr; 3654 expStr = tmp; 3655 } 3656 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) { 3657 do { 3658 if (*valStr == XML_REG_STRING_SEPARATOR) 3659 break; 3660 valStr++; 3661 } while (*valStr != 0); 3662 continue; 3663 } else 3664 return(0); 3665 } 3666 expStr++; 3667 valStr++; 3668 } while (*valStr != 0); 3669 if (*expStr != 0) 3670 return (0); 3671 else 3672 return (1); 3673 } 3674 3675 /** 3676 * xmlRegCompactPushString: 3677 * @exec: a regexp execution context 3678 * @comp: the precompiled exec with a compact table 3679 * @value: a string token input 3680 * @data: data associated to the token to reuse in callbacks 3681 * 3682 * Push one input token in the execution context 3683 * 3684 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3685 * a negative value in case of error. 3686 */ 3687 static int 3688 xmlRegCompactPushString(xmlRegExecCtxtPtr exec, 3689 xmlRegexpPtr comp, 3690 const xmlChar *value, 3691 void *data) { 3692 int state = exec->index; 3693 int i, target; 3694 3695 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL)) 3696 return(-1); 3697 3698 if (value == NULL) { 3699 /* 3700 * are we at a final state ? 3701 */ 3702 if (comp->compact[state * (comp->nbstrings + 1)] == 3703 XML_REGEXP_FINAL_STATE) 3704 return(1); 3705 return(0); 3706 } 3707 3708 #ifdef DEBUG_PUSH 3709 printf("value pushed: %s\n", value); 3710 #endif 3711 3712 /* 3713 * Examine all outside transitions from current state 3714 */ 3715 for (i = 0;i < comp->nbstrings;i++) { 3716 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 3717 if ((target > 0) && (target <= comp->nbstates)) { 3718 target--; /* to avoid 0 */ 3719 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) { 3720 exec->index = target; 3721 if ((exec->callback != NULL) && (comp->transdata != NULL)) { 3722 exec->callback(exec->data, value, 3723 comp->transdata[state * comp->nbstrings + i], data); 3724 } 3725 #ifdef DEBUG_PUSH 3726 printf("entering state %d\n", target); 3727 #endif 3728 if (comp->compact[target * (comp->nbstrings + 1)] == 3729 XML_REGEXP_SINK_STATE) 3730 goto error; 3731 3732 if (comp->compact[target * (comp->nbstrings + 1)] == 3733 XML_REGEXP_FINAL_STATE) 3734 return(1); 3735 return(0); 3736 } 3737 } 3738 } 3739 /* 3740 * Failed to find an exit transition out from current state for the 3741 * current token 3742 */ 3743 #ifdef DEBUG_PUSH 3744 printf("failed to find a transition for %s on state %d\n", value, state); 3745 #endif 3746 error: 3747 if (exec->errString != NULL) 3748 xmlFree(exec->errString); 3749 exec->errString = xmlStrdup(value); 3750 exec->errStateNo = state; 3751 exec->status = -1; 3752 #ifdef DEBUG_ERR 3753 testerr(exec); 3754 #endif 3755 return(-1); 3756 } 3757 3758 /** 3759 * xmlRegExecPushStringInternal: 3760 * @exec: a regexp execution context or NULL to indicate the end 3761 * @value: a string token input 3762 * @data: data associated to the token to reuse in callbacks 3763 * @compound: value was assembled from 2 strings 3764 * 3765 * Push one input token in the execution context 3766 * 3767 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3768 * a negative value in case of error. 3769 */ 3770 static int 3771 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value, 3772 void *data, int compound) { 3773 xmlRegTransPtr trans; 3774 xmlRegAtomPtr atom; 3775 int ret; 3776 int final = 0; 3777 int progress = 1; 3778 3779 if (exec == NULL) 3780 return(-1); 3781 if (exec->comp == NULL) 3782 return(-1); 3783 if (exec->status != 0) 3784 return(exec->status); 3785 3786 if (exec->comp->compact != NULL) 3787 return(xmlRegCompactPushString(exec, exec->comp, value, data)); 3788 3789 if (value == NULL) { 3790 if (exec->state->type == XML_REGEXP_FINAL_STATE) 3791 return(1); 3792 final = 1; 3793 } 3794 3795 #ifdef DEBUG_PUSH 3796 printf("value pushed: %s\n", value); 3797 #endif 3798 /* 3799 * If we have an active rollback stack push the new value there 3800 * and get back to where we were left 3801 */ 3802 if ((value != NULL) && (exec->inputStackNr > 0)) { 3803 xmlFARegExecSaveInputString(exec, value, data); 3804 value = exec->inputStack[exec->index].value; 3805 data = exec->inputStack[exec->index].data; 3806 #ifdef DEBUG_PUSH 3807 printf("value loaded: %s\n", value); 3808 #endif 3809 } 3810 3811 while ((exec->status == 0) && 3812 ((value != NULL) || 3813 ((final == 1) && 3814 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3815 3816 /* 3817 * End of input on non-terminal state, rollback, however we may 3818 * still have epsilon like transition for counted transitions 3819 * on counters, in that case don't break too early. 3820 */ 3821 if ((value == NULL) && (exec->counts == NULL)) 3822 goto rollback; 3823 3824 exec->transcount = 0; 3825 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3826 trans = &exec->state->trans[exec->transno]; 3827 if (trans->to < 0) 3828 continue; 3829 atom = trans->atom; 3830 ret = 0; 3831 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 3832 int i; 3833 int count; 3834 xmlRegTransPtr t; 3835 xmlRegCounterPtr counter; 3836 3837 ret = 0; 3838 3839 #ifdef DEBUG_PUSH 3840 printf("testing all lax %d\n", trans->count); 3841 #endif 3842 /* 3843 * Check all counted transitions from the current state 3844 */ 3845 if ((value == NULL) && (final)) { 3846 ret = 1; 3847 } else if (value != NULL) { 3848 for (i = 0;i < exec->state->nbTrans;i++) { 3849 t = &exec->state->trans[i]; 3850 if ((t->counter < 0) || (t == trans)) 3851 continue; 3852 counter = &exec->comp->counters[t->counter]; 3853 count = exec->counts[t->counter]; 3854 if ((count < counter->max) && 3855 (t->atom != NULL) && 3856 (xmlStrEqual(value, t->atom->valuep))) { 3857 ret = 0; 3858 break; 3859 } 3860 if ((count >= counter->min) && 3861 (count < counter->max) && 3862 (t->atom != NULL) && 3863 (xmlStrEqual(value, t->atom->valuep))) { 3864 ret = 1; 3865 break; 3866 } 3867 } 3868 } 3869 } else if (trans->count == REGEXP_ALL_COUNTER) { 3870 int i; 3871 int count; 3872 xmlRegTransPtr t; 3873 xmlRegCounterPtr counter; 3874 3875 ret = 1; 3876 3877 #ifdef DEBUG_PUSH 3878 printf("testing all %d\n", trans->count); 3879 #endif 3880 /* 3881 * Check all counted transitions from the current state 3882 */ 3883 for (i = 0;i < exec->state->nbTrans;i++) { 3884 t = &exec->state->trans[i]; 3885 if ((t->counter < 0) || (t == trans)) 3886 continue; 3887 counter = &exec->comp->counters[t->counter]; 3888 count = exec->counts[t->counter]; 3889 if ((count < counter->min) || (count > counter->max)) { 3890 ret = 0; 3891 break; 3892 } 3893 } 3894 } else if (trans->count >= 0) { 3895 int count; 3896 xmlRegCounterPtr counter; 3897 3898 /* 3899 * A counted transition. 3900 */ 3901 3902 count = exec->counts[trans->count]; 3903 counter = &exec->comp->counters[trans->count]; 3904 #ifdef DEBUG_PUSH 3905 printf("testing count %d: val %d, min %d, max %d\n", 3906 trans->count, count, counter->min, counter->max); 3907 #endif 3908 ret = ((count >= counter->min) && (count <= counter->max)); 3909 } else if (atom == NULL) { 3910 fprintf(stderr, "epsilon transition left at runtime\n"); 3911 exec->status = -2; 3912 break; 3913 } else if (value != NULL) { 3914 ret = xmlRegStrEqualWildcard(atom->valuep, value); 3915 if (atom->neg) { 3916 ret = !ret; 3917 if (!compound) 3918 ret = 0; 3919 } 3920 if ((ret == 1) && (trans->counter >= 0)) { 3921 xmlRegCounterPtr counter; 3922 int count; 3923 3924 count = exec->counts[trans->counter]; 3925 counter = &exec->comp->counters[trans->counter]; 3926 if (count >= counter->max) 3927 ret = 0; 3928 } 3929 3930 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 3931 xmlRegStatePtr to = exec->comp->states[trans->to]; 3932 3933 /* 3934 * this is a multiple input sequence 3935 */ 3936 if (exec->state->nbTrans > exec->transno + 1) { 3937 if (exec->inputStackNr <= 0) { 3938 xmlFARegExecSaveInputString(exec, value, data); 3939 } 3940 xmlFARegExecSave(exec); 3941 } 3942 exec->transcount = 1; 3943 do { 3944 /* 3945 * Try to progress as much as possible on the input 3946 */ 3947 if (exec->transcount == atom->max) { 3948 break; 3949 } 3950 exec->index++; 3951 value = exec->inputStack[exec->index].value; 3952 data = exec->inputStack[exec->index].data; 3953 #ifdef DEBUG_PUSH 3954 printf("value loaded: %s\n", value); 3955 #endif 3956 3957 /* 3958 * End of input: stop here 3959 */ 3960 if (value == NULL) { 3961 exec->index --; 3962 break; 3963 } 3964 if (exec->transcount >= atom->min) { 3965 int transno = exec->transno; 3966 xmlRegStatePtr state = exec->state; 3967 3968 /* 3969 * The transition is acceptable save it 3970 */ 3971 exec->transno = -1; /* trick */ 3972 exec->state = to; 3973 if (exec->inputStackNr <= 0) { 3974 xmlFARegExecSaveInputString(exec, value, data); 3975 } 3976 xmlFARegExecSave(exec); 3977 exec->transno = transno; 3978 exec->state = state; 3979 } 3980 ret = xmlStrEqual(value, atom->valuep); 3981 exec->transcount++; 3982 } while (ret == 1); 3983 if (exec->transcount < atom->min) 3984 ret = 0; 3985 3986 /* 3987 * If the last check failed but one transition was found 3988 * possible, rollback 3989 */ 3990 if (ret < 0) 3991 ret = 0; 3992 if (ret == 0) { 3993 goto rollback; 3994 } 3995 } 3996 } 3997 if (ret == 1) { 3998 if ((exec->callback != NULL) && (atom != NULL) && 3999 (data != NULL)) { 4000 exec->callback(exec->data, atom->valuep, 4001 atom->data, data); 4002 } 4003 if (exec->state->nbTrans > exec->transno + 1) { 4004 if (exec->inputStackNr <= 0) { 4005 xmlFARegExecSaveInputString(exec, value, data); 4006 } 4007 xmlFARegExecSave(exec); 4008 } 4009 if (trans->counter >= 0) { 4010 #ifdef DEBUG_PUSH 4011 printf("Increasing count %d\n", trans->counter); 4012 #endif 4013 exec->counts[trans->counter]++; 4014 } 4015 if ((trans->count >= 0) && 4016 (trans->count < REGEXP_ALL_COUNTER)) { 4017 #ifdef DEBUG_REGEXP_EXEC 4018 printf("resetting count %d on transition\n", 4019 trans->count); 4020 #endif 4021 exec->counts[trans->count] = 0; 4022 } 4023 #ifdef DEBUG_PUSH 4024 printf("entering state %d\n", trans->to); 4025 #endif 4026 if ((exec->comp->states[trans->to] != NULL) && 4027 (exec->comp->states[trans->to]->type == 4028 XML_REGEXP_SINK_STATE)) { 4029 /* 4030 * entering a sink state, save the current state as error 4031 * state. 4032 */ 4033 if (exec->errString != NULL) 4034 xmlFree(exec->errString); 4035 exec->errString = xmlStrdup(value); 4036 exec->errState = exec->state; 4037 memcpy(exec->errCounts, exec->counts, 4038 exec->comp->nbCounters * sizeof(int)); 4039 } 4040 exec->state = exec->comp->states[trans->to]; 4041 exec->transno = 0; 4042 if (trans->atom != NULL) { 4043 if (exec->inputStack != NULL) { 4044 exec->index++; 4045 if (exec->index < exec->inputStackNr) { 4046 value = exec->inputStack[exec->index].value; 4047 data = exec->inputStack[exec->index].data; 4048 #ifdef DEBUG_PUSH 4049 printf("value loaded: %s\n", value); 4050 #endif 4051 } else { 4052 value = NULL; 4053 data = NULL; 4054 #ifdef DEBUG_PUSH 4055 printf("end of input\n"); 4056 #endif 4057 } 4058 } else { 4059 value = NULL; 4060 data = NULL; 4061 #ifdef DEBUG_PUSH 4062 printf("end of input\n"); 4063 #endif 4064 } 4065 } 4066 goto progress; 4067 } else if (ret < 0) { 4068 exec->status = -4; 4069 break; 4070 } 4071 } 4072 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4073 rollback: 4074 /* 4075 * if we didn't yet rollback on the current input 4076 * store the current state as the error state. 4077 */ 4078 if ((progress) && (exec->state != NULL) && 4079 (exec->state->type != XML_REGEXP_SINK_STATE)) { 4080 progress = 0; 4081 if (exec->errString != NULL) 4082 xmlFree(exec->errString); 4083 exec->errString = xmlStrdup(value); 4084 exec->errState = exec->state; 4085 memcpy(exec->errCounts, exec->counts, 4086 exec->comp->nbCounters * sizeof(int)); 4087 } 4088 4089 /* 4090 * Failed to find a way out 4091 */ 4092 exec->determinist = 0; 4093 xmlFARegExecRollBack(exec); 4094 if (exec->status == 0) { 4095 value = exec->inputStack[exec->index].value; 4096 data = exec->inputStack[exec->index].data; 4097 #ifdef DEBUG_PUSH 4098 printf("value loaded: %s\n", value); 4099 #endif 4100 } 4101 } 4102 continue; 4103 progress: 4104 progress = 1; 4105 continue; 4106 } 4107 if (exec->status == 0) { 4108 return(exec->state->type == XML_REGEXP_FINAL_STATE); 4109 } 4110 #ifdef DEBUG_ERR 4111 if (exec->status < 0) { 4112 testerr(exec); 4113 } 4114 #endif 4115 return(exec->status); 4116 } 4117 4118 /** 4119 * xmlRegExecPushString: 4120 * @exec: a regexp execution context or NULL to indicate the end 4121 * @value: a string token input 4122 * @data: data associated to the token to reuse in callbacks 4123 * 4124 * Push one input token in the execution context 4125 * 4126 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 4127 * a negative value in case of error. 4128 */ 4129 int 4130 xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, 4131 void *data) { 4132 return(xmlRegExecPushStringInternal(exec, value, data, 0)); 4133 } 4134 4135 /** 4136 * xmlRegExecPushString2: 4137 * @exec: a regexp execution context or NULL to indicate the end 4138 * @value: the first string token input 4139 * @value2: the second string token input 4140 * @data: data associated to the token to reuse in callbacks 4141 * 4142 * Push one input token in the execution context 4143 * 4144 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 4145 * a negative value in case of error. 4146 */ 4147 int 4148 xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value, 4149 const xmlChar *value2, void *data) { 4150 xmlChar buf[150]; 4151 int lenn, lenp, ret; 4152 xmlChar *str; 4153 4154 if (exec == NULL) 4155 return(-1); 4156 if (exec->comp == NULL) 4157 return(-1); 4158 if (exec->status != 0) 4159 return(exec->status); 4160 4161 if (value2 == NULL) 4162 return(xmlRegExecPushString(exec, value, data)); 4163 4164 lenn = strlen((char *) value2); 4165 lenp = strlen((char *) value); 4166 4167 if (150 < lenn + lenp + 2) { 4168 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 4169 if (str == NULL) { 4170 exec->status = -1; 4171 return(-1); 4172 } 4173 } else { 4174 str = buf; 4175 } 4176 memcpy(&str[0], value, lenp); 4177 str[lenp] = XML_REG_STRING_SEPARATOR; 4178 memcpy(&str[lenp + 1], value2, lenn); 4179 str[lenn + lenp + 1] = 0; 4180 4181 if (exec->comp->compact != NULL) 4182 ret = xmlRegCompactPushString(exec, exec->comp, str, data); 4183 else 4184 ret = xmlRegExecPushStringInternal(exec, str, data, 1); 4185 4186 if (str != buf) 4187 xmlFree(str); 4188 return(ret); 4189 } 4190 4191 /** 4192 * xmlRegExecGetValues: 4193 * @exec: a regexp execution context 4194 * @err: error extraction or normal one 4195 * @nbval: pointer to the number of accepted values IN/OUT 4196 * @nbneg: return number of negative transitions 4197 * @values: pointer to the array of acceptable values 4198 * @terminal: return value if this was a terminal state 4199 * 4200 * Extract informations from the regexp execution, internal routine to 4201 * implement xmlRegExecNextValues() and xmlRegExecErrInfo() 4202 * 4203 * Returns: 0 in case of success or -1 in case of error. 4204 */ 4205 static int 4206 xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err, 4207 int *nbval, int *nbneg, 4208 xmlChar **values, int *terminal) { 4209 int maxval; 4210 int nb = 0; 4211 4212 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) || 4213 (values == NULL) || (*nbval <= 0)) 4214 return(-1); 4215 4216 maxval = *nbval; 4217 *nbval = 0; 4218 *nbneg = 0; 4219 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) { 4220 xmlRegexpPtr comp; 4221 int target, i, state; 4222 4223 comp = exec->comp; 4224 4225 if (err) { 4226 if (exec->errStateNo == -1) return(-1); 4227 state = exec->errStateNo; 4228 } else { 4229 state = exec->index; 4230 } 4231 if (terminal != NULL) { 4232 if (comp->compact[state * (comp->nbstrings + 1)] == 4233 XML_REGEXP_FINAL_STATE) 4234 *terminal = 1; 4235 else 4236 *terminal = 0; 4237 } 4238 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4239 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4240 if ((target > 0) && (target <= comp->nbstates) && 4241 (comp->compact[(target - 1) * (comp->nbstrings + 1)] != 4242 XML_REGEXP_SINK_STATE)) { 4243 values[nb++] = comp->stringMap[i]; 4244 (*nbval)++; 4245 } 4246 } 4247 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4248 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4249 if ((target > 0) && (target <= comp->nbstates) && 4250 (comp->compact[(target - 1) * (comp->nbstrings + 1)] == 4251 XML_REGEXP_SINK_STATE)) { 4252 values[nb++] = comp->stringMap[i]; 4253 (*nbneg)++; 4254 } 4255 } 4256 } else { 4257 int transno; 4258 xmlRegTransPtr trans; 4259 xmlRegAtomPtr atom; 4260 xmlRegStatePtr state; 4261 4262 if (terminal != NULL) { 4263 if (exec->state->type == XML_REGEXP_FINAL_STATE) 4264 *terminal = 1; 4265 else 4266 *terminal = 0; 4267 } 4268 4269 if (err) { 4270 if (exec->errState == NULL) return(-1); 4271 state = exec->errState; 4272 } else { 4273 if (exec->state == NULL) return(-1); 4274 state = exec->state; 4275 } 4276 for (transno = 0; 4277 (transno < state->nbTrans) && (nb < maxval); 4278 transno++) { 4279 trans = &state->trans[transno]; 4280 if (trans->to < 0) 4281 continue; 4282 atom = trans->atom; 4283 if ((atom == NULL) || (atom->valuep == NULL)) 4284 continue; 4285 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4286 /* this should not be reached but ... */ 4287 TODO; 4288 } else if (trans->count == REGEXP_ALL_COUNTER) { 4289 /* this should not be reached but ... */ 4290 TODO; 4291 } else if (trans->counter >= 0) { 4292 xmlRegCounterPtr counter = NULL; 4293 int count; 4294 4295 if (err) 4296 count = exec->errCounts[trans->counter]; 4297 else 4298 count = exec->counts[trans->counter]; 4299 if (exec->comp != NULL) 4300 counter = &exec->comp->counters[trans->counter]; 4301 if ((counter == NULL) || (count < counter->max)) { 4302 if (atom->neg) 4303 values[nb++] = (xmlChar *) atom->valuep2; 4304 else 4305 values[nb++] = (xmlChar *) atom->valuep; 4306 (*nbval)++; 4307 } 4308 } else { 4309 if ((exec->comp->states[trans->to] != NULL) && 4310 (exec->comp->states[trans->to]->type != 4311 XML_REGEXP_SINK_STATE)) { 4312 if (atom->neg) 4313 values[nb++] = (xmlChar *) atom->valuep2; 4314 else 4315 values[nb++] = (xmlChar *) atom->valuep; 4316 (*nbval)++; 4317 } 4318 } 4319 } 4320 for (transno = 0; 4321 (transno < state->nbTrans) && (nb < maxval); 4322 transno++) { 4323 trans = &state->trans[transno]; 4324 if (trans->to < 0) 4325 continue; 4326 atom = trans->atom; 4327 if ((atom == NULL) || (atom->valuep == NULL)) 4328 continue; 4329 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4330 continue; 4331 } else if (trans->count == REGEXP_ALL_COUNTER) { 4332 continue; 4333 } else if (trans->counter >= 0) { 4334 continue; 4335 } else { 4336 if ((exec->comp->states[trans->to] != NULL) && 4337 (exec->comp->states[trans->to]->type == 4338 XML_REGEXP_SINK_STATE)) { 4339 if (atom->neg) 4340 values[nb++] = (xmlChar *) atom->valuep2; 4341 else 4342 values[nb++] = (xmlChar *) atom->valuep; 4343 (*nbneg)++; 4344 } 4345 } 4346 } 4347 } 4348 return(0); 4349 } 4350 4351 /** 4352 * xmlRegExecNextValues: 4353 * @exec: a regexp execution context 4354 * @nbval: pointer to the number of accepted values IN/OUT 4355 * @nbneg: return number of negative transitions 4356 * @values: pointer to the array of acceptable values 4357 * @terminal: return value if this was a terminal state 4358 * 4359 * Extract informations from the regexp execution, 4360 * the parameter @values must point to an array of @nbval string pointers 4361 * on return nbval will contain the number of possible strings in that 4362 * state and the @values array will be updated with them. The string values 4363 * returned will be freed with the @exec context and don't need to be 4364 * deallocated. 4365 * 4366 * Returns: 0 in case of success or -1 in case of error. 4367 */ 4368 int 4369 xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg, 4370 xmlChar **values, int *terminal) { 4371 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal)); 4372 } 4373 4374 /** 4375 * xmlRegExecErrInfo: 4376 * @exec: a regexp execution context generating an error 4377 * @string: return value for the error string 4378 * @nbval: pointer to the number of accepted values IN/OUT 4379 * @nbneg: return number of negative transitions 4380 * @values: pointer to the array of acceptable values 4381 * @terminal: return value if this was a terminal state 4382 * 4383 * Extract error informations from the regexp execution, the parameter 4384 * @string will be updated with the value pushed and not accepted, 4385 * the parameter @values must point to an array of @nbval string pointers 4386 * on return nbval will contain the number of possible strings in that 4387 * state and the @values array will be updated with them. The string values 4388 * returned will be freed with the @exec context and don't need to be 4389 * deallocated. 4390 * 4391 * Returns: 0 in case of success or -1 in case of error. 4392 */ 4393 int 4394 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string, 4395 int *nbval, int *nbneg, xmlChar **values, int *terminal) { 4396 if (exec == NULL) 4397 return(-1); 4398 if (string != NULL) { 4399 if (exec->status != 0) 4400 *string = exec->errString; 4401 else 4402 *string = NULL; 4403 } 4404 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal)); 4405 } 4406 4407 #ifdef DEBUG_ERR 4408 static void testerr(xmlRegExecCtxtPtr exec) { 4409 const xmlChar *string; 4410 xmlChar *values[5]; 4411 int nb = 5; 4412 int nbneg; 4413 int terminal; 4414 xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal); 4415 } 4416 #endif 4417 4418 #if 0 4419 static int 4420 xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) { 4421 xmlRegTransPtr trans; 4422 xmlRegAtomPtr atom; 4423 int ret; 4424 int codepoint, len; 4425 4426 if (exec == NULL) 4427 return(-1); 4428 if (exec->status != 0) 4429 return(exec->status); 4430 4431 while ((exec->status == 0) && 4432 ((exec->inputString[exec->index] != 0) || 4433 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 4434 4435 /* 4436 * End of input on non-terminal state, rollback, however we may 4437 * still have epsilon like transition for counted transitions 4438 * on counters, in that case don't break too early. 4439 */ 4440 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 4441 goto rollback; 4442 4443 exec->transcount = 0; 4444 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 4445 trans = &exec->state->trans[exec->transno]; 4446 if (trans->to < 0) 4447 continue; 4448 atom = trans->atom; 4449 ret = 0; 4450 if (trans->count >= 0) { 4451 int count; 4452 xmlRegCounterPtr counter; 4453 4454 /* 4455 * A counted transition. 4456 */ 4457 4458 count = exec->counts[trans->count]; 4459 counter = &exec->comp->counters[trans->count]; 4460 #ifdef DEBUG_REGEXP_EXEC 4461 printf("testing count %d: val %d, min %d, max %d\n", 4462 trans->count, count, counter->min, counter->max); 4463 #endif 4464 ret = ((count >= counter->min) && (count <= counter->max)); 4465 } else if (atom == NULL) { 4466 fprintf(stderr, "epsilon transition left at runtime\n"); 4467 exec->status = -2; 4468 break; 4469 } else if (exec->inputString[exec->index] != 0) { 4470 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 4471 ret = xmlRegCheckCharacter(atom, codepoint); 4472 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 4473 xmlRegStatePtr to = exec->comp->states[trans->to]; 4474 4475 /* 4476 * this is a multiple input sequence 4477 */ 4478 if (exec->state->nbTrans > exec->transno + 1) { 4479 xmlFARegExecSave(exec); 4480 } 4481 exec->transcount = 1; 4482 do { 4483 /* 4484 * Try to progress as much as possible on the input 4485 */ 4486 if (exec->transcount == atom->max) { 4487 break; 4488 } 4489 exec->index += len; 4490 /* 4491 * End of input: stop here 4492 */ 4493 if (exec->inputString[exec->index] == 0) { 4494 exec->index -= len; 4495 break; 4496 } 4497 if (exec->transcount >= atom->min) { 4498 int transno = exec->transno; 4499 xmlRegStatePtr state = exec->state; 4500 4501 /* 4502 * The transition is acceptable save it 4503 */ 4504 exec->transno = -1; /* trick */ 4505 exec->state = to; 4506 xmlFARegExecSave(exec); 4507 exec->transno = transno; 4508 exec->state = state; 4509 } 4510 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 4511 len); 4512 ret = xmlRegCheckCharacter(atom, codepoint); 4513 exec->transcount++; 4514 } while (ret == 1); 4515 if (exec->transcount < atom->min) 4516 ret = 0; 4517 4518 /* 4519 * If the last check failed but one transition was found 4520 * possible, rollback 4521 */ 4522 if (ret < 0) 4523 ret = 0; 4524 if (ret == 0) { 4525 goto rollback; 4526 } 4527 } 4528 } 4529 if (ret == 1) { 4530 if (exec->state->nbTrans > exec->transno + 1) { 4531 xmlFARegExecSave(exec); 4532 } 4533 /* 4534 * restart count for expressions like this ((abc){2})* 4535 */ 4536 if (trans->count >= 0) { 4537 #ifdef DEBUG_REGEXP_EXEC 4538 printf("Reset count %d\n", trans->count); 4539 #endif 4540 exec->counts[trans->count] = 0; 4541 } 4542 if (trans->counter >= 0) { 4543 #ifdef DEBUG_REGEXP_EXEC 4544 printf("Increasing count %d\n", trans->counter); 4545 #endif 4546 exec->counts[trans->counter]++; 4547 } 4548 #ifdef DEBUG_REGEXP_EXEC 4549 printf("entering state %d\n", trans->to); 4550 #endif 4551 exec->state = exec->comp->states[trans->to]; 4552 exec->transno = 0; 4553 if (trans->atom != NULL) { 4554 exec->index += len; 4555 } 4556 goto progress; 4557 } else if (ret < 0) { 4558 exec->status = -4; 4559 break; 4560 } 4561 } 4562 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4563 rollback: 4564 /* 4565 * Failed to find a way out 4566 */ 4567 exec->determinist = 0; 4568 xmlFARegExecRollBack(exec); 4569 } 4570 progress: 4571 continue; 4572 } 4573 } 4574 #endif 4575 /************************************************************************ 4576 * * 4577 * Parser for the Schemas Datatype Regular Expressions * 4578 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs * 4579 * * 4580 ************************************************************************/ 4581 4582 /** 4583 * xmlFAIsChar: 4584 * @ctxt: a regexp parser context 4585 * 4586 * [10] Char ::= [^.\?*+()|#x5B#x5D] 4587 */ 4588 static int 4589 xmlFAIsChar(xmlRegParserCtxtPtr ctxt) { 4590 int cur; 4591 int len; 4592 4593 cur = CUR_SCHAR(ctxt->cur, len); 4594 if ((cur == '.') || (cur == '\\') || (cur == '?') || 4595 (cur == '*') || (cur == '+') || (cur == '(') || 4596 (cur == ')') || (cur == '|') || (cur == 0x5B) || 4597 (cur == 0x5D) || (cur == 0)) 4598 return(-1); 4599 return(cur); 4600 } 4601 4602 /** 4603 * xmlFAParseCharProp: 4604 * @ctxt: a regexp parser context 4605 * 4606 * [27] charProp ::= IsCategory | IsBlock 4607 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation | 4608 * Separators | Symbols | Others 4609 * [29] Letters ::= 'L' [ultmo]? 4610 * [30] Marks ::= 'M' [nce]? 4611 * [31] Numbers ::= 'N' [dlo]? 4612 * [32] Punctuation ::= 'P' [cdseifo]? 4613 * [33] Separators ::= 'Z' [slp]? 4614 * [34] Symbols ::= 'S' [mcko]? 4615 * [35] Others ::= 'C' [cfon]? 4616 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+ 4617 */ 4618 static void 4619 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { 4620 int cur; 4621 xmlRegAtomType type = (xmlRegAtomType) 0; 4622 xmlChar *blockName = NULL; 4623 4624 cur = CUR; 4625 if (cur == 'L') { 4626 NEXT; 4627 cur = CUR; 4628 if (cur == 'u') { 4629 NEXT; 4630 type = XML_REGEXP_LETTER_UPPERCASE; 4631 } else if (cur == 'l') { 4632 NEXT; 4633 type = XML_REGEXP_LETTER_LOWERCASE; 4634 } else if (cur == 't') { 4635 NEXT; 4636 type = XML_REGEXP_LETTER_TITLECASE; 4637 } else if (cur == 'm') { 4638 NEXT; 4639 type = XML_REGEXP_LETTER_MODIFIER; 4640 } else if (cur == 'o') { 4641 NEXT; 4642 type = XML_REGEXP_LETTER_OTHERS; 4643 } else { 4644 type = XML_REGEXP_LETTER; 4645 } 4646 } else if (cur == 'M') { 4647 NEXT; 4648 cur = CUR; 4649 if (cur == 'n') { 4650 NEXT; 4651 /* nonspacing */ 4652 type = XML_REGEXP_MARK_NONSPACING; 4653 } else if (cur == 'c') { 4654 NEXT; 4655 /* spacing combining */ 4656 type = XML_REGEXP_MARK_SPACECOMBINING; 4657 } else if (cur == 'e') { 4658 NEXT; 4659 /* enclosing */ 4660 type = XML_REGEXP_MARK_ENCLOSING; 4661 } else { 4662 /* all marks */ 4663 type = XML_REGEXP_MARK; 4664 } 4665 } else if (cur == 'N') { 4666 NEXT; 4667 cur = CUR; 4668 if (cur == 'd') { 4669 NEXT; 4670 /* digital */ 4671 type = XML_REGEXP_NUMBER_DECIMAL; 4672 } else if (cur == 'l') { 4673 NEXT; 4674 /* letter */ 4675 type = XML_REGEXP_NUMBER_LETTER; 4676 } else if (cur == 'o') { 4677 NEXT; 4678 /* other */ 4679 type = XML_REGEXP_NUMBER_OTHERS; 4680 } else { 4681 /* all numbers */ 4682 type = XML_REGEXP_NUMBER; 4683 } 4684 } else if (cur == 'P') { 4685 NEXT; 4686 cur = CUR; 4687 if (cur == 'c') { 4688 NEXT; 4689 /* connector */ 4690 type = XML_REGEXP_PUNCT_CONNECTOR; 4691 } else if (cur == 'd') { 4692 NEXT; 4693 /* dash */ 4694 type = XML_REGEXP_PUNCT_DASH; 4695 } else if (cur == 's') { 4696 NEXT; 4697 /* open */ 4698 type = XML_REGEXP_PUNCT_OPEN; 4699 } else if (cur == 'e') { 4700 NEXT; 4701 /* close */ 4702 type = XML_REGEXP_PUNCT_CLOSE; 4703 } else if (cur == 'i') { 4704 NEXT; 4705 /* initial quote */ 4706 type = XML_REGEXP_PUNCT_INITQUOTE; 4707 } else if (cur == 'f') { 4708 NEXT; 4709 /* final quote */ 4710 type = XML_REGEXP_PUNCT_FINQUOTE; 4711 } else if (cur == 'o') { 4712 NEXT; 4713 /* other */ 4714 type = XML_REGEXP_PUNCT_OTHERS; 4715 } else { 4716 /* all punctuation */ 4717 type = XML_REGEXP_PUNCT; 4718 } 4719 } else if (cur == 'Z') { 4720 NEXT; 4721 cur = CUR; 4722 if (cur == 's') { 4723 NEXT; 4724 /* space */ 4725 type = XML_REGEXP_SEPAR_SPACE; 4726 } else if (cur == 'l') { 4727 NEXT; 4728 /* line */ 4729 type = XML_REGEXP_SEPAR_LINE; 4730 } else if (cur == 'p') { 4731 NEXT; 4732 /* paragraph */ 4733 type = XML_REGEXP_SEPAR_PARA; 4734 } else { 4735 /* all separators */ 4736 type = XML_REGEXP_SEPAR; 4737 } 4738 } else if (cur == 'S') { 4739 NEXT; 4740 cur = CUR; 4741 if (cur == 'm') { 4742 NEXT; 4743 type = XML_REGEXP_SYMBOL_MATH; 4744 /* math */ 4745 } else if (cur == 'c') { 4746 NEXT; 4747 type = XML_REGEXP_SYMBOL_CURRENCY; 4748 /* currency */ 4749 } else if (cur == 'k') { 4750 NEXT; 4751 type = XML_REGEXP_SYMBOL_MODIFIER; 4752 /* modifiers */ 4753 } else if (cur == 'o') { 4754 NEXT; 4755 type = XML_REGEXP_SYMBOL_OTHERS; 4756 /* other */ 4757 } else { 4758 /* all symbols */ 4759 type = XML_REGEXP_SYMBOL; 4760 } 4761 } else if (cur == 'C') { 4762 NEXT; 4763 cur = CUR; 4764 if (cur == 'c') { 4765 NEXT; 4766 /* control */ 4767 type = XML_REGEXP_OTHER_CONTROL; 4768 } else if (cur == 'f') { 4769 NEXT; 4770 /* format */ 4771 type = XML_REGEXP_OTHER_FORMAT; 4772 } else if (cur == 'o') { 4773 NEXT; 4774 /* private use */ 4775 type = XML_REGEXP_OTHER_PRIVATE; 4776 } else if (cur == 'n') { 4777 NEXT; 4778 /* not assigned */ 4779 type = XML_REGEXP_OTHER_NA; 4780 } else { 4781 /* all others */ 4782 type = XML_REGEXP_OTHER; 4783 } 4784 } else if (cur == 'I') { 4785 const xmlChar *start; 4786 NEXT; 4787 cur = CUR; 4788 if (cur != 's') { 4789 ERROR("IsXXXX expected"); 4790 return; 4791 } 4792 NEXT; 4793 start = ctxt->cur; 4794 cur = CUR; 4795 if (((cur >= 'a') && (cur <= 'z')) || 4796 ((cur >= 'A') && (cur <= 'Z')) || 4797 ((cur >= '0') && (cur <= '9')) || 4798 (cur == 0x2D)) { 4799 NEXT; 4800 cur = CUR; 4801 while (((cur >= 'a') && (cur <= 'z')) || 4802 ((cur >= 'A') && (cur <= 'Z')) || 4803 ((cur >= '0') && (cur <= '9')) || 4804 (cur == 0x2D)) { 4805 NEXT; 4806 cur = CUR; 4807 } 4808 } 4809 type = XML_REGEXP_BLOCK_NAME; 4810 blockName = xmlStrndup(start, ctxt->cur - start); 4811 } else { 4812 ERROR("Unknown char property"); 4813 return; 4814 } 4815 if (ctxt->atom == NULL) { 4816 ctxt->atom = xmlRegNewAtom(ctxt, type); 4817 if (ctxt->atom != NULL) 4818 ctxt->atom->valuep = blockName; 4819 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4820 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4821 type, 0, 0, blockName); 4822 } 4823 } 4824 4825 /** 4826 * xmlFAParseCharClassEsc: 4827 * @ctxt: a regexp parser context 4828 * 4829 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc ) 4830 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] 4831 * [25] catEsc ::= '\p{' charProp '}' 4832 * [26] complEsc ::= '\P{' charProp '}' 4833 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW]) 4834 */ 4835 static void 4836 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { 4837 int cur; 4838 4839 if (CUR == '.') { 4840 if (ctxt->atom == NULL) { 4841 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR); 4842 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4843 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4844 XML_REGEXP_ANYCHAR, 0, 0, NULL); 4845 } 4846 NEXT; 4847 return; 4848 } 4849 if (CUR != '\\') { 4850 ERROR("Escaped sequence: expecting \\"); 4851 return; 4852 } 4853 NEXT; 4854 cur = CUR; 4855 if (cur == 'p') { 4856 NEXT; 4857 if (CUR != '{') { 4858 ERROR("Expecting '{'"); 4859 return; 4860 } 4861 NEXT; 4862 xmlFAParseCharProp(ctxt); 4863 if (CUR != '}') { 4864 ERROR("Expecting '}'"); 4865 return; 4866 } 4867 NEXT; 4868 } else if (cur == 'P') { 4869 NEXT; 4870 if (CUR != '{') { 4871 ERROR("Expecting '{'"); 4872 return; 4873 } 4874 NEXT; 4875 xmlFAParseCharProp(ctxt); 4876 ctxt->atom->neg = 1; 4877 if (CUR != '}') { 4878 ERROR("Expecting '}'"); 4879 return; 4880 } 4881 NEXT; 4882 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') || 4883 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') || 4884 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') || 4885 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) || 4886 (cur == 0x5E)) { 4887 if (ctxt->atom == NULL) { 4888 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 4889 if (ctxt->atom != NULL) { 4890 switch (cur) { 4891 case 'n': 4892 ctxt->atom->codepoint = '\n'; 4893 break; 4894 case 'r': 4895 ctxt->atom->codepoint = '\r'; 4896 break; 4897 case 't': 4898 ctxt->atom->codepoint = '\t'; 4899 break; 4900 default: 4901 ctxt->atom->codepoint = cur; 4902 } 4903 } 4904 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4905 switch (cur) { 4906 case 'n': 4907 cur = '\n'; 4908 break; 4909 case 'r': 4910 cur = '\r'; 4911 break; 4912 case 't': 4913 cur = '\t'; 4914 break; 4915 } 4916 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4917 XML_REGEXP_CHARVAL, cur, cur, NULL); 4918 } 4919 NEXT; 4920 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') || 4921 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') || 4922 (cur == 'w') || (cur == 'W')) { 4923 xmlRegAtomType type = XML_REGEXP_ANYSPACE; 4924 4925 switch (cur) { 4926 case 's': 4927 type = XML_REGEXP_ANYSPACE; 4928 break; 4929 case 'S': 4930 type = XML_REGEXP_NOTSPACE; 4931 break; 4932 case 'i': 4933 type = XML_REGEXP_INITNAME; 4934 break; 4935 case 'I': 4936 type = XML_REGEXP_NOTINITNAME; 4937 break; 4938 case 'c': 4939 type = XML_REGEXP_NAMECHAR; 4940 break; 4941 case 'C': 4942 type = XML_REGEXP_NOTNAMECHAR; 4943 break; 4944 case 'd': 4945 type = XML_REGEXP_DECIMAL; 4946 break; 4947 case 'D': 4948 type = XML_REGEXP_NOTDECIMAL; 4949 break; 4950 case 'w': 4951 type = XML_REGEXP_REALCHAR; 4952 break; 4953 case 'W': 4954 type = XML_REGEXP_NOTREALCHAR; 4955 break; 4956 } 4957 NEXT; 4958 if (ctxt->atom == NULL) { 4959 ctxt->atom = xmlRegNewAtom(ctxt, type); 4960 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4961 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4962 type, 0, 0, NULL); 4963 } 4964 } else { 4965 ERROR("Wrong escape sequence, misuse of character '\\'"); 4966 } 4967 } 4968 4969 /** 4970 * xmlFAParseCharRange: 4971 * @ctxt: a regexp parser context 4972 * 4973 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash 4974 * [18] seRange ::= charOrEsc '-' charOrEsc 4975 * [20] charOrEsc ::= XmlChar | SingleCharEsc 4976 * [21] XmlChar ::= [^\#x2D#x5B#x5D] 4977 * [22] XmlCharIncDash ::= [^\#x5B#x5D] 4978 */ 4979 static void 4980 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { 4981 int cur, len; 4982 int start = -1; 4983 int end = -1; 4984 4985 if (CUR == '\0') { 4986 ERROR("Expecting ']'"); 4987 return; 4988 } 4989 4990 cur = CUR; 4991 if (cur == '\\') { 4992 NEXT; 4993 cur = CUR; 4994 switch (cur) { 4995 case 'n': start = 0xA; break; 4996 case 'r': start = 0xD; break; 4997 case 't': start = 0x9; break; 4998 case '\\': case '|': case '.': case '-': case '^': case '?': 4999 case '*': case '+': case '{': case '}': case '(': case ')': 5000 case '[': case ']': 5001 start = cur; break; 5002 default: 5003 ERROR("Invalid escape value"); 5004 return; 5005 } 5006 end = start; 5007 len = 1; 5008 } else if ((cur != 0x5B) && (cur != 0x5D)) { 5009 end = start = CUR_SCHAR(ctxt->cur, len); 5010 } else { 5011 ERROR("Expecting a char range"); 5012 return; 5013 } 5014 /* 5015 * Since we are "inside" a range, we can assume ctxt->cur is past 5016 * the start of ctxt->string, and PREV should be safe 5017 */ 5018 if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) { 5019 NEXTL(len); 5020 return; 5021 } 5022 NEXTL(len); 5023 cur = CUR; 5024 if ((cur != '-') || (NXT(1) == ']')) { 5025 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 5026 XML_REGEXP_CHARVAL, start, end, NULL); 5027 return; 5028 } 5029 NEXT; 5030 cur = CUR; 5031 if (cur == '\\') { 5032 NEXT; 5033 cur = CUR; 5034 switch (cur) { 5035 case 'n': end = 0xA; break; 5036 case 'r': end = 0xD; break; 5037 case 't': end = 0x9; break; 5038 case '\\': case '|': case '.': case '-': case '^': case '?': 5039 case '*': case '+': case '{': case '}': case '(': case ')': 5040 case '[': case ']': 5041 end = cur; break; 5042 default: 5043 ERROR("Invalid escape value"); 5044 return; 5045 } 5046 len = 1; 5047 } else if ((cur != 0x5B) && (cur != 0x5D)) { 5048 end = CUR_SCHAR(ctxt->cur, len); 5049 } else { 5050 ERROR("Expecting the end of a char range"); 5051 return; 5052 } 5053 NEXTL(len); 5054 /* TODO check that the values are acceptable character ranges for XML */ 5055 if (end < start) { 5056 ERROR("End of range is before start of range"); 5057 } else { 5058 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 5059 XML_REGEXP_CHARVAL, start, end, NULL); 5060 } 5061 return; 5062 } 5063 5064 /** 5065 * xmlFAParsePosCharGroup: 5066 * @ctxt: a regexp parser context 5067 * 5068 * [14] posCharGroup ::= ( charRange | charClassEsc )+ 5069 */ 5070 static void 5071 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) { 5072 do { 5073 if (CUR == '\\') { 5074 xmlFAParseCharClassEsc(ctxt); 5075 } else { 5076 xmlFAParseCharRange(ctxt); 5077 } 5078 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') && 5079 (CUR != 0) && (ctxt->error == 0)); 5080 } 5081 5082 /** 5083 * xmlFAParseCharGroup: 5084 * @ctxt: a regexp parser context 5085 * 5086 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub 5087 * [15] negCharGroup ::= '^' posCharGroup 5088 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr 5089 * [12] charClassExpr ::= '[' charGroup ']' 5090 */ 5091 static void 5092 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) { 5093 int n = ctxt->neg; 5094 while ((CUR != ']') && (ctxt->error == 0)) { 5095 if (CUR == '^') { 5096 int neg = ctxt->neg; 5097 5098 NEXT; 5099 ctxt->neg = !ctxt->neg; 5100 xmlFAParsePosCharGroup(ctxt); 5101 ctxt->neg = neg; 5102 } else if ((CUR == '-') && (NXT(1) == '[')) { 5103 int neg = ctxt->neg; 5104 ctxt->neg = 2; 5105 NEXT; /* eat the '-' */ 5106 NEXT; /* eat the '[' */ 5107 xmlFAParseCharGroup(ctxt); 5108 if (CUR == ']') { 5109 NEXT; 5110 } else { 5111 ERROR("charClassExpr: ']' expected"); 5112 break; 5113 } 5114 ctxt->neg = neg; 5115 break; 5116 } else if (CUR != ']') { 5117 xmlFAParsePosCharGroup(ctxt); 5118 } 5119 } 5120 ctxt->neg = n; 5121 } 5122 5123 /** 5124 * xmlFAParseCharClass: 5125 * @ctxt: a regexp parser context 5126 * 5127 * [11] charClass ::= charClassEsc | charClassExpr 5128 * [12] charClassExpr ::= '[' charGroup ']' 5129 */ 5130 static void 5131 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) { 5132 if (CUR == '[') { 5133 NEXT; 5134 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES); 5135 if (ctxt->atom == NULL) 5136 return; 5137 xmlFAParseCharGroup(ctxt); 5138 if (CUR == ']') { 5139 NEXT; 5140 } else { 5141 ERROR("xmlFAParseCharClass: ']' expected"); 5142 } 5143 } else { 5144 xmlFAParseCharClassEsc(ctxt); 5145 } 5146 } 5147 5148 /** 5149 * xmlFAParseQuantExact: 5150 * @ctxt: a regexp parser context 5151 * 5152 * [8] QuantExact ::= [0-9]+ 5153 * 5154 * Returns 0 if success or -1 in case of error 5155 */ 5156 static int 5157 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) { 5158 int ret = 0; 5159 int ok = 0; 5160 5161 while ((CUR >= '0') && (CUR <= '9')) { 5162 ret = ret * 10 + (CUR - '0'); 5163 ok = 1; 5164 NEXT; 5165 } 5166 if (ok != 1) { 5167 return(-1); 5168 } 5169 return(ret); 5170 } 5171 5172 /** 5173 * xmlFAParseQuantifier: 5174 * @ctxt: a regexp parser context 5175 * 5176 * [4] quantifier ::= [?*+] | ( '{' quantity '}' ) 5177 * [5] quantity ::= quantRange | quantMin | QuantExact 5178 * [6] quantRange ::= QuantExact ',' QuantExact 5179 * [7] quantMin ::= QuantExact ',' 5180 * [8] QuantExact ::= [0-9]+ 5181 */ 5182 static int 5183 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) { 5184 int cur; 5185 5186 cur = CUR; 5187 if ((cur == '?') || (cur == '*') || (cur == '+')) { 5188 if (ctxt->atom != NULL) { 5189 if (cur == '?') 5190 ctxt->atom->quant = XML_REGEXP_QUANT_OPT; 5191 else if (cur == '*') 5192 ctxt->atom->quant = XML_REGEXP_QUANT_MULT; 5193 else if (cur == '+') 5194 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS; 5195 } 5196 NEXT; 5197 return(1); 5198 } 5199 if (cur == '{') { 5200 int min = 0, max = 0; 5201 5202 NEXT; 5203 cur = xmlFAParseQuantExact(ctxt); 5204 if (cur >= 0) 5205 min = cur; 5206 if (CUR == ',') { 5207 NEXT; 5208 if (CUR == '}') 5209 max = INT_MAX; 5210 else { 5211 cur = xmlFAParseQuantExact(ctxt); 5212 if (cur >= 0) 5213 max = cur; 5214 else { 5215 ERROR("Improper quantifier"); 5216 } 5217 } 5218 } 5219 if (CUR == '}') { 5220 NEXT; 5221 } else { 5222 ERROR("Unterminated quantifier"); 5223 } 5224 if (max == 0) 5225 max = min; 5226 if (ctxt->atom != NULL) { 5227 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE; 5228 ctxt->atom->min = min; 5229 ctxt->atom->max = max; 5230 } 5231 return(1); 5232 } 5233 return(0); 5234 } 5235 5236 /** 5237 * xmlFAParseAtom: 5238 * @ctxt: a regexp parser context 5239 * 5240 * [9] atom ::= Char | charClass | ( '(' regExp ')' ) 5241 */ 5242 static int 5243 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) { 5244 int codepoint, len; 5245 5246 codepoint = xmlFAIsChar(ctxt); 5247 if (codepoint > 0) { 5248 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 5249 if (ctxt->atom == NULL) 5250 return(-1); 5251 codepoint = CUR_SCHAR(ctxt->cur, len); 5252 ctxt->atom->codepoint = codepoint; 5253 NEXTL(len); 5254 return(1); 5255 } else if (CUR == '|') { 5256 return(0); 5257 } else if (CUR == 0) { 5258 return(0); 5259 } else if (CUR == ')') { 5260 return(0); 5261 } else if (CUR == '(') { 5262 xmlRegStatePtr start, oldend, start0; 5263 5264 NEXT; 5265 /* 5266 * this extra Epsilon transition is needed if we count with 0 allowed 5267 * unfortunately this can't be known at that point 5268 */ 5269 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5270 start0 = ctxt->state; 5271 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5272 start = ctxt->state; 5273 oldend = ctxt->end; 5274 ctxt->end = NULL; 5275 ctxt->atom = NULL; 5276 xmlFAParseRegExp(ctxt, 0); 5277 if (CUR == ')') { 5278 NEXT; 5279 } else { 5280 ERROR("xmlFAParseAtom: expecting ')'"); 5281 } 5282 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG); 5283 if (ctxt->atom == NULL) 5284 return(-1); 5285 ctxt->atom->start = start; 5286 ctxt->atom->start0 = start0; 5287 ctxt->atom->stop = ctxt->state; 5288 ctxt->end = oldend; 5289 return(1); 5290 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) { 5291 xmlFAParseCharClass(ctxt); 5292 return(1); 5293 } 5294 return(0); 5295 } 5296 5297 /** 5298 * xmlFAParsePiece: 5299 * @ctxt: a regexp parser context 5300 * 5301 * [3] piece ::= atom quantifier? 5302 */ 5303 static int 5304 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) { 5305 int ret; 5306 5307 ctxt->atom = NULL; 5308 ret = xmlFAParseAtom(ctxt); 5309 if (ret == 0) 5310 return(0); 5311 if (ctxt->atom == NULL) { 5312 ERROR("internal: no atom generated"); 5313 } 5314 xmlFAParseQuantifier(ctxt); 5315 return(1); 5316 } 5317 5318 /** 5319 * xmlFAParseBranch: 5320 * @ctxt: a regexp parser context 5321 * @to: optional target to the end of the branch 5322 * 5323 * @to is used to optimize by removing duplicate path in automata 5324 * in expressions like (a|b)(c|d) 5325 * 5326 * [2] branch ::= piece* 5327 */ 5328 static int 5329 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) { 5330 xmlRegStatePtr previous; 5331 int ret; 5332 5333 previous = ctxt->state; 5334 ret = xmlFAParsePiece(ctxt); 5335 if (ret != 0) { 5336 if (xmlFAGenerateTransitions(ctxt, previous, 5337 (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) 5338 return(-1); 5339 previous = ctxt->state; 5340 ctxt->atom = NULL; 5341 } 5342 while ((ret != 0) && (ctxt->error == 0)) { 5343 ret = xmlFAParsePiece(ctxt); 5344 if (ret != 0) { 5345 if (xmlFAGenerateTransitions(ctxt, previous, 5346 (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) 5347 return(-1); 5348 previous = ctxt->state; 5349 ctxt->atom = NULL; 5350 } 5351 } 5352 return(0); 5353 } 5354 5355 /** 5356 * xmlFAParseRegExp: 5357 * @ctxt: a regexp parser context 5358 * @top: is this the top-level expression ? 5359 * 5360 * [1] regExp ::= branch ( '|' branch )* 5361 */ 5362 static void 5363 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { 5364 xmlRegStatePtr start, end; 5365 5366 /* if not top start should have been generated by an epsilon trans */ 5367 start = ctxt->state; 5368 ctxt->end = NULL; 5369 xmlFAParseBranch(ctxt, NULL); 5370 if (top) { 5371 #ifdef DEBUG_REGEXP_GRAPH 5372 printf("State %d is final\n", ctxt->state->no); 5373 #endif 5374 ctxt->state->type = XML_REGEXP_FINAL_STATE; 5375 } 5376 if (CUR != '|') { 5377 ctxt->end = ctxt->state; 5378 return; 5379 } 5380 end = ctxt->state; 5381 while ((CUR == '|') && (ctxt->error == 0)) { 5382 NEXT; 5383 if (CUR == 0) { 5384 ERROR("expecting a branch after |") 5385 return; 5386 } 5387 ctxt->state = start; 5388 ctxt->end = NULL; 5389 xmlFAParseBranch(ctxt, end); 5390 } 5391 if (!top) { 5392 ctxt->state = end; 5393 ctxt->end = end; 5394 } 5395 } 5396 5397 /************************************************************************ 5398 * * 5399 * The basic API * 5400 * * 5401 ************************************************************************/ 5402 5403 /** 5404 * xmlRegexpPrint: 5405 * @output: the file for the output debug 5406 * @regexp: the compiled regexp 5407 * 5408 * Print the content of the compiled regular expression 5409 */ 5410 void 5411 xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) { 5412 int i; 5413 5414 if (output == NULL) 5415 return; 5416 fprintf(output, " regexp: "); 5417 if (regexp == NULL) { 5418 fprintf(output, "NULL\n"); 5419 return; 5420 } 5421 fprintf(output, "'%s' ", regexp->string); 5422 fprintf(output, "\n"); 5423 fprintf(output, "%d atoms:\n", regexp->nbAtoms); 5424 for (i = 0;i < regexp->nbAtoms; i++) { 5425 fprintf(output, " %02d ", i); 5426 xmlRegPrintAtom(output, regexp->atoms[i]); 5427 } 5428 fprintf(output, "%d states:", regexp->nbStates); 5429 fprintf(output, "\n"); 5430 for (i = 0;i < regexp->nbStates; i++) { 5431 xmlRegPrintState(output, regexp->states[i]); 5432 } 5433 fprintf(output, "%d counters:\n", regexp->nbCounters); 5434 for (i = 0;i < regexp->nbCounters; i++) { 5435 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min, 5436 regexp->counters[i].max); 5437 } 5438 } 5439 5440 /** 5441 * xmlRegexpCompile: 5442 * @regexp: a regular expression string 5443 * 5444 * Parses a regular expression conforming to XML Schemas Part 2 Datatype 5445 * Appendix F and builds an automata suitable for testing strings against 5446 * that regular expression 5447 * 5448 * Returns the compiled expression or NULL in case of error 5449 */ 5450 xmlRegexpPtr 5451 xmlRegexpCompile(const xmlChar *regexp) { 5452 xmlRegexpPtr ret; 5453 xmlRegParserCtxtPtr ctxt; 5454 5455 ctxt = xmlRegNewParserCtxt(regexp); 5456 if (ctxt == NULL) 5457 return(NULL); 5458 5459 /* initialize the parser */ 5460 ctxt->end = NULL; 5461 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5462 xmlRegStatePush(ctxt, ctxt->start); 5463 5464 /* parse the expression building an automata */ 5465 xmlFAParseRegExp(ctxt, 1); 5466 if (CUR != 0) { 5467 ERROR("xmlFAParseRegExp: extra characters"); 5468 } 5469 if (ctxt->error != 0) { 5470 xmlRegFreeParserCtxt(ctxt); 5471 return(NULL); 5472 } 5473 ctxt->end = ctxt->state; 5474 ctxt->start->type = XML_REGEXP_START_STATE; 5475 ctxt->end->type = XML_REGEXP_FINAL_STATE; 5476 5477 /* remove the Epsilon except for counted transitions */ 5478 xmlFAEliminateEpsilonTransitions(ctxt); 5479 5480 5481 if (ctxt->error != 0) { 5482 xmlRegFreeParserCtxt(ctxt); 5483 return(NULL); 5484 } 5485 ret = xmlRegEpxFromParse(ctxt); 5486 xmlRegFreeParserCtxt(ctxt); 5487 return(ret); 5488 } 5489 5490 /** 5491 * xmlRegexpExec: 5492 * @comp: the compiled regular expression 5493 * @content: the value to check against the regular expression 5494 * 5495 * Check if the regular expression generates the value 5496 * 5497 * Returns 1 if it matches, 0 if not and a negative value in case of error 5498 */ 5499 int 5500 xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) { 5501 if ((comp == NULL) || (content == NULL)) 5502 return(-1); 5503 return(xmlFARegExec(comp, content)); 5504 } 5505 5506 /** 5507 * xmlRegexpIsDeterminist: 5508 * @comp: the compiled regular expression 5509 * 5510 * Check if the regular expression is determinist 5511 * 5512 * Returns 1 if it yes, 0 if not and a negative value in case of error 5513 */ 5514 int 5515 xmlRegexpIsDeterminist(xmlRegexpPtr comp) { 5516 xmlAutomataPtr am; 5517 int ret; 5518 5519 if (comp == NULL) 5520 return(-1); 5521 if (comp->determinist != -1) 5522 return(comp->determinist); 5523 5524 am = xmlNewAutomata(); 5525 if (am->states != NULL) { 5526 int i; 5527 5528 for (i = 0;i < am->nbStates;i++) 5529 xmlRegFreeState(am->states[i]); 5530 xmlFree(am->states); 5531 } 5532 am->nbAtoms = comp->nbAtoms; 5533 am->atoms = comp->atoms; 5534 am->nbStates = comp->nbStates; 5535 am->states = comp->states; 5536 am->determinist = -1; 5537 am->flags = comp->flags; 5538 ret = xmlFAComputesDeterminism(am); 5539 am->atoms = NULL; 5540 am->states = NULL; 5541 xmlFreeAutomata(am); 5542 comp->determinist = ret; 5543 return(ret); 5544 } 5545 5546 /** 5547 * xmlRegFreeRegexp: 5548 * @regexp: the regexp 5549 * 5550 * Free a regexp 5551 */ 5552 void 5553 xmlRegFreeRegexp(xmlRegexpPtr regexp) { 5554 int i; 5555 if (regexp == NULL) 5556 return; 5557 5558 if (regexp->string != NULL) 5559 xmlFree(regexp->string); 5560 if (regexp->states != NULL) { 5561 for (i = 0;i < regexp->nbStates;i++) 5562 xmlRegFreeState(regexp->states[i]); 5563 xmlFree(regexp->states); 5564 } 5565 if (regexp->atoms != NULL) { 5566 for (i = 0;i < regexp->nbAtoms;i++) 5567 xmlRegFreeAtom(regexp->atoms[i]); 5568 xmlFree(regexp->atoms); 5569 } 5570 if (regexp->counters != NULL) 5571 xmlFree(regexp->counters); 5572 if (regexp->compact != NULL) 5573 xmlFree(regexp->compact); 5574 if (regexp->transdata != NULL) 5575 xmlFree(regexp->transdata); 5576 if (regexp->stringMap != NULL) { 5577 for (i = 0; i < regexp->nbstrings;i++) 5578 xmlFree(regexp->stringMap[i]); 5579 xmlFree(regexp->stringMap); 5580 } 5581 5582 xmlFree(regexp); 5583 } 5584 5585 #ifdef LIBXML_AUTOMATA_ENABLED 5586 /************************************************************************ 5587 * * 5588 * The Automata interface * 5589 * * 5590 ************************************************************************/ 5591 5592 /** 5593 * xmlNewAutomata: 5594 * 5595 * Create a new automata 5596 * 5597 * Returns the new object or NULL in case of failure 5598 */ 5599 xmlAutomataPtr 5600 xmlNewAutomata(void) { 5601 xmlAutomataPtr ctxt; 5602 5603 ctxt = xmlRegNewParserCtxt(NULL); 5604 if (ctxt == NULL) 5605 return(NULL); 5606 5607 /* initialize the parser */ 5608 ctxt->end = NULL; 5609 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5610 if (ctxt->start == NULL) { 5611 xmlFreeAutomata(ctxt); 5612 return(NULL); 5613 } 5614 ctxt->start->type = XML_REGEXP_START_STATE; 5615 if (xmlRegStatePush(ctxt, ctxt->start) < 0) { 5616 xmlRegFreeState(ctxt->start); 5617 xmlFreeAutomata(ctxt); 5618 return(NULL); 5619 } 5620 ctxt->flags = 0; 5621 5622 return(ctxt); 5623 } 5624 5625 /** 5626 * xmlFreeAutomata: 5627 * @am: an automata 5628 * 5629 * Free an automata 5630 */ 5631 void 5632 xmlFreeAutomata(xmlAutomataPtr am) { 5633 if (am == NULL) 5634 return; 5635 xmlRegFreeParserCtxt(am); 5636 } 5637 5638 /** 5639 * xmlAutomataSetFlags: 5640 * @am: an automata 5641 * @flags: a set of internal flags 5642 * 5643 * Set some flags on the automata 5644 */ 5645 void 5646 xmlAutomataSetFlags(xmlAutomataPtr am, int flags) { 5647 if (am == NULL) 5648 return; 5649 am->flags |= flags; 5650 } 5651 5652 /** 5653 * xmlAutomataGetInitState: 5654 * @am: an automata 5655 * 5656 * Initial state lookup 5657 * 5658 * Returns the initial state of the automata 5659 */ 5660 xmlAutomataStatePtr 5661 xmlAutomataGetInitState(xmlAutomataPtr am) { 5662 if (am == NULL) 5663 return(NULL); 5664 return(am->start); 5665 } 5666 5667 /** 5668 * xmlAutomataSetFinalState: 5669 * @am: an automata 5670 * @state: a state in this automata 5671 * 5672 * Makes that state a final state 5673 * 5674 * Returns 0 or -1 in case of error 5675 */ 5676 int 5677 xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) { 5678 if ((am == NULL) || (state == NULL)) 5679 return(-1); 5680 state->type = XML_REGEXP_FINAL_STATE; 5681 return(0); 5682 } 5683 5684 /** 5685 * xmlAutomataNewTransition: 5686 * @am: an automata 5687 * @from: the starting point of the transition 5688 * @to: the target point of the transition or NULL 5689 * @token: the input string associated to that transition 5690 * @data: data passed to the callback function if the transition is activated 5691 * 5692 * If @to is NULL, this creates first a new target state in the automata 5693 * and then adds a transition from the @from state to the target state 5694 * activated by the value of @token 5695 * 5696 * Returns the target state or NULL in case of error 5697 */ 5698 xmlAutomataStatePtr 5699 xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from, 5700 xmlAutomataStatePtr to, const xmlChar *token, 5701 void *data) { 5702 xmlRegAtomPtr atom; 5703 5704 if ((am == NULL) || (from == NULL) || (token == NULL)) 5705 return(NULL); 5706 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5707 if (atom == NULL) 5708 return(NULL); 5709 atom->data = data; 5710 if (atom == NULL) 5711 return(NULL); 5712 atom->valuep = xmlStrdup(token); 5713 5714 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5715 xmlRegFreeAtom(atom); 5716 return(NULL); 5717 } 5718 if (to == NULL) 5719 return(am->state); 5720 return(to); 5721 } 5722 5723 /** 5724 * xmlAutomataNewTransition2: 5725 * @am: an automata 5726 * @from: the starting point of the transition 5727 * @to: the target point of the transition or NULL 5728 * @token: the first input string associated to that transition 5729 * @token2: the second input string associated to that transition 5730 * @data: data passed to the callback function if the transition is activated 5731 * 5732 * If @to is NULL, this creates first a new target state in the automata 5733 * and then adds a transition from the @from state to the target state 5734 * activated by the value of @token 5735 * 5736 * Returns the target state or NULL in case of error 5737 */ 5738 xmlAutomataStatePtr 5739 xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5740 xmlAutomataStatePtr to, const xmlChar *token, 5741 const xmlChar *token2, void *data) { 5742 xmlRegAtomPtr atom; 5743 5744 if ((am == NULL) || (from == NULL) || (token == NULL)) 5745 return(NULL); 5746 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5747 if (atom == NULL) 5748 return(NULL); 5749 atom->data = data; 5750 if ((token2 == NULL) || (*token2 == 0)) { 5751 atom->valuep = xmlStrdup(token); 5752 } else { 5753 int lenn, lenp; 5754 xmlChar *str; 5755 5756 lenn = strlen((char *) token2); 5757 lenp = strlen((char *) token); 5758 5759 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5760 if (str == NULL) { 5761 xmlRegFreeAtom(atom); 5762 return(NULL); 5763 } 5764 memcpy(&str[0], token, lenp); 5765 str[lenp] = '|'; 5766 memcpy(&str[lenp + 1], token2, lenn); 5767 str[lenn + lenp + 1] = 0; 5768 5769 atom->valuep = str; 5770 } 5771 5772 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5773 xmlRegFreeAtom(atom); 5774 return(NULL); 5775 } 5776 if (to == NULL) 5777 return(am->state); 5778 return(to); 5779 } 5780 5781 /** 5782 * xmlAutomataNewNegTrans: 5783 * @am: an automata 5784 * @from: the starting point of the transition 5785 * @to: the target point of the transition or NULL 5786 * @token: the first input string associated to that transition 5787 * @token2: the second input string associated to that transition 5788 * @data: data passed to the callback function if the transition is activated 5789 * 5790 * If @to is NULL, this creates first a new target state in the automata 5791 * and then adds a transition from the @from state to the target state 5792 * activated by any value except (@token,@token2) 5793 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow 5794 # the semantic of XSD ##other 5795 * 5796 * Returns the target state or NULL in case of error 5797 */ 5798 xmlAutomataStatePtr 5799 xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5800 xmlAutomataStatePtr to, const xmlChar *token, 5801 const xmlChar *token2, void *data) { 5802 xmlRegAtomPtr atom; 5803 xmlChar err_msg[200]; 5804 5805 if ((am == NULL) || (from == NULL) || (token == NULL)) 5806 return(NULL); 5807 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5808 if (atom == NULL) 5809 return(NULL); 5810 atom->data = data; 5811 atom->neg = 1; 5812 if ((token2 == NULL) || (*token2 == 0)) { 5813 atom->valuep = xmlStrdup(token); 5814 } else { 5815 int lenn, lenp; 5816 xmlChar *str; 5817 5818 lenn = strlen((char *) token2); 5819 lenp = strlen((char *) token); 5820 5821 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5822 if (str == NULL) { 5823 xmlRegFreeAtom(atom); 5824 return(NULL); 5825 } 5826 memcpy(&str[0], token, lenp); 5827 str[lenp] = '|'; 5828 memcpy(&str[lenp + 1], token2, lenn); 5829 str[lenn + lenp + 1] = 0; 5830 5831 atom->valuep = str; 5832 } 5833 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep); 5834 err_msg[199] = 0; 5835 atom->valuep2 = xmlStrdup(err_msg); 5836 5837 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5838 xmlRegFreeAtom(atom); 5839 return(NULL); 5840 } 5841 am->negs++; 5842 if (to == NULL) 5843 return(am->state); 5844 return(to); 5845 } 5846 5847 /** 5848 * xmlAutomataNewCountTrans2: 5849 * @am: an automata 5850 * @from: the starting point of the transition 5851 * @to: the target point of the transition or NULL 5852 * @token: the input string associated to that transition 5853 * @token2: the second input string associated to that transition 5854 * @min: the minimum successive occurences of token 5855 * @max: the maximum successive occurences of token 5856 * @data: data associated to the transition 5857 * 5858 * If @to is NULL, this creates first a new target state in the automata 5859 * and then adds a transition from the @from state to the target state 5860 * activated by a succession of input of value @token and @token2 and 5861 * whose number is between @min and @max 5862 * 5863 * Returns the target state or NULL in case of error 5864 */ 5865 xmlAutomataStatePtr 5866 xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5867 xmlAutomataStatePtr to, const xmlChar *token, 5868 const xmlChar *token2, 5869 int min, int max, void *data) { 5870 xmlRegAtomPtr atom; 5871 int counter; 5872 5873 if ((am == NULL) || (from == NULL) || (token == NULL)) 5874 return(NULL); 5875 if (min < 0) 5876 return(NULL); 5877 if ((max < min) || (max < 1)) 5878 return(NULL); 5879 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5880 if (atom == NULL) 5881 return(NULL); 5882 if ((token2 == NULL) || (*token2 == 0)) { 5883 atom->valuep = xmlStrdup(token); 5884 } else { 5885 int lenn, lenp; 5886 xmlChar *str; 5887 5888 lenn = strlen((char *) token2); 5889 lenp = strlen((char *) token); 5890 5891 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5892 if (str == NULL) { 5893 xmlRegFreeAtom(atom); 5894 return(NULL); 5895 } 5896 memcpy(&str[0], token, lenp); 5897 str[lenp] = '|'; 5898 memcpy(&str[lenp + 1], token2, lenn); 5899 str[lenn + lenp + 1] = 0; 5900 5901 atom->valuep = str; 5902 } 5903 atom->data = data; 5904 if (min == 0) 5905 atom->min = 1; 5906 else 5907 atom->min = min; 5908 atom->max = max; 5909 5910 /* 5911 * associate a counter to the transition. 5912 */ 5913 counter = xmlRegGetCounter(am); 5914 am->counters[counter].min = min; 5915 am->counters[counter].max = max; 5916 5917 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5918 if (to == NULL) { 5919 to = xmlRegNewState(am); 5920 xmlRegStatePush(am, to); 5921 } 5922 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5923 xmlRegAtomPush(am, atom); 5924 am->state = to; 5925 5926 if (to == NULL) 5927 to = am->state; 5928 if (to == NULL) 5929 return(NULL); 5930 if (min == 0) 5931 xmlFAGenerateEpsilonTransition(am, from, to); 5932 return(to); 5933 } 5934 5935 /** 5936 * xmlAutomataNewCountTrans: 5937 * @am: an automata 5938 * @from: the starting point of the transition 5939 * @to: the target point of the transition or NULL 5940 * @token: the input string associated to that transition 5941 * @min: the minimum successive occurences of token 5942 * @max: the maximum successive occurences of token 5943 * @data: data associated to the transition 5944 * 5945 * If @to is NULL, this creates first a new target state in the automata 5946 * and then adds a transition from the @from state to the target state 5947 * activated by a succession of input of value @token and whose number 5948 * is between @min and @max 5949 * 5950 * Returns the target state or NULL in case of error 5951 */ 5952 xmlAutomataStatePtr 5953 xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5954 xmlAutomataStatePtr to, const xmlChar *token, 5955 int min, int max, void *data) { 5956 xmlRegAtomPtr atom; 5957 int counter; 5958 5959 if ((am == NULL) || (from == NULL) || (token == NULL)) 5960 return(NULL); 5961 if (min < 0) 5962 return(NULL); 5963 if ((max < min) || (max < 1)) 5964 return(NULL); 5965 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5966 if (atom == NULL) 5967 return(NULL); 5968 atom->valuep = xmlStrdup(token); 5969 atom->data = data; 5970 if (min == 0) 5971 atom->min = 1; 5972 else 5973 atom->min = min; 5974 atom->max = max; 5975 5976 /* 5977 * associate a counter to the transition. 5978 */ 5979 counter = xmlRegGetCounter(am); 5980 am->counters[counter].min = min; 5981 am->counters[counter].max = max; 5982 5983 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5984 if (to == NULL) { 5985 to = xmlRegNewState(am); 5986 xmlRegStatePush(am, to); 5987 } 5988 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5989 xmlRegAtomPush(am, atom); 5990 am->state = to; 5991 5992 if (to == NULL) 5993 to = am->state; 5994 if (to == NULL) 5995 return(NULL); 5996 if (min == 0) 5997 xmlFAGenerateEpsilonTransition(am, from, to); 5998 return(to); 5999 } 6000 6001 /** 6002 * xmlAutomataNewOnceTrans2: 6003 * @am: an automata 6004 * @from: the starting point of the transition 6005 * @to: the target point of the transition or NULL 6006 * @token: the input string associated to that transition 6007 * @token2: the second input string associated to that transition 6008 * @min: the minimum successive occurences of token 6009 * @max: the maximum successive occurences of token 6010 * @data: data associated to the transition 6011 * 6012 * If @to is NULL, this creates first a new target state in the automata 6013 * and then adds a transition from the @from state to the target state 6014 * activated by a succession of input of value @token and @token2 and whose 6015 * number is between @min and @max, moreover that transition can only be 6016 * crossed once. 6017 * 6018 * Returns the target state or NULL in case of error 6019 */ 6020 xmlAutomataStatePtr 6021 xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 6022 xmlAutomataStatePtr to, const xmlChar *token, 6023 const xmlChar *token2, 6024 int min, int max, void *data) { 6025 xmlRegAtomPtr atom; 6026 int counter; 6027 6028 if ((am == NULL) || (from == NULL) || (token == NULL)) 6029 return(NULL); 6030 if (min < 1) 6031 return(NULL); 6032 if ((max < min) || (max < 1)) 6033 return(NULL); 6034 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 6035 if (atom == NULL) 6036 return(NULL); 6037 if ((token2 == NULL) || (*token2 == 0)) { 6038 atom->valuep = xmlStrdup(token); 6039 } else { 6040 int lenn, lenp; 6041 xmlChar *str; 6042 6043 lenn = strlen((char *) token2); 6044 lenp = strlen((char *) token); 6045 6046 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 6047 if (str == NULL) { 6048 xmlRegFreeAtom(atom); 6049 return(NULL); 6050 } 6051 memcpy(&str[0], token, lenp); 6052 str[lenp] = '|'; 6053 memcpy(&str[lenp + 1], token2, lenn); 6054 str[lenn + lenp + 1] = 0; 6055 6056 atom->valuep = str; 6057 } 6058 atom->data = data; 6059 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 6060 atom->min = min; 6061 atom->max = max; 6062 /* 6063 * associate a counter to the transition. 6064 */ 6065 counter = xmlRegGetCounter(am); 6066 am->counters[counter].min = 1; 6067 am->counters[counter].max = 1; 6068 6069 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6070 if (to == NULL) { 6071 to = xmlRegNewState(am); 6072 xmlRegStatePush(am, to); 6073 } 6074 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6075 xmlRegAtomPush(am, atom); 6076 am->state = to; 6077 return(to); 6078 } 6079 6080 6081 6082 /** 6083 * xmlAutomataNewOnceTrans: 6084 * @am: an automata 6085 * @from: the starting point of the transition 6086 * @to: the target point of the transition or NULL 6087 * @token: the input string associated to that transition 6088 * @min: the minimum successive occurences of token 6089 * @max: the maximum successive occurences of token 6090 * @data: data associated to the transition 6091 * 6092 * If @to is NULL, this creates first a new target state in the automata 6093 * and then adds a transition from the @from state to the target state 6094 * activated by a succession of input of value @token and whose number 6095 * is between @min and @max, moreover that transition can only be crossed 6096 * once. 6097 * 6098 * Returns the target state or NULL in case of error 6099 */ 6100 xmlAutomataStatePtr 6101 xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6102 xmlAutomataStatePtr to, const xmlChar *token, 6103 int min, int max, void *data) { 6104 xmlRegAtomPtr atom; 6105 int counter; 6106 6107 if ((am == NULL) || (from == NULL) || (token == NULL)) 6108 return(NULL); 6109 if (min < 1) 6110 return(NULL); 6111 if ((max < min) || (max < 1)) 6112 return(NULL); 6113 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 6114 if (atom == NULL) 6115 return(NULL); 6116 atom->valuep = xmlStrdup(token); 6117 atom->data = data; 6118 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 6119 atom->min = min; 6120 atom->max = max; 6121 /* 6122 * associate a counter to the transition. 6123 */ 6124 counter = xmlRegGetCounter(am); 6125 am->counters[counter].min = 1; 6126 am->counters[counter].max = 1; 6127 6128 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6129 if (to == NULL) { 6130 to = xmlRegNewState(am); 6131 xmlRegStatePush(am, to); 6132 } 6133 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6134 xmlRegAtomPush(am, atom); 6135 am->state = to; 6136 return(to); 6137 } 6138 6139 /** 6140 * xmlAutomataNewState: 6141 * @am: an automata 6142 * 6143 * Create a new disconnected state in the automata 6144 * 6145 * Returns the new state or NULL in case of error 6146 */ 6147 xmlAutomataStatePtr 6148 xmlAutomataNewState(xmlAutomataPtr am) { 6149 xmlAutomataStatePtr to; 6150 6151 if (am == NULL) 6152 return(NULL); 6153 to = xmlRegNewState(am); 6154 xmlRegStatePush(am, to); 6155 return(to); 6156 } 6157 6158 /** 6159 * xmlAutomataNewEpsilon: 6160 * @am: an automata 6161 * @from: the starting point of the transition 6162 * @to: the target point of the transition or NULL 6163 * 6164 * If @to is NULL, this creates first a new target state in the automata 6165 * and then adds an epsilon transition from the @from state to the 6166 * target state 6167 * 6168 * Returns the target state or NULL in case of error 6169 */ 6170 xmlAutomataStatePtr 6171 xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from, 6172 xmlAutomataStatePtr to) { 6173 if ((am == NULL) || (from == NULL)) 6174 return(NULL); 6175 xmlFAGenerateEpsilonTransition(am, from, to); 6176 if (to == NULL) 6177 return(am->state); 6178 return(to); 6179 } 6180 6181 /** 6182 * xmlAutomataNewAllTrans: 6183 * @am: an automata 6184 * @from: the starting point of the transition 6185 * @to: the target point of the transition or NULL 6186 * @lax: allow to transition if not all all transitions have been activated 6187 * 6188 * If @to is NULL, this creates first a new target state in the automata 6189 * and then adds a an ALL transition from the @from state to the 6190 * target state. That transition is an epsilon transition allowed only when 6191 * all transitions from the @from node have been activated. 6192 * 6193 * Returns the target state or NULL in case of error 6194 */ 6195 xmlAutomataStatePtr 6196 xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6197 xmlAutomataStatePtr to, int lax) { 6198 if ((am == NULL) || (from == NULL)) 6199 return(NULL); 6200 xmlFAGenerateAllTransition(am, from, to, lax); 6201 if (to == NULL) 6202 return(am->state); 6203 return(to); 6204 } 6205 6206 /** 6207 * xmlAutomataNewCounter: 6208 * @am: an automata 6209 * @min: the minimal value on the counter 6210 * @max: the maximal value on the counter 6211 * 6212 * Create a new counter 6213 * 6214 * Returns the counter number or -1 in case of error 6215 */ 6216 int 6217 xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) { 6218 int ret; 6219 6220 if (am == NULL) 6221 return(-1); 6222 6223 ret = xmlRegGetCounter(am); 6224 if (ret < 0) 6225 return(-1); 6226 am->counters[ret].min = min; 6227 am->counters[ret].max = max; 6228 return(ret); 6229 } 6230 6231 /** 6232 * xmlAutomataNewCountedTrans: 6233 * @am: an automata 6234 * @from: the starting point of the transition 6235 * @to: the target point of the transition or NULL 6236 * @counter: the counter associated to that transition 6237 * 6238 * If @to is NULL, this creates first a new target state in the automata 6239 * and then adds an epsilon transition from the @from state to the target state 6240 * which will increment the counter provided 6241 * 6242 * Returns the target state or NULL in case of error 6243 */ 6244 xmlAutomataStatePtr 6245 xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6246 xmlAutomataStatePtr to, int counter) { 6247 if ((am == NULL) || (from == NULL) || (counter < 0)) 6248 return(NULL); 6249 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter); 6250 if (to == NULL) 6251 return(am->state); 6252 return(to); 6253 } 6254 6255 /** 6256 * xmlAutomataNewCounterTrans: 6257 * @am: an automata 6258 * @from: the starting point of the transition 6259 * @to: the target point of the transition or NULL 6260 * @counter: the counter associated to that transition 6261 * 6262 * If @to is NULL, this creates first a new target state in the automata 6263 * and then adds an epsilon transition from the @from state to the target state 6264 * which will be allowed only if the counter is within the right range. 6265 * 6266 * Returns the target state or NULL in case of error 6267 */ 6268 xmlAutomataStatePtr 6269 xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6270 xmlAutomataStatePtr to, int counter) { 6271 if ((am == NULL) || (from == NULL) || (counter < 0)) 6272 return(NULL); 6273 xmlFAGenerateCountedTransition(am, from, to, counter); 6274 if (to == NULL) 6275 return(am->state); 6276 return(to); 6277 } 6278 6279 /** 6280 * xmlAutomataCompile: 6281 * @am: an automata 6282 * 6283 * Compile the automata into a Reg Exp ready for being executed. 6284 * The automata should be free after this point. 6285 * 6286 * Returns the compiled regexp or NULL in case of error 6287 */ 6288 xmlRegexpPtr 6289 xmlAutomataCompile(xmlAutomataPtr am) { 6290 xmlRegexpPtr ret; 6291 6292 if ((am == NULL) || (am->error != 0)) return(NULL); 6293 xmlFAEliminateEpsilonTransitions(am); 6294 /* xmlFAComputesDeterminism(am); */ 6295 ret = xmlRegEpxFromParse(am); 6296 6297 return(ret); 6298 } 6299 6300 /** 6301 * xmlAutomataIsDeterminist: 6302 * @am: an automata 6303 * 6304 * Checks if an automata is determinist. 6305 * 6306 * Returns 1 if true, 0 if not, and -1 in case of error 6307 */ 6308 int 6309 xmlAutomataIsDeterminist(xmlAutomataPtr am) { 6310 int ret; 6311 6312 if (am == NULL) 6313 return(-1); 6314 6315 ret = xmlFAComputesDeterminism(am); 6316 return(ret); 6317 } 6318 #endif /* LIBXML_AUTOMATA_ENABLED */ 6319 6320 #ifdef LIBXML_EXPR_ENABLED 6321 /************************************************************************ 6322 * * 6323 * Formal Expression handling code * 6324 * * 6325 ************************************************************************/ 6326 /************************************************************************ 6327 * * 6328 * Expression handling context * 6329 * * 6330 ************************************************************************/ 6331 6332 struct _xmlExpCtxt { 6333 xmlDictPtr dict; 6334 xmlExpNodePtr *table; 6335 int size; 6336 int nbElems; 6337 int nb_nodes; 6338 int maxNodes; 6339 const char *expr; 6340 const char *cur; 6341 int nb_cons; 6342 int tabSize; 6343 }; 6344 6345 /** 6346 * xmlExpNewCtxt: 6347 * @maxNodes: the maximum number of nodes 6348 * @dict: optional dictionnary to use internally 6349 * 6350 * Creates a new context for manipulating expressions 6351 * 6352 * Returns the context or NULL in case of error 6353 */ 6354 xmlExpCtxtPtr 6355 xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) { 6356 xmlExpCtxtPtr ret; 6357 int size = 256; 6358 6359 if (maxNodes <= 4096) 6360 maxNodes = 4096; 6361 6362 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt)); 6363 if (ret == NULL) 6364 return(NULL); 6365 memset(ret, 0, sizeof(xmlExpCtxt)); 6366 ret->size = size; 6367 ret->nbElems = 0; 6368 ret->maxNodes = maxNodes; 6369 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr)); 6370 if (ret->table == NULL) { 6371 xmlFree(ret); 6372 return(NULL); 6373 } 6374 memset(ret->table, 0, size * sizeof(xmlExpNodePtr)); 6375 if (dict == NULL) { 6376 ret->dict = xmlDictCreate(); 6377 if (ret->dict == NULL) { 6378 xmlFree(ret->table); 6379 xmlFree(ret); 6380 return(NULL); 6381 } 6382 } else { 6383 ret->dict = dict; 6384 xmlDictReference(ret->dict); 6385 } 6386 return(ret); 6387 } 6388 6389 /** 6390 * xmlExpFreeCtxt: 6391 * @ctxt: an expression context 6392 * 6393 * Free an expression context 6394 */ 6395 void 6396 xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) { 6397 if (ctxt == NULL) 6398 return; 6399 xmlDictFree(ctxt->dict); 6400 if (ctxt->table != NULL) 6401 xmlFree(ctxt->table); 6402 xmlFree(ctxt); 6403 } 6404 6405 /************************************************************************ 6406 * * 6407 * Structure associated to an expression node * 6408 * * 6409 ************************************************************************/ 6410 #define MAX_NODES 10000 6411 6412 /* #define DEBUG_DERIV */ 6413 6414 /* 6415 * TODO: 6416 * - Wildcards 6417 * - public API for creation 6418 * 6419 * Started 6420 * - regression testing 6421 * 6422 * Done 6423 * - split into module and test tool 6424 * - memleaks 6425 */ 6426 6427 typedef enum { 6428 XML_EXP_NILABLE = (1 << 0) 6429 } xmlExpNodeInfo; 6430 6431 #define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE) 6432 6433 struct _xmlExpNode { 6434 unsigned char type;/* xmlExpNodeType */ 6435 unsigned char info;/* OR of xmlExpNodeInfo */ 6436 unsigned short key; /* the hash key */ 6437 unsigned int ref; /* The number of references */ 6438 int c_max; /* the maximum length it can consume */ 6439 xmlExpNodePtr exp_left; 6440 xmlExpNodePtr next;/* the next node in the hash table or free list */ 6441 union { 6442 struct { 6443 int f_min; 6444 int f_max; 6445 } count; 6446 struct { 6447 xmlExpNodePtr f_right; 6448 } children; 6449 const xmlChar *f_str; 6450 } field; 6451 }; 6452 6453 #define exp_min field.count.f_min 6454 #define exp_max field.count.f_max 6455 /* #define exp_left field.children.f_left */ 6456 #define exp_right field.children.f_right 6457 #define exp_str field.f_str 6458 6459 static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type); 6460 static xmlExpNode forbiddenExpNode = { 6461 XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6462 }; 6463 xmlExpNodePtr forbiddenExp = &forbiddenExpNode; 6464 static xmlExpNode emptyExpNode = { 6465 XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6466 }; 6467 xmlExpNodePtr emptyExp = &emptyExpNode; 6468 6469 /************************************************************************ 6470 * * 6471 * The custom hash table for unicity and canonicalization * 6472 * of sub-expressions pointers * 6473 * * 6474 ************************************************************************/ 6475 /* 6476 * xmlExpHashNameComputeKey: 6477 * Calculate the hash key for a token 6478 */ 6479 static unsigned short 6480 xmlExpHashNameComputeKey(const xmlChar *name) { 6481 unsigned short value = 0L; 6482 char ch; 6483 6484 if (name != NULL) { 6485 value += 30 * (*name); 6486 while ((ch = *name++) != 0) { 6487 value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch); 6488 } 6489 } 6490 return (value); 6491 } 6492 6493 /* 6494 * xmlExpHashComputeKey: 6495 * Calculate the hash key for a compound expression 6496 */ 6497 static unsigned short 6498 xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left, 6499 xmlExpNodePtr right) { 6500 unsigned long value; 6501 unsigned short ret; 6502 6503 switch (type) { 6504 case XML_EXP_SEQ: 6505 value = left->key; 6506 value += right->key; 6507 value *= 3; 6508 ret = (unsigned short) value; 6509 break; 6510 case XML_EXP_OR: 6511 value = left->key; 6512 value += right->key; 6513 value *= 7; 6514 ret = (unsigned short) value; 6515 break; 6516 case XML_EXP_COUNT: 6517 value = left->key; 6518 value += right->key; 6519 ret = (unsigned short) value; 6520 break; 6521 default: 6522 ret = 0; 6523 } 6524 return(ret); 6525 } 6526 6527 6528 static xmlExpNodePtr 6529 xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) { 6530 xmlExpNodePtr ret; 6531 6532 if (ctxt->nb_nodes >= MAX_NODES) 6533 return(NULL); 6534 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode)); 6535 if (ret == NULL) 6536 return(NULL); 6537 memset(ret, 0, sizeof(xmlExpNode)); 6538 ret->type = type; 6539 ret->next = NULL; 6540 ctxt->nb_nodes++; 6541 ctxt->nb_cons++; 6542 return(ret); 6543 } 6544 6545 /** 6546 * xmlExpHashGetEntry: 6547 * @table: the hash table 6548 * 6549 * Get the unique entry from the hash table. The entry is created if 6550 * needed. @left and @right are consumed, i.e. their ref count will 6551 * be decremented by the operation. 6552 * 6553 * Returns the pointer or NULL in case of error 6554 */ 6555 static xmlExpNodePtr 6556 xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type, 6557 xmlExpNodePtr left, xmlExpNodePtr right, 6558 const xmlChar *name, int min, int max) { 6559 unsigned short kbase, key; 6560 xmlExpNodePtr entry; 6561 xmlExpNodePtr insert; 6562 6563 if (ctxt == NULL) 6564 return(NULL); 6565 6566 /* 6567 * Check for duplicate and insertion location. 6568 */ 6569 if (type == XML_EXP_ATOM) { 6570 kbase = xmlExpHashNameComputeKey(name); 6571 } else if (type == XML_EXP_COUNT) { 6572 /* COUNT reduction rule 1 */ 6573 /* a{1} -> a */ 6574 if (min == max) { 6575 if (min == 1) { 6576 return(left); 6577 } 6578 if (min == 0) { 6579 xmlExpFree(ctxt, left); 6580 return(emptyExp); 6581 } 6582 } 6583 if (min < 0) { 6584 xmlExpFree(ctxt, left); 6585 return(forbiddenExp); 6586 } 6587 if (max == -1) 6588 kbase = min + 79; 6589 else 6590 kbase = max - min; 6591 kbase += left->key; 6592 } else if (type == XML_EXP_OR) { 6593 /* Forbid reduction rules */ 6594 if (left->type == XML_EXP_FORBID) { 6595 xmlExpFree(ctxt, left); 6596 return(right); 6597 } 6598 if (right->type == XML_EXP_FORBID) { 6599 xmlExpFree(ctxt, right); 6600 return(left); 6601 } 6602 6603 /* OR reduction rule 1 */ 6604 /* a | a reduced to a */ 6605 if (left == right) { 6606 left->ref--; 6607 return(left); 6608 } 6609 /* OR canonicalization rule 1 */ 6610 /* linearize (a | b) | c into a | (b | c) */ 6611 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) { 6612 xmlExpNodePtr tmp = left; 6613 left = right; 6614 right = tmp; 6615 } 6616 /* OR reduction rule 2 */ 6617 /* a | (a | b) and b | (a | b) are reduced to a | b */ 6618 if (right->type == XML_EXP_OR) { 6619 if ((left == right->exp_left) || 6620 (left == right->exp_right)) { 6621 xmlExpFree(ctxt, left); 6622 return(right); 6623 } 6624 } 6625 /* OR canonicalization rule 2 */ 6626 /* linearize (a | b) | c into a | (b | c) */ 6627 if (left->type == XML_EXP_OR) { 6628 xmlExpNodePtr tmp; 6629 6630 /* OR canonicalization rule 2 */ 6631 if ((left->exp_right->type != XML_EXP_OR) && 6632 (left->exp_right->key < left->exp_left->key)) { 6633 tmp = left->exp_right; 6634 left->exp_right = left->exp_left; 6635 left->exp_left = tmp; 6636 } 6637 left->exp_right->ref++; 6638 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right, 6639 NULL, 0, 0); 6640 left->exp_left->ref++; 6641 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp, 6642 NULL, 0, 0); 6643 6644 xmlExpFree(ctxt, left); 6645 return(tmp); 6646 } 6647 if (right->type == XML_EXP_OR) { 6648 /* Ordering in the tree */ 6649 /* C | (A | B) -> A | (B | C) */ 6650 if (left->key > right->exp_right->key) { 6651 xmlExpNodePtr tmp; 6652 right->exp_right->ref++; 6653 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right, 6654 left, NULL, 0, 0); 6655 right->exp_left->ref++; 6656 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6657 tmp, NULL, 0, 0); 6658 xmlExpFree(ctxt, right); 6659 return(tmp); 6660 } 6661 /* Ordering in the tree */ 6662 /* B | (A | C) -> A | (B | C) */ 6663 if (left->key > right->exp_left->key) { 6664 xmlExpNodePtr tmp; 6665 right->exp_right->ref++; 6666 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, 6667 right->exp_right, NULL, 0, 0); 6668 right->exp_left->ref++; 6669 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6670 tmp, NULL, 0, 0); 6671 xmlExpFree(ctxt, right); 6672 return(tmp); 6673 } 6674 } 6675 /* we know both types are != XML_EXP_OR here */ 6676 else if (left->key > right->key) { 6677 xmlExpNodePtr tmp = left; 6678 left = right; 6679 right = tmp; 6680 } 6681 kbase = xmlExpHashComputeKey(type, left, right); 6682 } else if (type == XML_EXP_SEQ) { 6683 /* Forbid reduction rules */ 6684 if (left->type == XML_EXP_FORBID) { 6685 xmlExpFree(ctxt, right); 6686 return(left); 6687 } 6688 if (right->type == XML_EXP_FORBID) { 6689 xmlExpFree(ctxt, left); 6690 return(right); 6691 } 6692 /* Empty reduction rules */ 6693 if (right->type == XML_EXP_EMPTY) { 6694 return(left); 6695 } 6696 if (left->type == XML_EXP_EMPTY) { 6697 return(right); 6698 } 6699 kbase = xmlExpHashComputeKey(type, left, right); 6700 } else 6701 return(NULL); 6702 6703 key = kbase % ctxt->size; 6704 if (ctxt->table[key] != NULL) { 6705 for (insert = ctxt->table[key]; insert != NULL; 6706 insert = insert->next) { 6707 if ((insert->key == kbase) && 6708 (insert->type == type)) { 6709 if (type == XML_EXP_ATOM) { 6710 if (name == insert->exp_str) { 6711 insert->ref++; 6712 return(insert); 6713 } 6714 } else if (type == XML_EXP_COUNT) { 6715 if ((insert->exp_min == min) && (insert->exp_max == max) && 6716 (insert->exp_left == left)) { 6717 insert->ref++; 6718 left->ref--; 6719 return(insert); 6720 } 6721 } else if ((insert->exp_left == left) && 6722 (insert->exp_right == right)) { 6723 insert->ref++; 6724 left->ref--; 6725 right->ref--; 6726 return(insert); 6727 } 6728 } 6729 } 6730 } 6731 6732 entry = xmlExpNewNode(ctxt, type); 6733 if (entry == NULL) 6734 return(NULL); 6735 entry->key = kbase; 6736 if (type == XML_EXP_ATOM) { 6737 entry->exp_str = name; 6738 entry->c_max = 1; 6739 } else if (type == XML_EXP_COUNT) { 6740 entry->exp_min = min; 6741 entry->exp_max = max; 6742 entry->exp_left = left; 6743 if ((min == 0) || (IS_NILLABLE(left))) 6744 entry->info |= XML_EXP_NILABLE; 6745 if (max < 0) 6746 entry->c_max = -1; 6747 else 6748 entry->c_max = max * entry->exp_left->c_max; 6749 } else { 6750 entry->exp_left = left; 6751 entry->exp_right = right; 6752 if (type == XML_EXP_OR) { 6753 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right))) 6754 entry->info |= XML_EXP_NILABLE; 6755 if ((entry->exp_left->c_max == -1) || 6756 (entry->exp_right->c_max == -1)) 6757 entry->c_max = -1; 6758 else if (entry->exp_left->c_max > entry->exp_right->c_max) 6759 entry->c_max = entry->exp_left->c_max; 6760 else 6761 entry->c_max = entry->exp_right->c_max; 6762 } else { 6763 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right))) 6764 entry->info |= XML_EXP_NILABLE; 6765 if ((entry->exp_left->c_max == -1) || 6766 (entry->exp_right->c_max == -1)) 6767 entry->c_max = -1; 6768 else 6769 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max; 6770 } 6771 } 6772 entry->ref = 1; 6773 if (ctxt->table[key] != NULL) 6774 entry->next = ctxt->table[key]; 6775 6776 ctxt->table[key] = entry; 6777 ctxt->nbElems++; 6778 6779 return(entry); 6780 } 6781 6782 /** 6783 * xmlExpFree: 6784 * @ctxt: the expression context 6785 * @exp: the expression 6786 * 6787 * Dereference the expression 6788 */ 6789 void 6790 xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) { 6791 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp)) 6792 return; 6793 exp->ref--; 6794 if (exp->ref == 0) { 6795 unsigned short key; 6796 6797 /* Unlink it first from the hash table */ 6798 key = exp->key % ctxt->size; 6799 if (ctxt->table[key] == exp) { 6800 ctxt->table[key] = exp->next; 6801 } else { 6802 xmlExpNodePtr tmp; 6803 6804 tmp = ctxt->table[key]; 6805 while (tmp != NULL) { 6806 if (tmp->next == exp) { 6807 tmp->next = exp->next; 6808 break; 6809 } 6810 tmp = tmp->next; 6811 } 6812 } 6813 6814 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) { 6815 xmlExpFree(ctxt, exp->exp_left); 6816 xmlExpFree(ctxt, exp->exp_right); 6817 } else if (exp->type == XML_EXP_COUNT) { 6818 xmlExpFree(ctxt, exp->exp_left); 6819 } 6820 xmlFree(exp); 6821 ctxt->nb_nodes--; 6822 } 6823 } 6824 6825 /** 6826 * xmlExpRef: 6827 * @exp: the expression 6828 * 6829 * Increase the reference count of the expression 6830 */ 6831 void 6832 xmlExpRef(xmlExpNodePtr exp) { 6833 if (exp != NULL) 6834 exp->ref++; 6835 } 6836 6837 /** 6838 * xmlExpNewAtom: 6839 * @ctxt: the expression context 6840 * @name: the atom name 6841 * @len: the atom name length in byte (or -1); 6842 * 6843 * Get the atom associated to this name from that context 6844 * 6845 * Returns the node or NULL in case of error 6846 */ 6847 xmlExpNodePtr 6848 xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) { 6849 if ((ctxt == NULL) || (name == NULL)) 6850 return(NULL); 6851 name = xmlDictLookup(ctxt->dict, name, len); 6852 if (name == NULL) 6853 return(NULL); 6854 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0)); 6855 } 6856 6857 /** 6858 * xmlExpNewOr: 6859 * @ctxt: the expression context 6860 * @left: left expression 6861 * @right: right expression 6862 * 6863 * Get the atom associated to the choice @left | @right 6864 * Note that @left and @right are consumed in the operation, to keep 6865 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6866 * this is true even in case of failure (unless ctxt == NULL). 6867 * 6868 * Returns the node or NULL in case of error 6869 */ 6870 xmlExpNodePtr 6871 xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6872 if (ctxt == NULL) 6873 return(NULL); 6874 if ((left == NULL) || (right == NULL)) { 6875 xmlExpFree(ctxt, left); 6876 xmlExpFree(ctxt, right); 6877 return(NULL); 6878 } 6879 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0)); 6880 } 6881 6882 /** 6883 * xmlExpNewSeq: 6884 * @ctxt: the expression context 6885 * @left: left expression 6886 * @right: right expression 6887 * 6888 * Get the atom associated to the sequence @left , @right 6889 * Note that @left and @right are consumed in the operation, to keep 6890 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6891 * this is true even in case of failure (unless ctxt == NULL). 6892 * 6893 * Returns the node or NULL in case of error 6894 */ 6895 xmlExpNodePtr 6896 xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6897 if (ctxt == NULL) 6898 return(NULL); 6899 if ((left == NULL) || (right == NULL)) { 6900 xmlExpFree(ctxt, left); 6901 xmlExpFree(ctxt, right); 6902 return(NULL); 6903 } 6904 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0)); 6905 } 6906 6907 /** 6908 * xmlExpNewRange: 6909 * @ctxt: the expression context 6910 * @subset: the expression to be repeated 6911 * @min: the lower bound for the repetition 6912 * @max: the upper bound for the repetition, -1 means infinite 6913 * 6914 * Get the atom associated to the range (@subset){@min, @max} 6915 * Note that @subset is consumed in the operation, to keep 6916 * an handle on it use xmlExpRef() and use xmlExpFree() to release it, 6917 * this is true even in case of failure (unless ctxt == NULL). 6918 * 6919 * Returns the node or NULL in case of error 6920 */ 6921 xmlExpNodePtr 6922 xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) { 6923 if (ctxt == NULL) 6924 return(NULL); 6925 if ((subset == NULL) || (min < 0) || (max < -1) || 6926 ((max >= 0) && (min > max))) { 6927 xmlExpFree(ctxt, subset); 6928 return(NULL); 6929 } 6930 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset, 6931 NULL, NULL, min, max)); 6932 } 6933 6934 /************************************************************************ 6935 * * 6936 * Public API for operations on expressions * 6937 * * 6938 ************************************************************************/ 6939 6940 static int 6941 xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6942 const xmlChar**list, int len, int nb) { 6943 int tmp, tmp2; 6944 tail: 6945 switch (exp->type) { 6946 case XML_EXP_EMPTY: 6947 return(0); 6948 case XML_EXP_ATOM: 6949 for (tmp = 0;tmp < nb;tmp++) 6950 if (list[tmp] == exp->exp_str) 6951 return(0); 6952 if (nb >= len) 6953 return(-2); 6954 list[nb] = exp->exp_str; 6955 return(1); 6956 case XML_EXP_COUNT: 6957 exp = exp->exp_left; 6958 goto tail; 6959 case XML_EXP_SEQ: 6960 case XML_EXP_OR: 6961 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb); 6962 if (tmp < 0) 6963 return(tmp); 6964 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len, 6965 nb + tmp); 6966 if (tmp2 < 0) 6967 return(tmp2); 6968 return(tmp + tmp2); 6969 } 6970 return(-1); 6971 } 6972 6973 /** 6974 * xmlExpGetLanguage: 6975 * @ctxt: the expression context 6976 * @exp: the expression 6977 * @langList: where to store the tokens 6978 * @len: the allocated length of @list 6979 * 6980 * Find all the strings used in @exp and store them in @list 6981 * 6982 * Returns the number of unique strings found, -1 in case of errors and 6983 * -2 if there is more than @len strings 6984 */ 6985 int 6986 xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6987 const xmlChar**langList, int len) { 6988 if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0)) 6989 return(-1); 6990 return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0)); 6991 } 6992 6993 static int 6994 xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6995 const xmlChar**list, int len, int nb) { 6996 int tmp, tmp2; 6997 tail: 6998 switch (exp->type) { 6999 case XML_EXP_FORBID: 7000 return(0); 7001 case XML_EXP_EMPTY: 7002 return(0); 7003 case XML_EXP_ATOM: 7004 for (tmp = 0;tmp < nb;tmp++) 7005 if (list[tmp] == exp->exp_str) 7006 return(0); 7007 if (nb >= len) 7008 return(-2); 7009 list[nb] = exp->exp_str; 7010 return(1); 7011 case XML_EXP_COUNT: 7012 exp = exp->exp_left; 7013 goto tail; 7014 case XML_EXP_SEQ: 7015 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 7016 if (tmp < 0) 7017 return(tmp); 7018 if (IS_NILLABLE(exp->exp_left)) { 7019 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 7020 nb + tmp); 7021 if (tmp2 < 0) 7022 return(tmp2); 7023 tmp += tmp2; 7024 } 7025 return(tmp); 7026 case XML_EXP_OR: 7027 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 7028 if (tmp < 0) 7029 return(tmp); 7030 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 7031 nb + tmp); 7032 if (tmp2 < 0) 7033 return(tmp2); 7034 return(tmp + tmp2); 7035 } 7036 return(-1); 7037 } 7038 7039 /** 7040 * xmlExpGetStart: 7041 * @ctxt: the expression context 7042 * @exp: the expression 7043 * @tokList: where to store the tokens 7044 * @len: the allocated length of @list 7045 * 7046 * Find all the strings that appears at the start of the languages 7047 * accepted by @exp and store them in @list. E.g. for (a, b) | c 7048 * it will return the list [a, c] 7049 * 7050 * Returns the number of unique strings found, -1 in case of errors and 7051 * -2 if there is more than @len strings 7052 */ 7053 int 7054 xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7055 const xmlChar**tokList, int len) { 7056 if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0)) 7057 return(-1); 7058 return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0)); 7059 } 7060 7061 /** 7062 * xmlExpIsNillable: 7063 * @exp: the expression 7064 * 7065 * Finds if the expression is nillable, i.e. if it accepts the empty sequqnce 7066 * 7067 * Returns 1 if nillable, 0 if not and -1 in case of error 7068 */ 7069 int 7070 xmlExpIsNillable(xmlExpNodePtr exp) { 7071 if (exp == NULL) 7072 return(-1); 7073 return(IS_NILLABLE(exp) != 0); 7074 } 7075 7076 static xmlExpNodePtr 7077 xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str) 7078 { 7079 xmlExpNodePtr ret; 7080 7081 switch (exp->type) { 7082 case XML_EXP_EMPTY: 7083 return(forbiddenExp); 7084 case XML_EXP_FORBID: 7085 return(forbiddenExp); 7086 case XML_EXP_ATOM: 7087 if (exp->exp_str == str) { 7088 #ifdef DEBUG_DERIV 7089 printf("deriv atom: equal => Empty\n"); 7090 #endif 7091 ret = emptyExp; 7092 } else { 7093 #ifdef DEBUG_DERIV 7094 printf("deriv atom: mismatch => forbid\n"); 7095 #endif 7096 /* TODO wildcards here */ 7097 ret = forbiddenExp; 7098 } 7099 return(ret); 7100 case XML_EXP_OR: { 7101 xmlExpNodePtr tmp; 7102 7103 #ifdef DEBUG_DERIV 7104 printf("deriv or: => or(derivs)\n"); 7105 #endif 7106 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7107 if (tmp == NULL) { 7108 return(NULL); 7109 } 7110 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 7111 if (ret == NULL) { 7112 xmlExpFree(ctxt, tmp); 7113 return(NULL); 7114 } 7115 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, 7116 NULL, 0, 0); 7117 return(ret); 7118 } 7119 case XML_EXP_SEQ: 7120 #ifdef DEBUG_DERIV 7121 printf("deriv seq: starting with left\n"); 7122 #endif 7123 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7124 if (ret == NULL) { 7125 return(NULL); 7126 } else if (ret == forbiddenExp) { 7127 if (IS_NILLABLE(exp->exp_left)) { 7128 #ifdef DEBUG_DERIV 7129 printf("deriv seq: left failed but nillable\n"); 7130 #endif 7131 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 7132 } 7133 } else { 7134 #ifdef DEBUG_DERIV 7135 printf("deriv seq: left match => sequence\n"); 7136 #endif 7137 exp->exp_right->ref++; 7138 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right, 7139 NULL, 0, 0); 7140 } 7141 return(ret); 7142 case XML_EXP_COUNT: { 7143 int min, max; 7144 xmlExpNodePtr tmp; 7145 7146 if (exp->exp_max == 0) 7147 return(forbiddenExp); 7148 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7149 if (ret == NULL) 7150 return(NULL); 7151 if (ret == forbiddenExp) { 7152 #ifdef DEBUG_DERIV 7153 printf("deriv count: pattern mismatch => forbid\n"); 7154 #endif 7155 return(ret); 7156 } 7157 if (exp->exp_max == 1) 7158 return(ret); 7159 if (exp->exp_max < 0) /* unbounded */ 7160 max = -1; 7161 else 7162 max = exp->exp_max - 1; 7163 if (exp->exp_min > 0) 7164 min = exp->exp_min - 1; 7165 else 7166 min = 0; 7167 exp->exp_left->ref++; 7168 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL, 7169 NULL, min, max); 7170 if (ret == emptyExp) { 7171 #ifdef DEBUG_DERIV 7172 printf("deriv count: match to empty => new count\n"); 7173 #endif 7174 return(tmp); 7175 } 7176 #ifdef DEBUG_DERIV 7177 printf("deriv count: match => sequence with new count\n"); 7178 #endif 7179 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp, 7180 NULL, 0, 0)); 7181 } 7182 } 7183 return(NULL); 7184 } 7185 7186 /** 7187 * xmlExpStringDerive: 7188 * @ctxt: the expression context 7189 * @exp: the expression 7190 * @str: the string 7191 * @len: the string len in bytes if available 7192 * 7193 * Do one step of Brzozowski derivation of the expression @exp with 7194 * respect to the input string 7195 * 7196 * Returns the resulting expression or NULL in case of internal error 7197 */ 7198 xmlExpNodePtr 7199 xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7200 const xmlChar *str, int len) { 7201 const xmlChar *input; 7202 7203 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) { 7204 return(NULL); 7205 } 7206 /* 7207 * check the string is in the dictionnary, if yes use an interned 7208 * copy, otherwise we know it's not an acceptable input 7209 */ 7210 input = xmlDictExists(ctxt->dict, str, len); 7211 if (input == NULL) { 7212 return(forbiddenExp); 7213 } 7214 return(xmlExpStringDeriveInt(ctxt, exp, input)); 7215 } 7216 7217 static int 7218 xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) { 7219 int ret = 1; 7220 7221 if (sub->c_max == -1) { 7222 if (exp->c_max != -1) 7223 ret = 0; 7224 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) { 7225 ret = 0; 7226 } 7227 #if 0 7228 if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp))) 7229 ret = 0; 7230 #endif 7231 return(ret); 7232 } 7233 7234 static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7235 xmlExpNodePtr sub); 7236 /** 7237 * xmlExpDivide: 7238 * @ctxt: the expressions context 7239 * @exp: the englobing expression 7240 * @sub: the subexpression 7241 * @mult: the multiple expression 7242 * @remain: the remain from the derivation of the multiple 7243 * 7244 * Check if exp is a multiple of sub, i.e. if there is a finite number n 7245 * so that sub{n} subsume exp 7246 * 7247 * Returns the multiple value if successful, 0 if it is not a multiple 7248 * and -1 in case of internel error. 7249 */ 7250 7251 static int 7252 xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub, 7253 xmlExpNodePtr *mult, xmlExpNodePtr *remain) { 7254 int i; 7255 xmlExpNodePtr tmp, tmp2; 7256 7257 if (mult != NULL) *mult = NULL; 7258 if (remain != NULL) *remain = NULL; 7259 if (exp->c_max == -1) return(0); 7260 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0); 7261 7262 for (i = 1;i <= exp->c_max;i++) { 7263 sub->ref++; 7264 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7265 sub, NULL, NULL, i, i); 7266 if (tmp == NULL) { 7267 return(-1); 7268 } 7269 if (!xmlExpCheckCard(tmp, exp)) { 7270 xmlExpFree(ctxt, tmp); 7271 continue; 7272 } 7273 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp); 7274 if (tmp2 == NULL) { 7275 xmlExpFree(ctxt, tmp); 7276 return(-1); 7277 } 7278 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) { 7279 if (remain != NULL) 7280 *remain = tmp2; 7281 else 7282 xmlExpFree(ctxt, tmp2); 7283 if (mult != NULL) 7284 *mult = tmp; 7285 else 7286 xmlExpFree(ctxt, tmp); 7287 #ifdef DEBUG_DERIV 7288 printf("Divide succeeded %d\n", i); 7289 #endif 7290 return(i); 7291 } 7292 xmlExpFree(ctxt, tmp); 7293 xmlExpFree(ctxt, tmp2); 7294 } 7295 #ifdef DEBUG_DERIV 7296 printf("Divide failed\n"); 7297 #endif 7298 return(0); 7299 } 7300 7301 /** 7302 * xmlExpExpDeriveInt: 7303 * @ctxt: the expressions context 7304 * @exp: the englobing expression 7305 * @sub: the subexpression 7306 * 7307 * Try to do a step of Brzozowski derivation but at a higher level 7308 * the input being a subexpression. 7309 * 7310 * Returns the resulting expression or NULL in case of internal error 7311 */ 7312 static xmlExpNodePtr 7313 xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7314 xmlExpNodePtr ret, tmp, tmp2, tmp3; 7315 const xmlChar **tab; 7316 int len, i; 7317 7318 /* 7319 * In case of equality and if the expression can only consume a finite 7320 * amount, then the derivation is empty 7321 */ 7322 if ((exp == sub) && (exp->c_max >= 0)) { 7323 #ifdef DEBUG_DERIV 7324 printf("Equal(exp, sub) and finite -> Empty\n"); 7325 #endif 7326 return(emptyExp); 7327 } 7328 /* 7329 * decompose sub sequence first 7330 */ 7331 if (sub->type == XML_EXP_EMPTY) { 7332 #ifdef DEBUG_DERIV 7333 printf("Empty(sub) -> Empty\n"); 7334 #endif 7335 exp->ref++; 7336 return(exp); 7337 } 7338 if (sub->type == XML_EXP_SEQ) { 7339 #ifdef DEBUG_DERIV 7340 printf("Seq(sub) -> decompose\n"); 7341 #endif 7342 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7343 if (tmp == NULL) 7344 return(NULL); 7345 if (tmp == forbiddenExp) 7346 return(tmp); 7347 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right); 7348 xmlExpFree(ctxt, tmp); 7349 return(ret); 7350 } 7351 if (sub->type == XML_EXP_OR) { 7352 #ifdef DEBUG_DERIV 7353 printf("Or(sub) -> decompose\n"); 7354 #endif 7355 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7356 if (tmp == forbiddenExp) 7357 return(tmp); 7358 if (tmp == NULL) 7359 return(NULL); 7360 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right); 7361 if ((ret == NULL) || (ret == forbiddenExp)) { 7362 xmlExpFree(ctxt, tmp); 7363 return(ret); 7364 } 7365 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0)); 7366 } 7367 if (!xmlExpCheckCard(exp, sub)) { 7368 #ifdef DEBUG_DERIV 7369 printf("CheckCard(exp, sub) failed -> Forbid\n"); 7370 #endif 7371 return(forbiddenExp); 7372 } 7373 switch (exp->type) { 7374 case XML_EXP_EMPTY: 7375 if (sub == emptyExp) 7376 return(emptyExp); 7377 #ifdef DEBUG_DERIV 7378 printf("Empty(exp) -> Forbid\n"); 7379 #endif 7380 return(forbiddenExp); 7381 case XML_EXP_FORBID: 7382 #ifdef DEBUG_DERIV 7383 printf("Forbid(exp) -> Forbid\n"); 7384 #endif 7385 return(forbiddenExp); 7386 case XML_EXP_ATOM: 7387 if (sub->type == XML_EXP_ATOM) { 7388 /* TODO: handle wildcards */ 7389 if (exp->exp_str == sub->exp_str) { 7390 #ifdef DEBUG_DERIV 7391 printf("Atom match -> Empty\n"); 7392 #endif 7393 return(emptyExp); 7394 } 7395 #ifdef DEBUG_DERIV 7396 printf("Atom mismatch -> Forbid\n"); 7397 #endif 7398 return(forbiddenExp); 7399 } 7400 if ((sub->type == XML_EXP_COUNT) && 7401 (sub->exp_max == 1) && 7402 (sub->exp_left->type == XML_EXP_ATOM)) { 7403 /* TODO: handle wildcards */ 7404 if (exp->exp_str == sub->exp_left->exp_str) { 7405 #ifdef DEBUG_DERIV 7406 printf("Atom match -> Empty\n"); 7407 #endif 7408 return(emptyExp); 7409 } 7410 #ifdef DEBUG_DERIV 7411 printf("Atom mismatch -> Forbid\n"); 7412 #endif 7413 return(forbiddenExp); 7414 } 7415 #ifdef DEBUG_DERIV 7416 printf("Compex exp vs Atom -> Forbid\n"); 7417 #endif 7418 return(forbiddenExp); 7419 case XML_EXP_SEQ: 7420 /* try to get the sequence consumed only if possible */ 7421 if (xmlExpCheckCard(exp->exp_left, sub)) { 7422 /* See if the sequence can be consumed directly */ 7423 #ifdef DEBUG_DERIV 7424 printf("Seq trying left only\n"); 7425 #endif 7426 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7427 if ((ret != forbiddenExp) && (ret != NULL)) { 7428 #ifdef DEBUG_DERIV 7429 printf("Seq trying left only worked\n"); 7430 #endif 7431 /* 7432 * TODO: assumption here that we are determinist 7433 * i.e. we won't get to a nillable exp left 7434 * subset which could be matched by the right 7435 * part too. 7436 * e.g.: (a | b)+,(a | c) and 'a+,a' 7437 */ 7438 exp->exp_right->ref++; 7439 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7440 exp->exp_right, NULL, 0, 0)); 7441 } 7442 #ifdef DEBUG_DERIV 7443 } else { 7444 printf("Seq: left too short\n"); 7445 #endif 7446 } 7447 /* Try instead to decompose */ 7448 if (sub->type == XML_EXP_COUNT) { 7449 int min, max; 7450 7451 #ifdef DEBUG_DERIV 7452 printf("Seq: sub is a count\n"); 7453 #endif 7454 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7455 if (ret == NULL) 7456 return(NULL); 7457 if (ret != forbiddenExp) { 7458 #ifdef DEBUG_DERIV 7459 printf("Seq , Count match on left\n"); 7460 #endif 7461 if (sub->exp_max < 0) 7462 max = -1; 7463 else 7464 max = sub->exp_max -1; 7465 if (sub->exp_min > 0) 7466 min = sub->exp_min -1; 7467 else 7468 min = 0; 7469 exp->exp_right->ref++; 7470 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7471 exp->exp_right, NULL, 0, 0); 7472 if (tmp == NULL) 7473 return(NULL); 7474 7475 sub->exp_left->ref++; 7476 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7477 sub->exp_left, NULL, NULL, min, max); 7478 if (tmp2 == NULL) { 7479 xmlExpFree(ctxt, tmp); 7480 return(NULL); 7481 } 7482 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7483 xmlExpFree(ctxt, tmp); 7484 xmlExpFree(ctxt, tmp2); 7485 return(ret); 7486 } 7487 } 7488 /* we made no progress on structured operations */ 7489 break; 7490 case XML_EXP_OR: 7491 #ifdef DEBUG_DERIV 7492 printf("Or , trying both side\n"); 7493 #endif 7494 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7495 if (ret == NULL) 7496 return(NULL); 7497 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub); 7498 if (tmp == NULL) { 7499 xmlExpFree(ctxt, ret); 7500 return(NULL); 7501 } 7502 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0)); 7503 case XML_EXP_COUNT: { 7504 int min, max; 7505 7506 if (sub->type == XML_EXP_COUNT) { 7507 /* 7508 * Try to see if the loop is completely subsumed 7509 */ 7510 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7511 if (tmp == NULL) 7512 return(NULL); 7513 if (tmp == forbiddenExp) { 7514 int mult; 7515 7516 #ifdef DEBUG_DERIV 7517 printf("Count, Count inner don't subsume\n"); 7518 #endif 7519 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left, 7520 NULL, &tmp); 7521 if (mult <= 0) { 7522 #ifdef DEBUG_DERIV 7523 printf("Count, Count not multiple => forbidden\n"); 7524 #endif 7525 return(forbiddenExp); 7526 } 7527 if (sub->exp_max == -1) { 7528 max = -1; 7529 if (exp->exp_max == -1) { 7530 if (exp->exp_min <= sub->exp_min * mult) 7531 min = 0; 7532 else 7533 min = exp->exp_min - sub->exp_min * mult; 7534 } else { 7535 #ifdef DEBUG_DERIV 7536 printf("Count, Count finite can't subsume infinite\n"); 7537 #endif 7538 xmlExpFree(ctxt, tmp); 7539 return(forbiddenExp); 7540 } 7541 } else { 7542 if (exp->exp_max == -1) { 7543 #ifdef DEBUG_DERIV 7544 printf("Infinite loop consume mult finite loop\n"); 7545 #endif 7546 if (exp->exp_min > sub->exp_min * mult) { 7547 max = -1; 7548 min = exp->exp_min - sub->exp_min * mult; 7549 } else { 7550 max = -1; 7551 min = 0; 7552 } 7553 } else { 7554 if (exp->exp_max < sub->exp_max * mult) { 7555 #ifdef DEBUG_DERIV 7556 printf("loops max mult mismatch => forbidden\n"); 7557 #endif 7558 xmlExpFree(ctxt, tmp); 7559 return(forbiddenExp); 7560 } 7561 if (sub->exp_max * mult > exp->exp_min) 7562 min = 0; 7563 else 7564 min = exp->exp_min - sub->exp_max * mult; 7565 max = exp->exp_max - sub->exp_max * mult; 7566 } 7567 } 7568 } else if (!IS_NILLABLE(tmp)) { 7569 /* 7570 * TODO: loop here to try to grow if working on finite 7571 * blocks. 7572 */ 7573 #ifdef DEBUG_DERIV 7574 printf("Count, Count remain not nillable => forbidden\n"); 7575 #endif 7576 xmlExpFree(ctxt, tmp); 7577 return(forbiddenExp); 7578 } else if (sub->exp_max == -1) { 7579 if (exp->exp_max == -1) { 7580 if (exp->exp_min <= sub->exp_min) { 7581 #ifdef DEBUG_DERIV 7582 printf("Infinite loops Okay => COUNT(0,Inf)\n"); 7583 #endif 7584 max = -1; 7585 min = 0; 7586 } else { 7587 #ifdef DEBUG_DERIV 7588 printf("Infinite loops min => Count(X,Inf)\n"); 7589 #endif 7590 max = -1; 7591 min = exp->exp_min - sub->exp_min; 7592 } 7593 } else if (exp->exp_min > sub->exp_min) { 7594 #ifdef DEBUG_DERIV 7595 printf("loops min mismatch 1 => forbidden ???\n"); 7596 #endif 7597 xmlExpFree(ctxt, tmp); 7598 return(forbiddenExp); 7599 } else { 7600 max = -1; 7601 min = 0; 7602 } 7603 } else { 7604 if (exp->exp_max == -1) { 7605 #ifdef DEBUG_DERIV 7606 printf("Infinite loop consume finite loop\n"); 7607 #endif 7608 if (exp->exp_min > sub->exp_min) { 7609 max = -1; 7610 min = exp->exp_min - sub->exp_min; 7611 } else { 7612 max = -1; 7613 min = 0; 7614 } 7615 } else { 7616 if (exp->exp_max < sub->exp_max) { 7617 #ifdef DEBUG_DERIV 7618 printf("loops max mismatch => forbidden\n"); 7619 #endif 7620 xmlExpFree(ctxt, tmp); 7621 return(forbiddenExp); 7622 } 7623 if (sub->exp_max > exp->exp_min) 7624 min = 0; 7625 else 7626 min = exp->exp_min - sub->exp_max; 7627 max = exp->exp_max - sub->exp_max; 7628 } 7629 } 7630 #ifdef DEBUG_DERIV 7631 printf("loops match => SEQ(COUNT())\n"); 7632 #endif 7633 exp->exp_left->ref++; 7634 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7635 NULL, NULL, min, max); 7636 if (tmp2 == NULL) { 7637 return(NULL); 7638 } 7639 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7640 NULL, 0, 0); 7641 return(ret); 7642 } 7643 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7644 if (tmp == NULL) 7645 return(NULL); 7646 if (tmp == forbiddenExp) { 7647 #ifdef DEBUG_DERIV 7648 printf("loop mismatch => forbidden\n"); 7649 #endif 7650 return(forbiddenExp); 7651 } 7652 if (exp->exp_min > 0) 7653 min = exp->exp_min - 1; 7654 else 7655 min = 0; 7656 if (exp->exp_max < 0) 7657 max = -1; 7658 else 7659 max = exp->exp_max - 1; 7660 7661 #ifdef DEBUG_DERIV 7662 printf("loop match => SEQ(COUNT())\n"); 7663 #endif 7664 exp->exp_left->ref++; 7665 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7666 NULL, NULL, min, max); 7667 if (tmp2 == NULL) 7668 return(NULL); 7669 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7670 NULL, 0, 0); 7671 return(ret); 7672 } 7673 } 7674 7675 #ifdef DEBUG_DERIV 7676 printf("Fallback to derivative\n"); 7677 #endif 7678 if (IS_NILLABLE(sub)) { 7679 if (!(IS_NILLABLE(exp))) 7680 return(forbiddenExp); 7681 else 7682 ret = emptyExp; 7683 } else 7684 ret = NULL; 7685 /* 7686 * here the structured derivation made no progress so 7687 * we use the default token based derivation to force one more step 7688 */ 7689 if (ctxt->tabSize == 0) 7690 ctxt->tabSize = 40; 7691 7692 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize * 7693 sizeof(const xmlChar *)); 7694 if (tab == NULL) { 7695 return(NULL); 7696 } 7697 7698 /* 7699 * collect all the strings accepted by the subexpression on input 7700 */ 7701 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7702 while (len < 0) { 7703 const xmlChar **temp; 7704 temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 * 7705 sizeof(const xmlChar *)); 7706 if (temp == NULL) { 7707 xmlFree((xmlChar **) tab); 7708 return(NULL); 7709 } 7710 tab = temp; 7711 ctxt->tabSize *= 2; 7712 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7713 } 7714 for (i = 0;i < len;i++) { 7715 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]); 7716 if ((tmp == NULL) || (tmp == forbiddenExp)) { 7717 xmlExpFree(ctxt, ret); 7718 xmlFree((xmlChar **) tab); 7719 return(tmp); 7720 } 7721 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]); 7722 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) { 7723 xmlExpFree(ctxt, tmp); 7724 xmlExpFree(ctxt, ret); 7725 xmlFree((xmlChar **) tab); 7726 return(tmp); 7727 } 7728 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7729 xmlExpFree(ctxt, tmp); 7730 xmlExpFree(ctxt, tmp2); 7731 7732 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) { 7733 xmlExpFree(ctxt, ret); 7734 xmlFree((xmlChar **) tab); 7735 return(tmp3); 7736 } 7737 7738 if (ret == NULL) 7739 ret = tmp3; 7740 else { 7741 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0); 7742 if (ret == NULL) { 7743 xmlFree((xmlChar **) tab); 7744 return(NULL); 7745 } 7746 } 7747 } 7748 xmlFree((xmlChar **) tab); 7749 return(ret); 7750 } 7751 7752 /** 7753 * xmlExpExpDerive: 7754 * @ctxt: the expressions context 7755 * @exp: the englobing expression 7756 * @sub: the subexpression 7757 * 7758 * Evaluates the expression resulting from @exp consuming a sub expression @sub 7759 * Based on algebraic derivation and sometimes direct Brzozowski derivation 7760 * it usually tatkes less than linear time and can handle expressions generating 7761 * infinite languages. 7762 * 7763 * Returns the resulting expression or NULL in case of internal error, the 7764 * result must be freed 7765 */ 7766 xmlExpNodePtr 7767 xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7768 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7769 return(NULL); 7770 7771 /* 7772 * O(1) speedups 7773 */ 7774 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7775 #ifdef DEBUG_DERIV 7776 printf("Sub nillable and not exp : can't subsume\n"); 7777 #endif 7778 return(forbiddenExp); 7779 } 7780 if (xmlExpCheckCard(exp, sub) == 0) { 7781 #ifdef DEBUG_DERIV 7782 printf("sub generate longuer sequances than exp : can't subsume\n"); 7783 #endif 7784 return(forbiddenExp); 7785 } 7786 return(xmlExpExpDeriveInt(ctxt, exp, sub)); 7787 } 7788 7789 /** 7790 * xmlExpSubsume: 7791 * @ctxt: the expressions context 7792 * @exp: the englobing expression 7793 * @sub: the subexpression 7794 * 7795 * Check whether @exp accepts all the languages accexpted by @sub 7796 * the input being a subexpression. 7797 * 7798 * Returns 1 if true 0 if false and -1 in case of failure. 7799 */ 7800 int 7801 xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7802 xmlExpNodePtr tmp; 7803 7804 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7805 return(-1); 7806 7807 /* 7808 * TODO: speedup by checking the language of sub is a subset of the 7809 * language of exp 7810 */ 7811 /* 7812 * O(1) speedups 7813 */ 7814 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7815 #ifdef DEBUG_DERIV 7816 printf("Sub nillable and not exp : can't subsume\n"); 7817 #endif 7818 return(0); 7819 } 7820 if (xmlExpCheckCard(exp, sub) == 0) { 7821 #ifdef DEBUG_DERIV 7822 printf("sub generate longuer sequances than exp : can't subsume\n"); 7823 #endif 7824 return(0); 7825 } 7826 tmp = xmlExpExpDeriveInt(ctxt, exp, sub); 7827 #ifdef DEBUG_DERIV 7828 printf("Result derivation :\n"); 7829 PRINT_EXP(tmp); 7830 #endif 7831 if (tmp == NULL) 7832 return(-1); 7833 if (tmp == forbiddenExp) 7834 return(0); 7835 if (tmp == emptyExp) 7836 return(1); 7837 if ((tmp != NULL) && (IS_NILLABLE(tmp))) { 7838 xmlExpFree(ctxt, tmp); 7839 return(1); 7840 } 7841 xmlExpFree(ctxt, tmp); 7842 return(0); 7843 } 7844 7845 /************************************************************************ 7846 * * 7847 * Parsing expression * 7848 * * 7849 ************************************************************************/ 7850 7851 static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt); 7852 7853 #undef CUR 7854 #define CUR (*ctxt->cur) 7855 #undef NEXT 7856 #define NEXT ctxt->cur++; 7857 #undef IS_BLANK 7858 #define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t')) 7859 #define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++; 7860 7861 static int 7862 xmlExpParseNumber(xmlExpCtxtPtr ctxt) { 7863 int ret = 0; 7864 7865 SKIP_BLANKS 7866 if (CUR == '*') { 7867 NEXT 7868 return(-1); 7869 } 7870 if ((CUR < '0') || (CUR > '9')) 7871 return(-1); 7872 while ((CUR >= '0') && (CUR <= '9')) { 7873 ret = ret * 10 + (CUR - '0'); 7874 NEXT 7875 } 7876 return(ret); 7877 } 7878 7879 static xmlExpNodePtr 7880 xmlExpParseOr(xmlExpCtxtPtr ctxt) { 7881 const char *base; 7882 xmlExpNodePtr ret; 7883 const xmlChar *val; 7884 7885 SKIP_BLANKS 7886 base = ctxt->cur; 7887 if (*ctxt->cur == '(') { 7888 NEXT 7889 ret = xmlExpParseExpr(ctxt); 7890 SKIP_BLANKS 7891 if (*ctxt->cur != ')') { 7892 fprintf(stderr, "unbalanced '(' : %s\n", base); 7893 xmlExpFree(ctxt, ret); 7894 return(NULL); 7895 } 7896 NEXT; 7897 SKIP_BLANKS 7898 goto parse_quantifier; 7899 } 7900 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') && 7901 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') && 7902 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}')) 7903 NEXT; 7904 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base); 7905 if (val == NULL) 7906 return(NULL); 7907 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0); 7908 if (ret == NULL) 7909 return(NULL); 7910 SKIP_BLANKS 7911 parse_quantifier: 7912 if (CUR == '{') { 7913 int min, max; 7914 7915 NEXT 7916 min = xmlExpParseNumber(ctxt); 7917 if (min < 0) { 7918 xmlExpFree(ctxt, ret); 7919 return(NULL); 7920 } 7921 SKIP_BLANKS 7922 if (CUR == ',') { 7923 NEXT 7924 max = xmlExpParseNumber(ctxt); 7925 SKIP_BLANKS 7926 } else 7927 max = min; 7928 if (CUR != '}') { 7929 xmlExpFree(ctxt, ret); 7930 return(NULL); 7931 } 7932 NEXT 7933 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7934 min, max); 7935 SKIP_BLANKS 7936 } else if (CUR == '?') { 7937 NEXT 7938 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7939 0, 1); 7940 SKIP_BLANKS 7941 } else if (CUR == '+') { 7942 NEXT 7943 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7944 1, -1); 7945 SKIP_BLANKS 7946 } else if (CUR == '*') { 7947 NEXT 7948 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7949 0, -1); 7950 SKIP_BLANKS 7951 } 7952 return(ret); 7953 } 7954 7955 7956 static xmlExpNodePtr 7957 xmlExpParseSeq(xmlExpCtxtPtr ctxt) { 7958 xmlExpNodePtr ret, right; 7959 7960 ret = xmlExpParseOr(ctxt); 7961 SKIP_BLANKS 7962 while (CUR == '|') { 7963 NEXT 7964 right = xmlExpParseOr(ctxt); 7965 if (right == NULL) { 7966 xmlExpFree(ctxt, ret); 7967 return(NULL); 7968 } 7969 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0); 7970 if (ret == NULL) 7971 return(NULL); 7972 } 7973 return(ret); 7974 } 7975 7976 static xmlExpNodePtr 7977 xmlExpParseExpr(xmlExpCtxtPtr ctxt) { 7978 xmlExpNodePtr ret, right; 7979 7980 ret = xmlExpParseSeq(ctxt); 7981 SKIP_BLANKS 7982 while (CUR == ',') { 7983 NEXT 7984 right = xmlExpParseSeq(ctxt); 7985 if (right == NULL) { 7986 xmlExpFree(ctxt, ret); 7987 return(NULL); 7988 } 7989 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0); 7990 if (ret == NULL) 7991 return(NULL); 7992 } 7993 return(ret); 7994 } 7995 7996 /** 7997 * xmlExpParse: 7998 * @ctxt: the expressions context 7999 * @expr: the 0 terminated string 8000 * 8001 * Minimal parser for regexps, it understand the following constructs 8002 * - string terminals 8003 * - choice operator | 8004 * - sequence operator , 8005 * - subexpressions (...) 8006 * - usual cardinality operators + * and ? 8007 * - finite sequences { min, max } 8008 * - infinite sequences { min, * } 8009 * There is minimal checkings made especially no checking on strings values 8010 * 8011 * Returns a new expression or NULL in case of failure 8012 */ 8013 xmlExpNodePtr 8014 xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) { 8015 xmlExpNodePtr ret; 8016 8017 ctxt->expr = expr; 8018 ctxt->cur = expr; 8019 8020 ret = xmlExpParseExpr(ctxt); 8021 SKIP_BLANKS 8022 if (*ctxt->cur != 0) { 8023 xmlExpFree(ctxt, ret); 8024 return(NULL); 8025 } 8026 return(ret); 8027 } 8028 8029 static void 8030 xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) { 8031 xmlExpNodePtr c; 8032 8033 if (expr == NULL) return; 8034 if (glob) xmlBufferWriteChar(buf, "("); 8035 switch (expr->type) { 8036 case XML_EXP_EMPTY: 8037 xmlBufferWriteChar(buf, "empty"); 8038 break; 8039 case XML_EXP_FORBID: 8040 xmlBufferWriteChar(buf, "forbidden"); 8041 break; 8042 case XML_EXP_ATOM: 8043 xmlBufferWriteCHAR(buf, expr->exp_str); 8044 break; 8045 case XML_EXP_SEQ: 8046 c = expr->exp_left; 8047 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8048 xmlExpDumpInt(buf, c, 1); 8049 else 8050 xmlExpDumpInt(buf, c, 0); 8051 xmlBufferWriteChar(buf, " , "); 8052 c = expr->exp_right; 8053 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8054 xmlExpDumpInt(buf, c, 1); 8055 else 8056 xmlExpDumpInt(buf, c, 0); 8057 break; 8058 case XML_EXP_OR: 8059 c = expr->exp_left; 8060 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8061 xmlExpDumpInt(buf, c, 1); 8062 else 8063 xmlExpDumpInt(buf, c, 0); 8064 xmlBufferWriteChar(buf, " | "); 8065 c = expr->exp_right; 8066 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8067 xmlExpDumpInt(buf, c, 1); 8068 else 8069 xmlExpDumpInt(buf, c, 0); 8070 break; 8071 case XML_EXP_COUNT: { 8072 char rep[40]; 8073 8074 c = expr->exp_left; 8075 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8076 xmlExpDumpInt(buf, c, 1); 8077 else 8078 xmlExpDumpInt(buf, c, 0); 8079 if ((expr->exp_min == 0) && (expr->exp_max == 1)) { 8080 rep[0] = '?'; 8081 rep[1] = 0; 8082 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) { 8083 rep[0] = '*'; 8084 rep[1] = 0; 8085 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) { 8086 rep[0] = '+'; 8087 rep[1] = 0; 8088 } else if (expr->exp_max == expr->exp_min) { 8089 snprintf(rep, 39, "{%d}", expr->exp_min); 8090 } else if (expr->exp_max < 0) { 8091 snprintf(rep, 39, "{%d,inf}", expr->exp_min); 8092 } else { 8093 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max); 8094 } 8095 rep[39] = 0; 8096 xmlBufferWriteChar(buf, rep); 8097 break; 8098 } 8099 default: 8100 fprintf(stderr, "Error in tree\n"); 8101 } 8102 if (glob) 8103 xmlBufferWriteChar(buf, ")"); 8104 } 8105 /** 8106 * xmlExpDump: 8107 * @buf: a buffer to receive the output 8108 * @expr: the compiled expression 8109 * 8110 * Serialize the expression as compiled to the buffer 8111 */ 8112 void 8113 xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) { 8114 if ((buf == NULL) || (expr == NULL)) 8115 return; 8116 xmlExpDumpInt(buf, expr, 0); 8117 } 8118 8119 /** 8120 * xmlExpMaxToken: 8121 * @expr: a compiled expression 8122 * 8123 * Indicate the maximum number of input a expression can accept 8124 * 8125 * Returns the maximum length or -1 in case of error 8126 */ 8127 int 8128 xmlExpMaxToken(xmlExpNodePtr expr) { 8129 if (expr == NULL) 8130 return(-1); 8131 return(expr->c_max); 8132 } 8133 8134 /** 8135 * xmlExpCtxtNbNodes: 8136 * @ctxt: an expression context 8137 * 8138 * Debugging facility provides the number of allocated nodes at a that point 8139 * 8140 * Returns the number of nodes in use or -1 in case of error 8141 */ 8142 int 8143 xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) { 8144 if (ctxt == NULL) 8145 return(-1); 8146 return(ctxt->nb_nodes); 8147 } 8148 8149 /** 8150 * xmlExpCtxtNbCons: 8151 * @ctxt: an expression context 8152 * 8153 * Debugging facility provides the number of allocated nodes over lifetime 8154 * 8155 * Returns the number of nodes ever allocated or -1 in case of error 8156 */ 8157 int 8158 xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) { 8159 if (ctxt == NULL) 8160 return(-1); 8161 return(ctxt->nb_cons); 8162 } 8163 8164 #endif /* LIBXML_EXPR_ENABLED */ 8165 #define bottom_xmlregexp 8166 #include "elfgcchack.h" 8167 #endif /* LIBXML_REGEXP_ENABLED */ 8168