1 /* 2 * regexp.c: generic and extensible Regular Expression engine 3 * 4 * Basically designed with the purpose of compiling regexps for 5 * the variety of validation/shemas mechanisms now available in 6 * XML related specifications these include: 7 * - XML-1.0 DTD validation 8 * - XML Schemas structure part 1 9 * - XML Schemas Datatypes part 2 especially Appendix F 10 * - RELAX-NG/TREX i.e. the counter proposal 11 * 12 * See Copyright for the status of this software. 13 * 14 * Daniel Veillard <veillard (at) redhat.com> 15 */ 16 17 #define IN_LIBXML 18 #include "libxml.h" 19 20 #ifdef LIBXML_REGEXP_ENABLED 21 22 /* #define DEBUG_ERR */ 23 24 #include <stdio.h> 25 #include <string.h> 26 #ifdef HAVE_LIMITS_H 27 #include <limits.h> 28 #endif 29 30 #include <libxml/tree.h> 31 #include <libxml/parserInternals.h> 32 #include <libxml/xmlregexp.h> 33 #include <libxml/xmlautomata.h> 34 #include <libxml/xmlunicode.h> 35 36 #ifndef INT_MAX 37 #define INT_MAX 123456789 /* easy to flag and big enough for our needs */ 38 #endif 39 40 /* #define DEBUG_REGEXP_GRAPH */ 41 /* #define DEBUG_REGEXP_EXEC */ 42 /* #define DEBUG_PUSH */ 43 /* #define DEBUG_COMPACTION */ 44 45 #define MAX_PUSH 10000000 46 47 #ifdef ERROR 48 #undef ERROR 49 #endif 50 #define ERROR(str) \ 51 ctxt->error = XML_REGEXP_COMPILE_ERROR; \ 52 xmlRegexpErrCompile(ctxt, str); 53 #define NEXT ctxt->cur++ 54 #define CUR (*(ctxt->cur)) 55 #define NXT(index) (ctxt->cur[index]) 56 57 #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l) 58 #define NEXTL(l) ctxt->cur += l; 59 #define XML_REG_STRING_SEPARATOR '|' 60 /* 61 * Need PREV to check on a '-' within a Character Group. May only be used 62 * when it's guaranteed that cur is not at the beginning of ctxt->string! 63 */ 64 #define PREV (ctxt->cur[-1]) 65 66 /** 67 * TODO: 68 * 69 * macro to flag unimplemented blocks 70 */ 71 #define TODO \ 72 xmlGenericError(xmlGenericErrorContext, \ 73 "Unimplemented block at %s:%d\n", \ 74 __FILE__, __LINE__); 75 76 /************************************************************************ 77 * * 78 * Datatypes and structures * 79 * * 80 ************************************************************************/ 81 82 /* 83 * Note: the order of the enums below is significant, do not shuffle 84 */ 85 typedef enum { 86 XML_REGEXP_EPSILON = 1, 87 XML_REGEXP_CHARVAL, 88 XML_REGEXP_RANGES, 89 XML_REGEXP_SUBREG, /* used for () sub regexps */ 90 XML_REGEXP_STRING, 91 XML_REGEXP_ANYCHAR, /* . */ 92 XML_REGEXP_ANYSPACE, /* \s */ 93 XML_REGEXP_NOTSPACE, /* \S */ 94 XML_REGEXP_INITNAME, /* \l */ 95 XML_REGEXP_NOTINITNAME, /* \L */ 96 XML_REGEXP_NAMECHAR, /* \c */ 97 XML_REGEXP_NOTNAMECHAR, /* \C */ 98 XML_REGEXP_DECIMAL, /* \d */ 99 XML_REGEXP_NOTDECIMAL, /* \D */ 100 XML_REGEXP_REALCHAR, /* \w */ 101 XML_REGEXP_NOTREALCHAR, /* \W */ 102 XML_REGEXP_LETTER = 100, 103 XML_REGEXP_LETTER_UPPERCASE, 104 XML_REGEXP_LETTER_LOWERCASE, 105 XML_REGEXP_LETTER_TITLECASE, 106 XML_REGEXP_LETTER_MODIFIER, 107 XML_REGEXP_LETTER_OTHERS, 108 XML_REGEXP_MARK, 109 XML_REGEXP_MARK_NONSPACING, 110 XML_REGEXP_MARK_SPACECOMBINING, 111 XML_REGEXP_MARK_ENCLOSING, 112 XML_REGEXP_NUMBER, 113 XML_REGEXP_NUMBER_DECIMAL, 114 XML_REGEXP_NUMBER_LETTER, 115 XML_REGEXP_NUMBER_OTHERS, 116 XML_REGEXP_PUNCT, 117 XML_REGEXP_PUNCT_CONNECTOR, 118 XML_REGEXP_PUNCT_DASH, 119 XML_REGEXP_PUNCT_OPEN, 120 XML_REGEXP_PUNCT_CLOSE, 121 XML_REGEXP_PUNCT_INITQUOTE, 122 XML_REGEXP_PUNCT_FINQUOTE, 123 XML_REGEXP_PUNCT_OTHERS, 124 XML_REGEXP_SEPAR, 125 XML_REGEXP_SEPAR_SPACE, 126 XML_REGEXP_SEPAR_LINE, 127 XML_REGEXP_SEPAR_PARA, 128 XML_REGEXP_SYMBOL, 129 XML_REGEXP_SYMBOL_MATH, 130 XML_REGEXP_SYMBOL_CURRENCY, 131 XML_REGEXP_SYMBOL_MODIFIER, 132 XML_REGEXP_SYMBOL_OTHERS, 133 XML_REGEXP_OTHER, 134 XML_REGEXP_OTHER_CONTROL, 135 XML_REGEXP_OTHER_FORMAT, 136 XML_REGEXP_OTHER_PRIVATE, 137 XML_REGEXP_OTHER_NA, 138 XML_REGEXP_BLOCK_NAME 139 } xmlRegAtomType; 140 141 typedef enum { 142 XML_REGEXP_QUANT_EPSILON = 1, 143 XML_REGEXP_QUANT_ONCE, 144 XML_REGEXP_QUANT_OPT, 145 XML_REGEXP_QUANT_MULT, 146 XML_REGEXP_QUANT_PLUS, 147 XML_REGEXP_QUANT_ONCEONLY, 148 XML_REGEXP_QUANT_ALL, 149 XML_REGEXP_QUANT_RANGE 150 } xmlRegQuantType; 151 152 typedef enum { 153 XML_REGEXP_START_STATE = 1, 154 XML_REGEXP_FINAL_STATE, 155 XML_REGEXP_TRANS_STATE, 156 XML_REGEXP_SINK_STATE, 157 XML_REGEXP_UNREACH_STATE 158 } xmlRegStateType; 159 160 typedef enum { 161 XML_REGEXP_MARK_NORMAL = 0, 162 XML_REGEXP_MARK_START, 163 XML_REGEXP_MARK_VISITED 164 } xmlRegMarkedType; 165 166 typedef struct _xmlRegRange xmlRegRange; 167 typedef xmlRegRange *xmlRegRangePtr; 168 169 struct _xmlRegRange { 170 int neg; /* 0 normal, 1 not, 2 exclude */ 171 xmlRegAtomType type; 172 int start; 173 int end; 174 xmlChar *blockName; 175 }; 176 177 typedef struct _xmlRegAtom xmlRegAtom; 178 typedef xmlRegAtom *xmlRegAtomPtr; 179 180 typedef struct _xmlAutomataState xmlRegState; 181 typedef xmlRegState *xmlRegStatePtr; 182 183 struct _xmlRegAtom { 184 int no; 185 xmlRegAtomType type; 186 xmlRegQuantType quant; 187 int min; 188 int max; 189 190 void *valuep; 191 void *valuep2; 192 int neg; 193 int codepoint; 194 xmlRegStatePtr start; 195 xmlRegStatePtr start0; 196 xmlRegStatePtr stop; 197 int maxRanges; 198 int nbRanges; 199 xmlRegRangePtr *ranges; 200 void *data; 201 }; 202 203 typedef struct _xmlRegCounter xmlRegCounter; 204 typedef xmlRegCounter *xmlRegCounterPtr; 205 206 struct _xmlRegCounter { 207 int min; 208 int max; 209 }; 210 211 typedef struct _xmlRegTrans xmlRegTrans; 212 typedef xmlRegTrans *xmlRegTransPtr; 213 214 struct _xmlRegTrans { 215 xmlRegAtomPtr atom; 216 int to; 217 int counter; 218 int count; 219 int nd; 220 }; 221 222 struct _xmlAutomataState { 223 xmlRegStateType type; 224 xmlRegMarkedType mark; 225 xmlRegMarkedType markd; 226 xmlRegMarkedType reached; 227 int no; 228 int maxTrans; 229 int nbTrans; 230 xmlRegTrans *trans; 231 /* knowing states ponting to us can speed things up */ 232 int maxTransTo; 233 int nbTransTo; 234 int *transTo; 235 }; 236 237 typedef struct _xmlAutomata xmlRegParserCtxt; 238 typedef xmlRegParserCtxt *xmlRegParserCtxtPtr; 239 240 #define AM_AUTOMATA_RNG 1 241 242 struct _xmlAutomata { 243 xmlChar *string; 244 xmlChar *cur; 245 246 int error; 247 int neg; 248 249 xmlRegStatePtr start; 250 xmlRegStatePtr end; 251 xmlRegStatePtr state; 252 253 xmlRegAtomPtr atom; 254 255 int maxAtoms; 256 int nbAtoms; 257 xmlRegAtomPtr *atoms; 258 259 int maxStates; 260 int nbStates; 261 xmlRegStatePtr *states; 262 263 int maxCounters; 264 int nbCounters; 265 xmlRegCounter *counters; 266 267 int determinist; 268 int negs; 269 int flags; 270 }; 271 272 struct _xmlRegexp { 273 xmlChar *string; 274 int nbStates; 275 xmlRegStatePtr *states; 276 int nbAtoms; 277 xmlRegAtomPtr *atoms; 278 int nbCounters; 279 xmlRegCounter *counters; 280 int determinist; 281 int flags; 282 /* 283 * That's the compact form for determinists automatas 284 */ 285 int nbstates; 286 int *compact; 287 void **transdata; 288 int nbstrings; 289 xmlChar **stringMap; 290 }; 291 292 typedef struct _xmlRegExecRollback xmlRegExecRollback; 293 typedef xmlRegExecRollback *xmlRegExecRollbackPtr; 294 295 struct _xmlRegExecRollback { 296 xmlRegStatePtr state;/* the current state */ 297 int index; /* the index in the input stack */ 298 int nextbranch; /* the next transition to explore in that state */ 299 int *counts; /* save the automata state if it has some */ 300 }; 301 302 typedef struct _xmlRegInputToken xmlRegInputToken; 303 typedef xmlRegInputToken *xmlRegInputTokenPtr; 304 305 struct _xmlRegInputToken { 306 xmlChar *value; 307 void *data; 308 }; 309 310 struct _xmlRegExecCtxt { 311 int status; /* execution status != 0 indicate an error */ 312 int determinist; /* did we find an indeterministic behaviour */ 313 xmlRegexpPtr comp; /* the compiled regexp */ 314 xmlRegExecCallbacks callback; 315 void *data; 316 317 xmlRegStatePtr state;/* the current state */ 318 int transno; /* the current transition on that state */ 319 int transcount; /* the number of chars in char counted transitions */ 320 321 /* 322 * A stack of rollback states 323 */ 324 int maxRollbacks; 325 int nbRollbacks; 326 xmlRegExecRollback *rollbacks; 327 328 /* 329 * The state of the automata if any 330 */ 331 int *counts; 332 333 /* 334 * The input stack 335 */ 336 int inputStackMax; 337 int inputStackNr; 338 int index; 339 int *charStack; 340 const xmlChar *inputString; /* when operating on characters */ 341 xmlRegInputTokenPtr inputStack;/* when operating on strings */ 342 343 /* 344 * error handling 345 */ 346 int errStateNo; /* the error state number */ 347 xmlRegStatePtr errState; /* the error state */ 348 xmlChar *errString; /* the string raising the error */ 349 int *errCounts; /* counters at the error state */ 350 int nbPush; 351 }; 352 353 #define REGEXP_ALL_COUNTER 0x123456 354 #define REGEXP_ALL_LAX_COUNTER 0x123457 355 356 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); 357 static void xmlRegFreeState(xmlRegStatePtr state); 358 static void xmlRegFreeAtom(xmlRegAtomPtr atom); 359 static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr); 360 static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint); 361 static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, 362 int neg, int start, int end, const xmlChar *blockName); 363 364 void xmlAutomataSetFlags(xmlAutomataPtr am, int flags); 365 366 /************************************************************************ 367 * * 368 * Regexp memory error handler * 369 * * 370 ************************************************************************/ 371 /** 372 * xmlRegexpErrMemory: 373 * @extra: extra information 374 * 375 * Handle an out of memory condition 376 */ 377 static void 378 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra) 379 { 380 const char *regexp = NULL; 381 if (ctxt != NULL) { 382 regexp = (const char *) ctxt->string; 383 ctxt->error = XML_ERR_NO_MEMORY; 384 } 385 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 386 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 387 regexp, NULL, 0, 0, 388 "Memory allocation failed : %s\n", extra); 389 } 390 391 /** 392 * xmlRegexpErrCompile: 393 * @extra: extra information 394 * 395 * Handle a compilation failure 396 */ 397 static void 398 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra) 399 { 400 const char *regexp = NULL; 401 int idx = 0; 402 403 if (ctxt != NULL) { 404 regexp = (const char *) ctxt->string; 405 idx = ctxt->cur - ctxt->string; 406 ctxt->error = XML_REGEXP_COMPILE_ERROR; 407 } 408 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 409 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra, 410 regexp, NULL, idx, 0, 411 "failed to compile: %s\n", extra); 412 } 413 414 /************************************************************************ 415 * * 416 * Allocation/Deallocation * 417 * * 418 ************************************************************************/ 419 420 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt); 421 /** 422 * xmlRegEpxFromParse: 423 * @ctxt: the parser context used to build it 424 * 425 * Allocate a new regexp and fill it with the result from the parser 426 * 427 * Returns the new regexp or NULL in case of error 428 */ 429 static xmlRegexpPtr 430 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { 431 xmlRegexpPtr ret; 432 433 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp)); 434 if (ret == NULL) { 435 xmlRegexpErrMemory(ctxt, "compiling regexp"); 436 return(NULL); 437 } 438 memset(ret, 0, sizeof(xmlRegexp)); 439 ret->string = ctxt->string; 440 ret->nbStates = ctxt->nbStates; 441 ret->states = ctxt->states; 442 ret->nbAtoms = ctxt->nbAtoms; 443 ret->atoms = ctxt->atoms; 444 ret->nbCounters = ctxt->nbCounters; 445 ret->counters = ctxt->counters; 446 ret->determinist = ctxt->determinist; 447 ret->flags = ctxt->flags; 448 if (ret->determinist == -1) { 449 xmlRegexpIsDeterminist(ret); 450 } 451 452 if ((ret->determinist != 0) && 453 (ret->nbCounters == 0) && 454 (ctxt->negs == 0) && 455 (ret->atoms != NULL) && 456 (ret->atoms[0] != NULL) && 457 (ret->atoms[0]->type == XML_REGEXP_STRING)) { 458 int i, j, nbstates = 0, nbatoms = 0; 459 int *stateRemap; 460 int *stringRemap; 461 int *transitions; 462 void **transdata; 463 xmlChar **stringMap; 464 xmlChar *value; 465 466 /* 467 * Switch to a compact representation 468 * 1/ counting the effective number of states left 469 * 2/ counting the unique number of atoms, and check that 470 * they are all of the string type 471 * 3/ build a table state x atom for the transitions 472 */ 473 474 stateRemap = xmlMalloc(ret->nbStates * sizeof(int)); 475 if (stateRemap == NULL) { 476 xmlRegexpErrMemory(ctxt, "compiling regexp"); 477 xmlFree(ret); 478 return(NULL); 479 } 480 for (i = 0;i < ret->nbStates;i++) { 481 if (ret->states[i] != NULL) { 482 stateRemap[i] = nbstates; 483 nbstates++; 484 } else { 485 stateRemap[i] = -1; 486 } 487 } 488 #ifdef DEBUG_COMPACTION 489 printf("Final: %d states\n", nbstates); 490 #endif 491 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *)); 492 if (stringMap == NULL) { 493 xmlRegexpErrMemory(ctxt, "compiling regexp"); 494 xmlFree(stateRemap); 495 xmlFree(ret); 496 return(NULL); 497 } 498 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int)); 499 if (stringRemap == NULL) { 500 xmlRegexpErrMemory(ctxt, "compiling regexp"); 501 xmlFree(stringMap); 502 xmlFree(stateRemap); 503 xmlFree(ret); 504 return(NULL); 505 } 506 for (i = 0;i < ret->nbAtoms;i++) { 507 if ((ret->atoms[i]->type == XML_REGEXP_STRING) && 508 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) { 509 value = ret->atoms[i]->valuep; 510 for (j = 0;j < nbatoms;j++) { 511 if (xmlStrEqual(stringMap[j], value)) { 512 stringRemap[i] = j; 513 break; 514 } 515 } 516 if (j >= nbatoms) { 517 stringRemap[i] = nbatoms; 518 stringMap[nbatoms] = xmlStrdup(value); 519 if (stringMap[nbatoms] == NULL) { 520 for (i = 0;i < nbatoms;i++) 521 xmlFree(stringMap[i]); 522 xmlFree(stringRemap); 523 xmlFree(stringMap); 524 xmlFree(stateRemap); 525 xmlFree(ret); 526 return(NULL); 527 } 528 nbatoms++; 529 } 530 } else { 531 xmlFree(stateRemap); 532 xmlFree(stringRemap); 533 for (i = 0;i < nbatoms;i++) 534 xmlFree(stringMap[i]); 535 xmlFree(stringMap); 536 xmlFree(ret); 537 return(NULL); 538 } 539 } 540 #ifdef DEBUG_COMPACTION 541 printf("Final: %d atoms\n", nbatoms); 542 #endif 543 transitions = (int *) xmlMalloc((nbstates + 1) * 544 (nbatoms + 1) * sizeof(int)); 545 if (transitions == NULL) { 546 xmlFree(stateRemap); 547 xmlFree(stringRemap); 548 xmlFree(stringMap); 549 xmlFree(ret); 550 return(NULL); 551 } 552 memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int)); 553 554 /* 555 * Allocate the transition table. The first entry for each 556 * state corresponds to the state type. 557 */ 558 transdata = NULL; 559 560 for (i = 0;i < ret->nbStates;i++) { 561 int stateno, atomno, targetno, prev; 562 xmlRegStatePtr state; 563 xmlRegTransPtr trans; 564 565 stateno = stateRemap[i]; 566 if (stateno == -1) 567 continue; 568 state = ret->states[i]; 569 570 transitions[stateno * (nbatoms + 1)] = state->type; 571 572 for (j = 0;j < state->nbTrans;j++) { 573 trans = &(state->trans[j]); 574 if ((trans->to == -1) || (trans->atom == NULL)) 575 continue; 576 atomno = stringRemap[trans->atom->no]; 577 if ((trans->atom->data != NULL) && (transdata == NULL)) { 578 transdata = (void **) xmlMalloc(nbstates * nbatoms * 579 sizeof(void *)); 580 if (transdata != NULL) 581 memset(transdata, 0, 582 nbstates * nbatoms * sizeof(void *)); 583 else { 584 xmlRegexpErrMemory(ctxt, "compiling regexp"); 585 break; 586 } 587 } 588 targetno = stateRemap[trans->to]; 589 /* 590 * if the same atom can generate transitions to 2 different 591 * states then it means the automata is not determinist and 592 * the compact form can't be used ! 593 */ 594 prev = transitions[stateno * (nbatoms + 1) + atomno + 1]; 595 if (prev != 0) { 596 if (prev != targetno + 1) { 597 ret->determinist = 0; 598 #ifdef DEBUG_COMPACTION 599 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n", 600 i, j, trans->atom->no, trans->to, atomno, targetno); 601 printf(" previous to is %d\n", prev); 602 #endif 603 if (transdata != NULL) 604 xmlFree(transdata); 605 xmlFree(transitions); 606 xmlFree(stateRemap); 607 xmlFree(stringRemap); 608 for (i = 0;i < nbatoms;i++) 609 xmlFree(stringMap[i]); 610 xmlFree(stringMap); 611 goto not_determ; 612 } 613 } else { 614 #if 0 615 printf("State %d trans %d: atom %d to %d : %d to %d\n", 616 i, j, trans->atom->no, trans->to, atomno, targetno); 617 #endif 618 transitions[stateno * (nbatoms + 1) + atomno + 1] = 619 targetno + 1; /* to avoid 0 */ 620 if (transdata != NULL) 621 transdata[stateno * nbatoms + atomno] = 622 trans->atom->data; 623 } 624 } 625 } 626 ret->determinist = 1; 627 #ifdef DEBUG_COMPACTION 628 /* 629 * Debug 630 */ 631 for (i = 0;i < nbstates;i++) { 632 for (j = 0;j < nbatoms + 1;j++) { 633 printf("%02d ", transitions[i * (nbatoms + 1) + j]); 634 } 635 printf("\n"); 636 } 637 printf("\n"); 638 #endif 639 /* 640 * Cleanup of the old data 641 */ 642 if (ret->states != NULL) { 643 for (i = 0;i < ret->nbStates;i++) 644 xmlRegFreeState(ret->states[i]); 645 xmlFree(ret->states); 646 } 647 ret->states = NULL; 648 ret->nbStates = 0; 649 if (ret->atoms != NULL) { 650 for (i = 0;i < ret->nbAtoms;i++) 651 xmlRegFreeAtom(ret->atoms[i]); 652 xmlFree(ret->atoms); 653 } 654 ret->atoms = NULL; 655 ret->nbAtoms = 0; 656 657 ret->compact = transitions; 658 ret->transdata = transdata; 659 ret->stringMap = stringMap; 660 ret->nbstrings = nbatoms; 661 ret->nbstates = nbstates; 662 xmlFree(stateRemap); 663 xmlFree(stringRemap); 664 } 665 not_determ: 666 ctxt->string = NULL; 667 ctxt->nbStates = 0; 668 ctxt->states = NULL; 669 ctxt->nbAtoms = 0; 670 ctxt->atoms = NULL; 671 ctxt->nbCounters = 0; 672 ctxt->counters = NULL; 673 return(ret); 674 } 675 676 /** 677 * xmlRegNewParserCtxt: 678 * @string: the string to parse 679 * 680 * Allocate a new regexp parser context 681 * 682 * Returns the new context or NULL in case of error 683 */ 684 static xmlRegParserCtxtPtr 685 xmlRegNewParserCtxt(const xmlChar *string) { 686 xmlRegParserCtxtPtr ret; 687 688 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt)); 689 if (ret == NULL) 690 return(NULL); 691 memset(ret, 0, sizeof(xmlRegParserCtxt)); 692 if (string != NULL) 693 ret->string = xmlStrdup(string); 694 ret->cur = ret->string; 695 ret->neg = 0; 696 ret->negs = 0; 697 ret->error = 0; 698 ret->determinist = -1; 699 return(ret); 700 } 701 702 /** 703 * xmlRegNewRange: 704 * @ctxt: the regexp parser context 705 * @neg: is that negative 706 * @type: the type of range 707 * @start: the start codepoint 708 * @end: the end codepoint 709 * 710 * Allocate a new regexp range 711 * 712 * Returns the new range or NULL in case of error 713 */ 714 static xmlRegRangePtr 715 xmlRegNewRange(xmlRegParserCtxtPtr ctxt, 716 int neg, xmlRegAtomType type, int start, int end) { 717 xmlRegRangePtr ret; 718 719 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange)); 720 if (ret == NULL) { 721 xmlRegexpErrMemory(ctxt, "allocating range"); 722 return(NULL); 723 } 724 ret->neg = neg; 725 ret->type = type; 726 ret->start = start; 727 ret->end = end; 728 return(ret); 729 } 730 731 /** 732 * xmlRegFreeRange: 733 * @range: the regexp range 734 * 735 * Free a regexp range 736 */ 737 static void 738 xmlRegFreeRange(xmlRegRangePtr range) { 739 if (range == NULL) 740 return; 741 742 if (range->blockName != NULL) 743 xmlFree(range->blockName); 744 xmlFree(range); 745 } 746 747 /** 748 * xmlRegCopyRange: 749 * @range: the regexp range 750 * 751 * Copy a regexp range 752 * 753 * Returns the new copy or NULL in case of error. 754 */ 755 static xmlRegRangePtr 756 xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) { 757 xmlRegRangePtr ret; 758 759 if (range == NULL) 760 return(NULL); 761 762 ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start, 763 range->end); 764 if (ret == NULL) 765 return(NULL); 766 if (range->blockName != NULL) { 767 ret->blockName = xmlStrdup(range->blockName); 768 if (ret->blockName == NULL) { 769 xmlRegexpErrMemory(ctxt, "allocating range"); 770 xmlRegFreeRange(ret); 771 return(NULL); 772 } 773 } 774 return(ret); 775 } 776 777 /** 778 * xmlRegNewAtom: 779 * @ctxt: the regexp parser context 780 * @type: the type of atom 781 * 782 * Allocate a new atom 783 * 784 * Returns the new atom or NULL in case of error 785 */ 786 static xmlRegAtomPtr 787 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) { 788 xmlRegAtomPtr ret; 789 790 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 791 if (ret == NULL) { 792 xmlRegexpErrMemory(ctxt, "allocating atom"); 793 return(NULL); 794 } 795 memset(ret, 0, sizeof(xmlRegAtom)); 796 ret->type = type; 797 ret->quant = XML_REGEXP_QUANT_ONCE; 798 ret->min = 0; 799 ret->max = 0; 800 return(ret); 801 } 802 803 /** 804 * xmlRegFreeAtom: 805 * @atom: the regexp atom 806 * 807 * Free a regexp atom 808 */ 809 static void 810 xmlRegFreeAtom(xmlRegAtomPtr atom) { 811 int i; 812 813 if (atom == NULL) 814 return; 815 816 for (i = 0;i < atom->nbRanges;i++) 817 xmlRegFreeRange(atom->ranges[i]); 818 if (atom->ranges != NULL) 819 xmlFree(atom->ranges); 820 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL)) 821 xmlFree(atom->valuep); 822 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL)) 823 xmlFree(atom->valuep2); 824 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL)) 825 xmlFree(atom->valuep); 826 xmlFree(atom); 827 } 828 829 /** 830 * xmlRegCopyAtom: 831 * @ctxt: the regexp parser context 832 * @atom: the oiginal atom 833 * 834 * Allocate a new regexp range 835 * 836 * Returns the new atom or NULL in case of error 837 */ 838 static xmlRegAtomPtr 839 xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 840 xmlRegAtomPtr ret; 841 842 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 843 if (ret == NULL) { 844 xmlRegexpErrMemory(ctxt, "copying atom"); 845 return(NULL); 846 } 847 memset(ret, 0, sizeof(xmlRegAtom)); 848 ret->type = atom->type; 849 ret->quant = atom->quant; 850 ret->min = atom->min; 851 ret->max = atom->max; 852 if (atom->nbRanges > 0) { 853 int i; 854 855 ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) * 856 atom->nbRanges); 857 if (ret->ranges == NULL) { 858 xmlRegexpErrMemory(ctxt, "copying atom"); 859 goto error; 860 } 861 for (i = 0;i < atom->nbRanges;i++) { 862 ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]); 863 if (ret->ranges[i] == NULL) 864 goto error; 865 ret->nbRanges = i + 1; 866 } 867 } 868 return(ret); 869 870 error: 871 xmlRegFreeAtom(ret); 872 return(NULL); 873 } 874 875 static xmlRegStatePtr 876 xmlRegNewState(xmlRegParserCtxtPtr ctxt) { 877 xmlRegStatePtr ret; 878 879 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState)); 880 if (ret == NULL) { 881 xmlRegexpErrMemory(ctxt, "allocating state"); 882 return(NULL); 883 } 884 memset(ret, 0, sizeof(xmlRegState)); 885 ret->type = XML_REGEXP_TRANS_STATE; 886 ret->mark = XML_REGEXP_MARK_NORMAL; 887 return(ret); 888 } 889 890 /** 891 * xmlRegFreeState: 892 * @state: the regexp state 893 * 894 * Free a regexp state 895 */ 896 static void 897 xmlRegFreeState(xmlRegStatePtr state) { 898 if (state == NULL) 899 return; 900 901 if (state->trans != NULL) 902 xmlFree(state->trans); 903 if (state->transTo != NULL) 904 xmlFree(state->transTo); 905 xmlFree(state); 906 } 907 908 /** 909 * xmlRegFreeParserCtxt: 910 * @ctxt: the regexp parser context 911 * 912 * Free a regexp parser context 913 */ 914 static void 915 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) { 916 int i; 917 if (ctxt == NULL) 918 return; 919 920 if (ctxt->string != NULL) 921 xmlFree(ctxt->string); 922 if (ctxt->states != NULL) { 923 for (i = 0;i < ctxt->nbStates;i++) 924 xmlRegFreeState(ctxt->states[i]); 925 xmlFree(ctxt->states); 926 } 927 if (ctxt->atoms != NULL) { 928 for (i = 0;i < ctxt->nbAtoms;i++) 929 xmlRegFreeAtom(ctxt->atoms[i]); 930 xmlFree(ctxt->atoms); 931 } 932 if (ctxt->counters != NULL) 933 xmlFree(ctxt->counters); 934 xmlFree(ctxt); 935 } 936 937 /************************************************************************ 938 * * 939 * Display of Data structures * 940 * * 941 ************************************************************************/ 942 943 static void 944 xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) { 945 switch (type) { 946 case XML_REGEXP_EPSILON: 947 fprintf(output, "epsilon "); break; 948 case XML_REGEXP_CHARVAL: 949 fprintf(output, "charval "); break; 950 case XML_REGEXP_RANGES: 951 fprintf(output, "ranges "); break; 952 case XML_REGEXP_SUBREG: 953 fprintf(output, "subexpr "); break; 954 case XML_REGEXP_STRING: 955 fprintf(output, "string "); break; 956 case XML_REGEXP_ANYCHAR: 957 fprintf(output, "anychar "); break; 958 case XML_REGEXP_ANYSPACE: 959 fprintf(output, "anyspace "); break; 960 case XML_REGEXP_NOTSPACE: 961 fprintf(output, "notspace "); break; 962 case XML_REGEXP_INITNAME: 963 fprintf(output, "initname "); break; 964 case XML_REGEXP_NOTINITNAME: 965 fprintf(output, "notinitname "); break; 966 case XML_REGEXP_NAMECHAR: 967 fprintf(output, "namechar "); break; 968 case XML_REGEXP_NOTNAMECHAR: 969 fprintf(output, "notnamechar "); break; 970 case XML_REGEXP_DECIMAL: 971 fprintf(output, "decimal "); break; 972 case XML_REGEXP_NOTDECIMAL: 973 fprintf(output, "notdecimal "); break; 974 case XML_REGEXP_REALCHAR: 975 fprintf(output, "realchar "); break; 976 case XML_REGEXP_NOTREALCHAR: 977 fprintf(output, "notrealchar "); break; 978 case XML_REGEXP_LETTER: 979 fprintf(output, "LETTER "); break; 980 case XML_REGEXP_LETTER_UPPERCASE: 981 fprintf(output, "LETTER_UPPERCASE "); break; 982 case XML_REGEXP_LETTER_LOWERCASE: 983 fprintf(output, "LETTER_LOWERCASE "); break; 984 case XML_REGEXP_LETTER_TITLECASE: 985 fprintf(output, "LETTER_TITLECASE "); break; 986 case XML_REGEXP_LETTER_MODIFIER: 987 fprintf(output, "LETTER_MODIFIER "); break; 988 case XML_REGEXP_LETTER_OTHERS: 989 fprintf(output, "LETTER_OTHERS "); break; 990 case XML_REGEXP_MARK: 991 fprintf(output, "MARK "); break; 992 case XML_REGEXP_MARK_NONSPACING: 993 fprintf(output, "MARK_NONSPACING "); break; 994 case XML_REGEXP_MARK_SPACECOMBINING: 995 fprintf(output, "MARK_SPACECOMBINING "); break; 996 case XML_REGEXP_MARK_ENCLOSING: 997 fprintf(output, "MARK_ENCLOSING "); break; 998 case XML_REGEXP_NUMBER: 999 fprintf(output, "NUMBER "); break; 1000 case XML_REGEXP_NUMBER_DECIMAL: 1001 fprintf(output, "NUMBER_DECIMAL "); break; 1002 case XML_REGEXP_NUMBER_LETTER: 1003 fprintf(output, "NUMBER_LETTER "); break; 1004 case XML_REGEXP_NUMBER_OTHERS: 1005 fprintf(output, "NUMBER_OTHERS "); break; 1006 case XML_REGEXP_PUNCT: 1007 fprintf(output, "PUNCT "); break; 1008 case XML_REGEXP_PUNCT_CONNECTOR: 1009 fprintf(output, "PUNCT_CONNECTOR "); break; 1010 case XML_REGEXP_PUNCT_DASH: 1011 fprintf(output, "PUNCT_DASH "); break; 1012 case XML_REGEXP_PUNCT_OPEN: 1013 fprintf(output, "PUNCT_OPEN "); break; 1014 case XML_REGEXP_PUNCT_CLOSE: 1015 fprintf(output, "PUNCT_CLOSE "); break; 1016 case XML_REGEXP_PUNCT_INITQUOTE: 1017 fprintf(output, "PUNCT_INITQUOTE "); break; 1018 case XML_REGEXP_PUNCT_FINQUOTE: 1019 fprintf(output, "PUNCT_FINQUOTE "); break; 1020 case XML_REGEXP_PUNCT_OTHERS: 1021 fprintf(output, "PUNCT_OTHERS "); break; 1022 case XML_REGEXP_SEPAR: 1023 fprintf(output, "SEPAR "); break; 1024 case XML_REGEXP_SEPAR_SPACE: 1025 fprintf(output, "SEPAR_SPACE "); break; 1026 case XML_REGEXP_SEPAR_LINE: 1027 fprintf(output, "SEPAR_LINE "); break; 1028 case XML_REGEXP_SEPAR_PARA: 1029 fprintf(output, "SEPAR_PARA "); break; 1030 case XML_REGEXP_SYMBOL: 1031 fprintf(output, "SYMBOL "); break; 1032 case XML_REGEXP_SYMBOL_MATH: 1033 fprintf(output, "SYMBOL_MATH "); break; 1034 case XML_REGEXP_SYMBOL_CURRENCY: 1035 fprintf(output, "SYMBOL_CURRENCY "); break; 1036 case XML_REGEXP_SYMBOL_MODIFIER: 1037 fprintf(output, "SYMBOL_MODIFIER "); break; 1038 case XML_REGEXP_SYMBOL_OTHERS: 1039 fprintf(output, "SYMBOL_OTHERS "); break; 1040 case XML_REGEXP_OTHER: 1041 fprintf(output, "OTHER "); break; 1042 case XML_REGEXP_OTHER_CONTROL: 1043 fprintf(output, "OTHER_CONTROL "); break; 1044 case XML_REGEXP_OTHER_FORMAT: 1045 fprintf(output, "OTHER_FORMAT "); break; 1046 case XML_REGEXP_OTHER_PRIVATE: 1047 fprintf(output, "OTHER_PRIVATE "); break; 1048 case XML_REGEXP_OTHER_NA: 1049 fprintf(output, "OTHER_NA "); break; 1050 case XML_REGEXP_BLOCK_NAME: 1051 fprintf(output, "BLOCK "); break; 1052 } 1053 } 1054 1055 static void 1056 xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) { 1057 switch (type) { 1058 case XML_REGEXP_QUANT_EPSILON: 1059 fprintf(output, "epsilon "); break; 1060 case XML_REGEXP_QUANT_ONCE: 1061 fprintf(output, "once "); break; 1062 case XML_REGEXP_QUANT_OPT: 1063 fprintf(output, "? "); break; 1064 case XML_REGEXP_QUANT_MULT: 1065 fprintf(output, "* "); break; 1066 case XML_REGEXP_QUANT_PLUS: 1067 fprintf(output, "+ "); break; 1068 case XML_REGEXP_QUANT_RANGE: 1069 fprintf(output, "range "); break; 1070 case XML_REGEXP_QUANT_ONCEONLY: 1071 fprintf(output, "onceonly "); break; 1072 case XML_REGEXP_QUANT_ALL: 1073 fprintf(output, "all "); break; 1074 } 1075 } 1076 static void 1077 xmlRegPrintRange(FILE *output, xmlRegRangePtr range) { 1078 fprintf(output, " range: "); 1079 if (range->neg) 1080 fprintf(output, "negative "); 1081 xmlRegPrintAtomType(output, range->type); 1082 fprintf(output, "%c - %c\n", range->start, range->end); 1083 } 1084 1085 static void 1086 xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) { 1087 fprintf(output, " atom: "); 1088 if (atom == NULL) { 1089 fprintf(output, "NULL\n"); 1090 return; 1091 } 1092 if (atom->neg) 1093 fprintf(output, "not "); 1094 xmlRegPrintAtomType(output, atom->type); 1095 xmlRegPrintQuantType(output, atom->quant); 1096 if (atom->quant == XML_REGEXP_QUANT_RANGE) 1097 fprintf(output, "%d-%d ", atom->min, atom->max); 1098 if (atom->type == XML_REGEXP_STRING) 1099 fprintf(output, "'%s' ", (char *) atom->valuep); 1100 if (atom->type == XML_REGEXP_CHARVAL) 1101 fprintf(output, "char %c\n", atom->codepoint); 1102 else if (atom->type == XML_REGEXP_RANGES) { 1103 int i; 1104 fprintf(output, "%d entries\n", atom->nbRanges); 1105 for (i = 0; i < atom->nbRanges;i++) 1106 xmlRegPrintRange(output, atom->ranges[i]); 1107 } else if (atom->type == XML_REGEXP_SUBREG) { 1108 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no); 1109 } else { 1110 fprintf(output, "\n"); 1111 } 1112 } 1113 1114 static void 1115 xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) { 1116 fprintf(output, " trans: "); 1117 if (trans == NULL) { 1118 fprintf(output, "NULL\n"); 1119 return; 1120 } 1121 if (trans->to < 0) { 1122 fprintf(output, "removed\n"); 1123 return; 1124 } 1125 if (trans->nd != 0) { 1126 if (trans->nd == 2) 1127 fprintf(output, "last not determinist, "); 1128 else 1129 fprintf(output, "not determinist, "); 1130 } 1131 if (trans->counter >= 0) { 1132 fprintf(output, "counted %d, ", trans->counter); 1133 } 1134 if (trans->count == REGEXP_ALL_COUNTER) { 1135 fprintf(output, "all transition, "); 1136 } else if (trans->count >= 0) { 1137 fprintf(output, "count based %d, ", trans->count); 1138 } 1139 if (trans->atom == NULL) { 1140 fprintf(output, "epsilon to %d\n", trans->to); 1141 return; 1142 } 1143 if (trans->atom->type == XML_REGEXP_CHARVAL) 1144 fprintf(output, "char %c ", trans->atom->codepoint); 1145 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to); 1146 } 1147 1148 static void 1149 xmlRegPrintState(FILE *output, xmlRegStatePtr state) { 1150 int i; 1151 1152 fprintf(output, " state: "); 1153 if (state == NULL) { 1154 fprintf(output, "NULL\n"); 1155 return; 1156 } 1157 if (state->type == XML_REGEXP_START_STATE) 1158 fprintf(output, "START "); 1159 if (state->type == XML_REGEXP_FINAL_STATE) 1160 fprintf(output, "FINAL "); 1161 1162 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans); 1163 for (i = 0;i < state->nbTrans; i++) { 1164 xmlRegPrintTrans(output, &(state->trans[i])); 1165 } 1166 } 1167 1168 #ifdef DEBUG_REGEXP_GRAPH 1169 static void 1170 xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { 1171 int i; 1172 1173 fprintf(output, " ctxt: "); 1174 if (ctxt == NULL) { 1175 fprintf(output, "NULL\n"); 1176 return; 1177 } 1178 fprintf(output, "'%s' ", ctxt->string); 1179 if (ctxt->error) 1180 fprintf(output, "error "); 1181 if (ctxt->neg) 1182 fprintf(output, "neg "); 1183 fprintf(output, "\n"); 1184 fprintf(output, "%d atoms:\n", ctxt->nbAtoms); 1185 for (i = 0;i < ctxt->nbAtoms; i++) { 1186 fprintf(output, " %02d ", i); 1187 xmlRegPrintAtom(output, ctxt->atoms[i]); 1188 } 1189 if (ctxt->atom != NULL) { 1190 fprintf(output, "current atom:\n"); 1191 xmlRegPrintAtom(output, ctxt->atom); 1192 } 1193 fprintf(output, "%d states:", ctxt->nbStates); 1194 if (ctxt->start != NULL) 1195 fprintf(output, " start: %d", ctxt->start->no); 1196 if (ctxt->end != NULL) 1197 fprintf(output, " end: %d", ctxt->end->no); 1198 fprintf(output, "\n"); 1199 for (i = 0;i < ctxt->nbStates; i++) { 1200 xmlRegPrintState(output, ctxt->states[i]); 1201 } 1202 fprintf(output, "%d counters:\n", ctxt->nbCounters); 1203 for (i = 0;i < ctxt->nbCounters; i++) { 1204 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min, 1205 ctxt->counters[i].max); 1206 } 1207 } 1208 #endif 1209 1210 /************************************************************************ 1211 * * 1212 * Finite Automata structures manipulations * 1213 * * 1214 ************************************************************************/ 1215 1216 static void 1217 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom, 1218 int neg, xmlRegAtomType type, int start, int end, 1219 xmlChar *blockName) { 1220 xmlRegRangePtr range; 1221 1222 if (atom == NULL) { 1223 ERROR("add range: atom is NULL"); 1224 return; 1225 } 1226 if (atom->type != XML_REGEXP_RANGES) { 1227 ERROR("add range: atom is not ranges"); 1228 return; 1229 } 1230 if (atom->maxRanges == 0) { 1231 atom->maxRanges = 4; 1232 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges * 1233 sizeof(xmlRegRangePtr)); 1234 if (atom->ranges == NULL) { 1235 xmlRegexpErrMemory(ctxt, "adding ranges"); 1236 atom->maxRanges = 0; 1237 return; 1238 } 1239 } else if (atom->nbRanges >= atom->maxRanges) { 1240 xmlRegRangePtr *tmp; 1241 atom->maxRanges *= 2; 1242 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges * 1243 sizeof(xmlRegRangePtr)); 1244 if (tmp == NULL) { 1245 xmlRegexpErrMemory(ctxt, "adding ranges"); 1246 atom->maxRanges /= 2; 1247 return; 1248 } 1249 atom->ranges = tmp; 1250 } 1251 range = xmlRegNewRange(ctxt, neg, type, start, end); 1252 if (range == NULL) 1253 return; 1254 range->blockName = blockName; 1255 atom->ranges[atom->nbRanges++] = range; 1256 1257 } 1258 1259 static int 1260 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) { 1261 if (ctxt->maxCounters == 0) { 1262 ctxt->maxCounters = 4; 1263 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters * 1264 sizeof(xmlRegCounter)); 1265 if (ctxt->counters == NULL) { 1266 xmlRegexpErrMemory(ctxt, "allocating counter"); 1267 ctxt->maxCounters = 0; 1268 return(-1); 1269 } 1270 } else if (ctxt->nbCounters >= ctxt->maxCounters) { 1271 xmlRegCounter *tmp; 1272 ctxt->maxCounters *= 2; 1273 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters * 1274 sizeof(xmlRegCounter)); 1275 if (tmp == NULL) { 1276 xmlRegexpErrMemory(ctxt, "allocating counter"); 1277 ctxt->maxCounters /= 2; 1278 return(-1); 1279 } 1280 ctxt->counters = tmp; 1281 } 1282 ctxt->counters[ctxt->nbCounters].min = -1; 1283 ctxt->counters[ctxt->nbCounters].max = -1; 1284 return(ctxt->nbCounters++); 1285 } 1286 1287 static int 1288 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 1289 if (atom == NULL) { 1290 ERROR("atom push: atom is NULL"); 1291 return(-1); 1292 } 1293 if (ctxt->maxAtoms == 0) { 1294 ctxt->maxAtoms = 4; 1295 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms * 1296 sizeof(xmlRegAtomPtr)); 1297 if (ctxt->atoms == NULL) { 1298 xmlRegexpErrMemory(ctxt, "pushing atom"); 1299 ctxt->maxAtoms = 0; 1300 return(-1); 1301 } 1302 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) { 1303 xmlRegAtomPtr *tmp; 1304 ctxt->maxAtoms *= 2; 1305 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms * 1306 sizeof(xmlRegAtomPtr)); 1307 if (tmp == NULL) { 1308 xmlRegexpErrMemory(ctxt, "allocating counter"); 1309 ctxt->maxAtoms /= 2; 1310 return(-1); 1311 } 1312 ctxt->atoms = tmp; 1313 } 1314 atom->no = ctxt->nbAtoms; 1315 ctxt->atoms[ctxt->nbAtoms++] = atom; 1316 return(0); 1317 } 1318 1319 static void 1320 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target, 1321 int from) { 1322 if (target->maxTransTo == 0) { 1323 target->maxTransTo = 8; 1324 target->transTo = (int *) xmlMalloc(target->maxTransTo * 1325 sizeof(int)); 1326 if (target->transTo == NULL) { 1327 xmlRegexpErrMemory(ctxt, "adding transition"); 1328 target->maxTransTo = 0; 1329 return; 1330 } 1331 } else if (target->nbTransTo >= target->maxTransTo) { 1332 int *tmp; 1333 target->maxTransTo *= 2; 1334 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo * 1335 sizeof(int)); 1336 if (tmp == NULL) { 1337 xmlRegexpErrMemory(ctxt, "adding transition"); 1338 target->maxTransTo /= 2; 1339 return; 1340 } 1341 target->transTo = tmp; 1342 } 1343 target->transTo[target->nbTransTo] = from; 1344 target->nbTransTo++; 1345 } 1346 1347 static void 1348 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 1349 xmlRegAtomPtr atom, xmlRegStatePtr target, 1350 int counter, int count) { 1351 1352 int nrtrans; 1353 1354 if (state == NULL) { 1355 ERROR("add state: state is NULL"); 1356 return; 1357 } 1358 if (target == NULL) { 1359 ERROR("add state: target is NULL"); 1360 return; 1361 } 1362 /* 1363 * Other routines follow the philosophy 'When in doubt, add a transition' 1364 * so we check here whether such a transition is already present and, if 1365 * so, silently ignore this request. 1366 */ 1367 1368 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) { 1369 xmlRegTransPtr trans = &(state->trans[nrtrans]); 1370 if ((trans->atom == atom) && 1371 (trans->to == target->no) && 1372 (trans->counter == counter) && 1373 (trans->count == count)) { 1374 #ifdef DEBUG_REGEXP_GRAPH 1375 printf("Ignoring duplicate transition from %d to %d\n", 1376 state->no, target->no); 1377 #endif 1378 return; 1379 } 1380 } 1381 1382 if (state->maxTrans == 0) { 1383 state->maxTrans = 8; 1384 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans * 1385 sizeof(xmlRegTrans)); 1386 if (state->trans == NULL) { 1387 xmlRegexpErrMemory(ctxt, "adding transition"); 1388 state->maxTrans = 0; 1389 return; 1390 } 1391 } else if (state->nbTrans >= state->maxTrans) { 1392 xmlRegTrans *tmp; 1393 state->maxTrans *= 2; 1394 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans * 1395 sizeof(xmlRegTrans)); 1396 if (tmp == NULL) { 1397 xmlRegexpErrMemory(ctxt, "adding transition"); 1398 state->maxTrans /= 2; 1399 return; 1400 } 1401 state->trans = tmp; 1402 } 1403 #ifdef DEBUG_REGEXP_GRAPH 1404 printf("Add trans from %d to %d ", state->no, target->no); 1405 if (count == REGEXP_ALL_COUNTER) 1406 printf("all transition\n"); 1407 else if (count >= 0) 1408 printf("count based %d\n", count); 1409 else if (counter >= 0) 1410 printf("counted %d\n", counter); 1411 else if (atom == NULL) 1412 printf("epsilon transition\n"); 1413 else if (atom != NULL) 1414 xmlRegPrintAtom(stdout, atom); 1415 #endif 1416 1417 state->trans[state->nbTrans].atom = atom; 1418 state->trans[state->nbTrans].to = target->no; 1419 state->trans[state->nbTrans].counter = counter; 1420 state->trans[state->nbTrans].count = count; 1421 state->trans[state->nbTrans].nd = 0; 1422 state->nbTrans++; 1423 xmlRegStateAddTransTo(ctxt, target, state->no); 1424 } 1425 1426 static int 1427 xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { 1428 if (state == NULL) return(-1); 1429 if (ctxt->maxStates == 0) { 1430 ctxt->maxStates = 4; 1431 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates * 1432 sizeof(xmlRegStatePtr)); 1433 if (ctxt->states == NULL) { 1434 xmlRegexpErrMemory(ctxt, "adding state"); 1435 ctxt->maxStates = 0; 1436 return(-1); 1437 } 1438 } else if (ctxt->nbStates >= ctxt->maxStates) { 1439 xmlRegStatePtr *tmp; 1440 ctxt->maxStates *= 2; 1441 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates * 1442 sizeof(xmlRegStatePtr)); 1443 if (tmp == NULL) { 1444 xmlRegexpErrMemory(ctxt, "adding state"); 1445 ctxt->maxStates /= 2; 1446 return(-1); 1447 } 1448 ctxt->states = tmp; 1449 } 1450 state->no = ctxt->nbStates; 1451 ctxt->states[ctxt->nbStates++] = state; 1452 return(0); 1453 } 1454 1455 /** 1456 * xmlFAGenerateAllTransition: 1457 * @ctxt: a regexp parser context 1458 * @from: the from state 1459 * @to: the target state or NULL for building a new one 1460 * @lax: 1461 * 1462 */ 1463 static void 1464 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, 1465 xmlRegStatePtr from, xmlRegStatePtr to, 1466 int lax) { 1467 if (to == NULL) { 1468 to = xmlRegNewState(ctxt); 1469 xmlRegStatePush(ctxt, to); 1470 ctxt->state = to; 1471 } 1472 if (lax) 1473 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER); 1474 else 1475 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER); 1476 } 1477 1478 /** 1479 * xmlFAGenerateEpsilonTransition: 1480 * @ctxt: a regexp parser context 1481 * @from: the from state 1482 * @to: the target state or NULL for building a new one 1483 * 1484 */ 1485 static void 1486 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1487 xmlRegStatePtr from, xmlRegStatePtr to) { 1488 if (to == NULL) { 1489 to = xmlRegNewState(ctxt); 1490 xmlRegStatePush(ctxt, to); 1491 ctxt->state = to; 1492 } 1493 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1); 1494 } 1495 1496 /** 1497 * xmlFAGenerateCountedEpsilonTransition: 1498 * @ctxt: a regexp parser context 1499 * @from: the from state 1500 * @to: the target state or NULL for building a new one 1501 * counter: the counter for that transition 1502 * 1503 */ 1504 static void 1505 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1506 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1507 if (to == NULL) { 1508 to = xmlRegNewState(ctxt); 1509 xmlRegStatePush(ctxt, to); 1510 ctxt->state = to; 1511 } 1512 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1); 1513 } 1514 1515 /** 1516 * xmlFAGenerateCountedTransition: 1517 * @ctxt: a regexp parser context 1518 * @from: the from state 1519 * @to: the target state or NULL for building a new one 1520 * counter: the counter for that transition 1521 * 1522 */ 1523 static void 1524 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, 1525 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1526 if (to == NULL) { 1527 to = xmlRegNewState(ctxt); 1528 xmlRegStatePush(ctxt, to); 1529 ctxt->state = to; 1530 } 1531 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter); 1532 } 1533 1534 /** 1535 * xmlFAGenerateTransitions: 1536 * @ctxt: a regexp parser context 1537 * @from: the from state 1538 * @to: the target state or NULL for building a new one 1539 * @atom: the atom generating the transition 1540 * 1541 * Returns 0 if success and -1 in case of error. 1542 */ 1543 static int 1544 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, 1545 xmlRegStatePtr to, xmlRegAtomPtr atom) { 1546 xmlRegStatePtr end; 1547 1548 if (atom == NULL) { 1549 ERROR("genrate transition: atom == NULL"); 1550 return(-1); 1551 } 1552 if (atom->type == XML_REGEXP_SUBREG) { 1553 /* 1554 * this is a subexpression handling one should not need to 1555 * create a new node except for XML_REGEXP_QUANT_RANGE. 1556 */ 1557 if (xmlRegAtomPush(ctxt, atom) < 0) { 1558 return(-1); 1559 } 1560 if ((to != NULL) && (atom->stop != to) && 1561 (atom->quant != XML_REGEXP_QUANT_RANGE)) { 1562 /* 1563 * Generate an epsilon transition to link to the target 1564 */ 1565 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1566 #ifdef DV 1567 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) && 1568 (atom->quant != XML_REGEXP_QUANT_ONCE)) { 1569 to = xmlRegNewState(ctxt); 1570 xmlRegStatePush(ctxt, to); 1571 ctxt->state = to; 1572 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1573 #endif 1574 } 1575 switch (atom->quant) { 1576 case XML_REGEXP_QUANT_OPT: 1577 atom->quant = XML_REGEXP_QUANT_ONCE; 1578 /* 1579 * transition done to the state after end of atom. 1580 * 1. set transition from atom start to new state 1581 * 2. set transition from atom end to this state. 1582 */ 1583 if (to == NULL) { 1584 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0); 1585 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, 1586 ctxt->state); 1587 } else { 1588 xmlFAGenerateEpsilonTransition(ctxt, atom->start, to); 1589 } 1590 break; 1591 case XML_REGEXP_QUANT_MULT: 1592 atom->quant = XML_REGEXP_QUANT_ONCE; 1593 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1594 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1595 break; 1596 case XML_REGEXP_QUANT_PLUS: 1597 atom->quant = XML_REGEXP_QUANT_ONCE; 1598 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1599 break; 1600 case XML_REGEXP_QUANT_RANGE: { 1601 int counter; 1602 xmlRegStatePtr inter, newstate; 1603 1604 /* 1605 * create the final state now if needed 1606 */ 1607 if (to != NULL) { 1608 newstate = to; 1609 } else { 1610 newstate = xmlRegNewState(ctxt); 1611 xmlRegStatePush(ctxt, newstate); 1612 } 1613 1614 /* 1615 * The principle here is to use counted transition 1616 * to avoid explosion in the number of states in the 1617 * graph. This is clearly more complex but should not 1618 * be exploitable at runtime. 1619 */ 1620 if ((atom->min == 0) && (atom->start0 == NULL)) { 1621 xmlRegAtomPtr copy; 1622 /* 1623 * duplicate a transition based on atom to count next 1624 * occurences after 1. We cannot loop to atom->start 1625 * directly because we need an epsilon transition to 1626 * newstate. 1627 */ 1628 /* ???? For some reason it seems we never reach that 1629 case, I suppose this got optimized out before when 1630 building the automata */ 1631 copy = xmlRegCopyAtom(ctxt, atom); 1632 if (copy == NULL) 1633 return(-1); 1634 copy->quant = XML_REGEXP_QUANT_ONCE; 1635 copy->min = 0; 1636 copy->max = 0; 1637 1638 if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy) 1639 < 0) 1640 return(-1); 1641 inter = ctxt->state; 1642 counter = xmlRegGetCounter(ctxt); 1643 ctxt->counters[counter].min = atom->min - 1; 1644 ctxt->counters[counter].max = atom->max - 1; 1645 /* count the number of times we see it again */ 1646 xmlFAGenerateCountedEpsilonTransition(ctxt, inter, 1647 atom->stop, counter); 1648 /* allow a way out based on the count */ 1649 xmlFAGenerateCountedTransition(ctxt, inter, 1650 newstate, counter); 1651 /* and also allow a direct exit for 0 */ 1652 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 1653 newstate); 1654 } else { 1655 /* 1656 * either we need the atom at least once or there 1657 * is an atom->start0 allowing to easilly plug the 1658 * epsilon transition. 1659 */ 1660 counter = xmlRegGetCounter(ctxt); 1661 ctxt->counters[counter].min = atom->min - 1; 1662 ctxt->counters[counter].max = atom->max - 1; 1663 /* count the number of times we see it again */ 1664 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, 1665 atom->start, counter); 1666 /* allow a way out based on the count */ 1667 xmlFAGenerateCountedTransition(ctxt, atom->stop, 1668 newstate, counter); 1669 /* and if needed allow a direct exit for 0 */ 1670 if (atom->min == 0) 1671 xmlFAGenerateEpsilonTransition(ctxt, atom->start0, 1672 newstate); 1673 1674 } 1675 atom->min = 0; 1676 atom->max = 0; 1677 atom->quant = XML_REGEXP_QUANT_ONCE; 1678 ctxt->state = newstate; 1679 } 1680 default: 1681 break; 1682 } 1683 return(0); 1684 } 1685 if ((atom->min == 0) && (atom->max == 0) && 1686 (atom->quant == XML_REGEXP_QUANT_RANGE)) { 1687 /* 1688 * we can discard the atom and generate an epsilon transition instead 1689 */ 1690 if (to == NULL) { 1691 to = xmlRegNewState(ctxt); 1692 if (to != NULL) 1693 xmlRegStatePush(ctxt, to); 1694 else { 1695 return(-1); 1696 } 1697 } 1698 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1699 ctxt->state = to; 1700 xmlRegFreeAtom(atom); 1701 return(0); 1702 } 1703 if (to == NULL) { 1704 to = xmlRegNewState(ctxt); 1705 if (to != NULL) 1706 xmlRegStatePush(ctxt, to); 1707 else { 1708 return(-1); 1709 } 1710 } 1711 end = to; 1712 if ((atom->quant == XML_REGEXP_QUANT_MULT) || 1713 (atom->quant == XML_REGEXP_QUANT_PLUS)) { 1714 /* 1715 * Do not pollute the target state by adding transitions from 1716 * it as it is likely to be the shared target of multiple branches. 1717 * So isolate with an epsilon transition. 1718 */ 1719 xmlRegStatePtr tmp; 1720 1721 tmp = xmlRegNewState(ctxt); 1722 if (tmp != NULL) 1723 xmlRegStatePush(ctxt, tmp); 1724 else { 1725 return(-1); 1726 } 1727 xmlFAGenerateEpsilonTransition(ctxt, tmp, to); 1728 to = tmp; 1729 } 1730 if (xmlRegAtomPush(ctxt, atom) < 0) { 1731 return(-1); 1732 } 1733 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1); 1734 ctxt->state = end; 1735 switch (atom->quant) { 1736 case XML_REGEXP_QUANT_OPT: 1737 atom->quant = XML_REGEXP_QUANT_ONCE; 1738 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1739 break; 1740 case XML_REGEXP_QUANT_MULT: 1741 atom->quant = XML_REGEXP_QUANT_ONCE; 1742 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1743 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1744 break; 1745 case XML_REGEXP_QUANT_PLUS: 1746 atom->quant = XML_REGEXP_QUANT_ONCE; 1747 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1748 break; 1749 case XML_REGEXP_QUANT_RANGE: 1750 #if DV_test 1751 if (atom->min == 0) { 1752 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1753 } 1754 #endif 1755 break; 1756 default: 1757 break; 1758 } 1759 return(0); 1760 } 1761 1762 /** 1763 * xmlFAReduceEpsilonTransitions: 1764 * @ctxt: a regexp parser context 1765 * @fromnr: the from state 1766 * @tonr: the to state 1767 * @counter: should that transition be associated to a counted 1768 * 1769 */ 1770 static void 1771 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, 1772 int tonr, int counter) { 1773 int transnr; 1774 xmlRegStatePtr from; 1775 xmlRegStatePtr to; 1776 1777 #ifdef DEBUG_REGEXP_GRAPH 1778 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr); 1779 #endif 1780 from = ctxt->states[fromnr]; 1781 if (from == NULL) 1782 return; 1783 to = ctxt->states[tonr]; 1784 if (to == NULL) 1785 return; 1786 if ((to->mark == XML_REGEXP_MARK_START) || 1787 (to->mark == XML_REGEXP_MARK_VISITED)) 1788 return; 1789 1790 to->mark = XML_REGEXP_MARK_VISITED; 1791 if (to->type == XML_REGEXP_FINAL_STATE) { 1792 #ifdef DEBUG_REGEXP_GRAPH 1793 printf("State %d is final, so %d becomes final\n", tonr, fromnr); 1794 #endif 1795 from->type = XML_REGEXP_FINAL_STATE; 1796 } 1797 for (transnr = 0;transnr < to->nbTrans;transnr++) { 1798 if (to->trans[transnr].to < 0) 1799 continue; 1800 if (to->trans[transnr].atom == NULL) { 1801 /* 1802 * Don't remove counted transitions 1803 * Don't loop either 1804 */ 1805 if (to->trans[transnr].to != fromnr) { 1806 if (to->trans[transnr].count >= 0) { 1807 int newto = to->trans[transnr].to; 1808 1809 xmlRegStateAddTrans(ctxt, from, NULL, 1810 ctxt->states[newto], 1811 -1, to->trans[transnr].count); 1812 } else { 1813 #ifdef DEBUG_REGEXP_GRAPH 1814 printf("Found epsilon trans %d from %d to %d\n", 1815 transnr, tonr, to->trans[transnr].to); 1816 #endif 1817 if (to->trans[transnr].counter >= 0) { 1818 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1819 to->trans[transnr].to, 1820 to->trans[transnr].counter); 1821 } else { 1822 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1823 to->trans[transnr].to, 1824 counter); 1825 } 1826 } 1827 } 1828 } else { 1829 int newto = to->trans[transnr].to; 1830 1831 if (to->trans[transnr].counter >= 0) { 1832 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1833 ctxt->states[newto], 1834 to->trans[transnr].counter, -1); 1835 } else { 1836 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1837 ctxt->states[newto], counter, -1); 1838 } 1839 } 1840 } 1841 to->mark = XML_REGEXP_MARK_NORMAL; 1842 } 1843 1844 /** 1845 * xmlFAEliminateSimpleEpsilonTransitions: 1846 * @ctxt: a regexp parser context 1847 * 1848 * Eliminating general epsilon transitions can get costly in the general 1849 * algorithm due to the large amount of generated new transitions and 1850 * associated comparisons. However for simple epsilon transition used just 1851 * to separate building blocks when generating the automata this can be 1852 * reduced to state elimination: 1853 * - if there exists an epsilon from X to Y 1854 * - if there is no other transition from X 1855 * then X and Y are semantically equivalent and X can be eliminated 1856 * If X is the start state then make Y the start state, else replace the 1857 * target of all transitions to X by transitions to Y. 1858 */ 1859 static void 1860 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1861 int statenr, i, j, newto; 1862 xmlRegStatePtr state, tmp; 1863 1864 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1865 state = ctxt->states[statenr]; 1866 if (state == NULL) 1867 continue; 1868 if (state->nbTrans != 1) 1869 continue; 1870 if (state->type == XML_REGEXP_UNREACH_STATE) 1871 continue; 1872 /* is the only transition out a basic transition */ 1873 if ((state->trans[0].atom == NULL) && 1874 (state->trans[0].to >= 0) && 1875 (state->trans[0].to != statenr) && 1876 (state->trans[0].counter < 0) && 1877 (state->trans[0].count < 0)) { 1878 newto = state->trans[0].to; 1879 1880 if (state->type == XML_REGEXP_START_STATE) { 1881 #ifdef DEBUG_REGEXP_GRAPH 1882 printf("Found simple epsilon trans from start %d to %d\n", 1883 statenr, newto); 1884 #endif 1885 } else { 1886 #ifdef DEBUG_REGEXP_GRAPH 1887 printf("Found simple epsilon trans from %d to %d\n", 1888 statenr, newto); 1889 #endif 1890 for (i = 0;i < state->nbTransTo;i++) { 1891 tmp = ctxt->states[state->transTo[i]]; 1892 for (j = 0;j < tmp->nbTrans;j++) { 1893 if (tmp->trans[j].to == statenr) { 1894 #ifdef DEBUG_REGEXP_GRAPH 1895 printf("Changed transition %d on %d to go to %d\n", 1896 j, tmp->no, newto); 1897 #endif 1898 tmp->trans[j].to = -1; 1899 xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom, 1900 ctxt->states[newto], 1901 tmp->trans[j].counter, 1902 tmp->trans[j].count); 1903 } 1904 } 1905 } 1906 if (state->type == XML_REGEXP_FINAL_STATE) 1907 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE; 1908 /* eliminate the transition completely */ 1909 state->nbTrans = 0; 1910 1911 state->type = XML_REGEXP_UNREACH_STATE; 1912 1913 } 1914 1915 } 1916 } 1917 } 1918 /** 1919 * xmlFAEliminateEpsilonTransitions: 1920 * @ctxt: a regexp parser context 1921 * 1922 */ 1923 static void 1924 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1925 int statenr, transnr; 1926 xmlRegStatePtr state; 1927 int has_epsilon; 1928 1929 if (ctxt->states == NULL) return; 1930 1931 /* 1932 * Eliminate simple epsilon transition and the associated unreachable 1933 * states. 1934 */ 1935 xmlFAEliminateSimpleEpsilonTransitions(ctxt); 1936 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1937 state = ctxt->states[statenr]; 1938 if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) { 1939 #ifdef DEBUG_REGEXP_GRAPH 1940 printf("Removed unreachable state %d\n", statenr); 1941 #endif 1942 xmlRegFreeState(state); 1943 ctxt->states[statenr] = NULL; 1944 } 1945 } 1946 1947 has_epsilon = 0; 1948 1949 /* 1950 * Build the completed transitions bypassing the epsilons 1951 * Use a marking algorithm to avoid loops 1952 * Mark sink states too. 1953 * Process from the latests states backward to the start when 1954 * there is long cascading epsilon chains this minimize the 1955 * recursions and transition compares when adding the new ones 1956 */ 1957 for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) { 1958 state = ctxt->states[statenr]; 1959 if (state == NULL) 1960 continue; 1961 if ((state->nbTrans == 0) && 1962 (state->type != XML_REGEXP_FINAL_STATE)) { 1963 state->type = XML_REGEXP_SINK_STATE; 1964 } 1965 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1966 if ((state->trans[transnr].atom == NULL) && 1967 (state->trans[transnr].to >= 0)) { 1968 if (state->trans[transnr].to == statenr) { 1969 state->trans[transnr].to = -1; 1970 #ifdef DEBUG_REGEXP_GRAPH 1971 printf("Removed loopback epsilon trans %d on %d\n", 1972 transnr, statenr); 1973 #endif 1974 } else if (state->trans[transnr].count < 0) { 1975 int newto = state->trans[transnr].to; 1976 1977 #ifdef DEBUG_REGEXP_GRAPH 1978 printf("Found epsilon trans %d from %d to %d\n", 1979 transnr, statenr, newto); 1980 #endif 1981 has_epsilon = 1; 1982 state->trans[transnr].to = -2; 1983 state->mark = XML_REGEXP_MARK_START; 1984 xmlFAReduceEpsilonTransitions(ctxt, statenr, 1985 newto, state->trans[transnr].counter); 1986 state->mark = XML_REGEXP_MARK_NORMAL; 1987 #ifdef DEBUG_REGEXP_GRAPH 1988 } else { 1989 printf("Found counted transition %d on %d\n", 1990 transnr, statenr); 1991 #endif 1992 } 1993 } 1994 } 1995 } 1996 /* 1997 * Eliminate the epsilon transitions 1998 */ 1999 if (has_epsilon) { 2000 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2001 state = ctxt->states[statenr]; 2002 if (state == NULL) 2003 continue; 2004 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2005 xmlRegTransPtr trans = &(state->trans[transnr]); 2006 if ((trans->atom == NULL) && 2007 (trans->count < 0) && 2008 (trans->to >= 0)) { 2009 trans->to = -1; 2010 } 2011 } 2012 } 2013 } 2014 2015 /* 2016 * Use this pass to detect unreachable states too 2017 */ 2018 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2019 state = ctxt->states[statenr]; 2020 if (state != NULL) 2021 state->reached = XML_REGEXP_MARK_NORMAL; 2022 } 2023 state = ctxt->states[0]; 2024 if (state != NULL) 2025 state->reached = XML_REGEXP_MARK_START; 2026 while (state != NULL) { 2027 xmlRegStatePtr target = NULL; 2028 state->reached = XML_REGEXP_MARK_VISITED; 2029 /* 2030 * Mark all states reachable from the current reachable state 2031 */ 2032 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2033 if ((state->trans[transnr].to >= 0) && 2034 ((state->trans[transnr].atom != NULL) || 2035 (state->trans[transnr].count >= 0))) { 2036 int newto = state->trans[transnr].to; 2037 2038 if (ctxt->states[newto] == NULL) 2039 continue; 2040 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) { 2041 ctxt->states[newto]->reached = XML_REGEXP_MARK_START; 2042 target = ctxt->states[newto]; 2043 } 2044 } 2045 } 2046 2047 /* 2048 * find the next accessible state not explored 2049 */ 2050 if (target == NULL) { 2051 for (statenr = 1;statenr < ctxt->nbStates;statenr++) { 2052 state = ctxt->states[statenr]; 2053 if ((state != NULL) && (state->reached == 2054 XML_REGEXP_MARK_START)) { 2055 target = state; 2056 break; 2057 } 2058 } 2059 } 2060 state = target; 2061 } 2062 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2063 state = ctxt->states[statenr]; 2064 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) { 2065 #ifdef DEBUG_REGEXP_GRAPH 2066 printf("Removed unreachable state %d\n", statenr); 2067 #endif 2068 xmlRegFreeState(state); 2069 ctxt->states[statenr] = NULL; 2070 } 2071 } 2072 2073 } 2074 2075 static int 2076 xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) { 2077 int ret = 0; 2078 2079 if ((range1->type == XML_REGEXP_RANGES) || 2080 (range2->type == XML_REGEXP_RANGES) || 2081 (range2->type == XML_REGEXP_SUBREG) || 2082 (range1->type == XML_REGEXP_SUBREG) || 2083 (range1->type == XML_REGEXP_STRING) || 2084 (range2->type == XML_REGEXP_STRING)) 2085 return(-1); 2086 2087 /* put them in order */ 2088 if (range1->type > range2->type) { 2089 xmlRegRangePtr tmp; 2090 2091 tmp = range1; 2092 range1 = range2; 2093 range2 = tmp; 2094 } 2095 if ((range1->type == XML_REGEXP_ANYCHAR) || 2096 (range2->type == XML_REGEXP_ANYCHAR)) { 2097 ret = 1; 2098 } else if ((range1->type == XML_REGEXP_EPSILON) || 2099 (range2->type == XML_REGEXP_EPSILON)) { 2100 return(0); 2101 } else if (range1->type == range2->type) { 2102 if (range1->type != XML_REGEXP_CHARVAL) 2103 ret = 1; 2104 else if ((range1->end < range2->start) || 2105 (range2->end < range1->start)) 2106 ret = 0; 2107 else 2108 ret = 1; 2109 } else if (range1->type == XML_REGEXP_CHARVAL) { 2110 int codepoint; 2111 int neg = 0; 2112 2113 /* 2114 * just check all codepoints in the range for acceptance, 2115 * this is usually way cheaper since done only once at 2116 * compilation than testing over and over at runtime or 2117 * pushing too many states when evaluating. 2118 */ 2119 if (((range1->neg == 0) && (range2->neg != 0)) || 2120 ((range1->neg != 0) && (range2->neg == 0))) 2121 neg = 1; 2122 2123 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) { 2124 ret = xmlRegCheckCharacterRange(range2->type, codepoint, 2125 0, range2->start, range2->end, 2126 range2->blockName); 2127 if (ret < 0) 2128 return(-1); 2129 if (((neg == 1) && (ret == 0)) || 2130 ((neg == 0) && (ret == 1))) 2131 return(1); 2132 } 2133 return(0); 2134 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) || 2135 (range2->type == XML_REGEXP_BLOCK_NAME)) { 2136 if (range1->type == range2->type) { 2137 ret = xmlStrEqual(range1->blockName, range2->blockName); 2138 } else { 2139 /* 2140 * comparing a block range with anything else is way 2141 * too costly, and maintining the table is like too much 2142 * memory too, so let's force the automata to save state 2143 * here. 2144 */ 2145 return(1); 2146 } 2147 } else if ((range1->type < XML_REGEXP_LETTER) || 2148 (range2->type < XML_REGEXP_LETTER)) { 2149 if ((range1->type == XML_REGEXP_ANYSPACE) && 2150 (range2->type == XML_REGEXP_NOTSPACE)) 2151 ret = 0; 2152 else if ((range1->type == XML_REGEXP_INITNAME) && 2153 (range2->type == XML_REGEXP_NOTINITNAME)) 2154 ret = 0; 2155 else if ((range1->type == XML_REGEXP_NAMECHAR) && 2156 (range2->type == XML_REGEXP_NOTNAMECHAR)) 2157 ret = 0; 2158 else if ((range1->type == XML_REGEXP_DECIMAL) && 2159 (range2->type == XML_REGEXP_NOTDECIMAL)) 2160 ret = 0; 2161 else if ((range1->type == XML_REGEXP_REALCHAR) && 2162 (range2->type == XML_REGEXP_NOTREALCHAR)) 2163 ret = 0; 2164 else { 2165 /* same thing to limit complexity */ 2166 return(1); 2167 } 2168 } else { 2169 ret = 0; 2170 /* range1->type < range2->type here */ 2171 switch (range1->type) { 2172 case XML_REGEXP_LETTER: 2173 /* all disjoint except in the subgroups */ 2174 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) || 2175 (range2->type == XML_REGEXP_LETTER_LOWERCASE) || 2176 (range2->type == XML_REGEXP_LETTER_TITLECASE) || 2177 (range2->type == XML_REGEXP_LETTER_MODIFIER) || 2178 (range2->type == XML_REGEXP_LETTER_OTHERS)) 2179 ret = 1; 2180 break; 2181 case XML_REGEXP_MARK: 2182 if ((range2->type == XML_REGEXP_MARK_NONSPACING) || 2183 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) || 2184 (range2->type == XML_REGEXP_MARK_ENCLOSING)) 2185 ret = 1; 2186 break; 2187 case XML_REGEXP_NUMBER: 2188 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) || 2189 (range2->type == XML_REGEXP_NUMBER_LETTER) || 2190 (range2->type == XML_REGEXP_NUMBER_OTHERS)) 2191 ret = 1; 2192 break; 2193 case XML_REGEXP_PUNCT: 2194 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) || 2195 (range2->type == XML_REGEXP_PUNCT_DASH) || 2196 (range2->type == XML_REGEXP_PUNCT_OPEN) || 2197 (range2->type == XML_REGEXP_PUNCT_CLOSE) || 2198 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) || 2199 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) || 2200 (range2->type == XML_REGEXP_PUNCT_OTHERS)) 2201 ret = 1; 2202 break; 2203 case XML_REGEXP_SEPAR: 2204 if ((range2->type == XML_REGEXP_SEPAR_SPACE) || 2205 (range2->type == XML_REGEXP_SEPAR_LINE) || 2206 (range2->type == XML_REGEXP_SEPAR_PARA)) 2207 ret = 1; 2208 break; 2209 case XML_REGEXP_SYMBOL: 2210 if ((range2->type == XML_REGEXP_SYMBOL_MATH) || 2211 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) || 2212 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) || 2213 (range2->type == XML_REGEXP_SYMBOL_OTHERS)) 2214 ret = 1; 2215 break; 2216 case XML_REGEXP_OTHER: 2217 if ((range2->type == XML_REGEXP_OTHER_CONTROL) || 2218 (range2->type == XML_REGEXP_OTHER_FORMAT) || 2219 (range2->type == XML_REGEXP_OTHER_PRIVATE)) 2220 ret = 1; 2221 break; 2222 default: 2223 if ((range2->type >= XML_REGEXP_LETTER) && 2224 (range2->type < XML_REGEXP_BLOCK_NAME)) 2225 ret = 0; 2226 else { 2227 /* safety net ! */ 2228 return(1); 2229 } 2230 } 2231 } 2232 if (((range1->neg == 0) && (range2->neg != 0)) || 2233 ((range1->neg != 0) && (range2->neg == 0))) 2234 ret = !ret; 2235 return(ret); 2236 } 2237 2238 /** 2239 * xmlFACompareAtomTypes: 2240 * @type1: an atom type 2241 * @type2: an atom type 2242 * 2243 * Compares two atoms type to check whether they intersect in some ways, 2244 * this is used by xmlFACompareAtoms only 2245 * 2246 * Returns 1 if they may intersect and 0 otherwise 2247 */ 2248 static int 2249 xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) { 2250 if ((type1 == XML_REGEXP_EPSILON) || 2251 (type1 == XML_REGEXP_CHARVAL) || 2252 (type1 == XML_REGEXP_RANGES) || 2253 (type1 == XML_REGEXP_SUBREG) || 2254 (type1 == XML_REGEXP_STRING) || 2255 (type1 == XML_REGEXP_ANYCHAR)) 2256 return(1); 2257 if ((type2 == XML_REGEXP_EPSILON) || 2258 (type2 == XML_REGEXP_CHARVAL) || 2259 (type2 == XML_REGEXP_RANGES) || 2260 (type2 == XML_REGEXP_SUBREG) || 2261 (type2 == XML_REGEXP_STRING) || 2262 (type2 == XML_REGEXP_ANYCHAR)) 2263 return(1); 2264 2265 if (type1 == type2) return(1); 2266 2267 /* simplify subsequent compares by making sure type1 < type2 */ 2268 if (type1 > type2) { 2269 xmlRegAtomType tmp = type1; 2270 type1 = type2; 2271 type2 = tmp; 2272 } 2273 switch (type1) { 2274 case XML_REGEXP_ANYSPACE: /* \s */ 2275 /* can't be a letter, number, mark, pontuation, symbol */ 2276 if ((type2 == XML_REGEXP_NOTSPACE) || 2277 ((type2 >= XML_REGEXP_LETTER) && 2278 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2279 ((type2 >= XML_REGEXP_NUMBER) && 2280 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2281 ((type2 >= XML_REGEXP_MARK) && 2282 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2283 ((type2 >= XML_REGEXP_PUNCT) && 2284 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2285 ((type2 >= XML_REGEXP_SYMBOL) && 2286 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) 2287 ) return(0); 2288 break; 2289 case XML_REGEXP_NOTSPACE: /* \S */ 2290 break; 2291 case XML_REGEXP_INITNAME: /* \l */ 2292 /* can't be a number, mark, separator, pontuation, symbol or other */ 2293 if ((type2 == XML_REGEXP_NOTINITNAME) || 2294 ((type2 >= XML_REGEXP_NUMBER) && 2295 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2296 ((type2 >= XML_REGEXP_MARK) && 2297 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2298 ((type2 >= XML_REGEXP_SEPAR) && 2299 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2300 ((type2 >= XML_REGEXP_PUNCT) && 2301 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2302 ((type2 >= XML_REGEXP_SYMBOL) && 2303 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2304 ((type2 >= XML_REGEXP_OTHER) && 2305 (type2 <= XML_REGEXP_OTHER_NA)) 2306 ) return(0); 2307 break; 2308 case XML_REGEXP_NOTINITNAME: /* \L */ 2309 break; 2310 case XML_REGEXP_NAMECHAR: /* \c */ 2311 /* can't be a mark, separator, pontuation, symbol or other */ 2312 if ((type2 == XML_REGEXP_NOTNAMECHAR) || 2313 ((type2 >= XML_REGEXP_MARK) && 2314 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2315 ((type2 >= XML_REGEXP_PUNCT) && 2316 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2317 ((type2 >= XML_REGEXP_SEPAR) && 2318 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2319 ((type2 >= XML_REGEXP_SYMBOL) && 2320 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2321 ((type2 >= XML_REGEXP_OTHER) && 2322 (type2 <= XML_REGEXP_OTHER_NA)) 2323 ) return(0); 2324 break; 2325 case XML_REGEXP_NOTNAMECHAR: /* \C */ 2326 break; 2327 case XML_REGEXP_DECIMAL: /* \d */ 2328 /* can't be a letter, mark, separator, pontuation, symbol or other */ 2329 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2330 (type2 == XML_REGEXP_REALCHAR) || 2331 ((type2 >= XML_REGEXP_LETTER) && 2332 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2333 ((type2 >= XML_REGEXP_MARK) && 2334 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2335 ((type2 >= XML_REGEXP_PUNCT) && 2336 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2337 ((type2 >= XML_REGEXP_SEPAR) && 2338 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2339 ((type2 >= XML_REGEXP_SYMBOL) && 2340 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2341 ((type2 >= XML_REGEXP_OTHER) && 2342 (type2 <= XML_REGEXP_OTHER_NA)) 2343 )return(0); 2344 break; 2345 case XML_REGEXP_NOTDECIMAL: /* \D */ 2346 break; 2347 case XML_REGEXP_REALCHAR: /* \w */ 2348 /* can't be a mark, separator, pontuation, symbol or other */ 2349 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2350 ((type2 >= XML_REGEXP_MARK) && 2351 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2352 ((type2 >= XML_REGEXP_PUNCT) && 2353 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2354 ((type2 >= XML_REGEXP_SEPAR) && 2355 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2356 ((type2 >= XML_REGEXP_SYMBOL) && 2357 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2358 ((type2 >= XML_REGEXP_OTHER) && 2359 (type2 <= XML_REGEXP_OTHER_NA)) 2360 )return(0); 2361 break; 2362 case XML_REGEXP_NOTREALCHAR: /* \W */ 2363 break; 2364 /* 2365 * at that point we know both type 1 and type2 are from 2366 * character categories are ordered and are different, 2367 * it becomes simple because this is a partition 2368 */ 2369 case XML_REGEXP_LETTER: 2370 if (type2 <= XML_REGEXP_LETTER_OTHERS) 2371 return(1); 2372 return(0); 2373 case XML_REGEXP_LETTER_UPPERCASE: 2374 case XML_REGEXP_LETTER_LOWERCASE: 2375 case XML_REGEXP_LETTER_TITLECASE: 2376 case XML_REGEXP_LETTER_MODIFIER: 2377 case XML_REGEXP_LETTER_OTHERS: 2378 return(0); 2379 case XML_REGEXP_MARK: 2380 if (type2 <= XML_REGEXP_MARK_ENCLOSING) 2381 return(1); 2382 return(0); 2383 case XML_REGEXP_MARK_NONSPACING: 2384 case XML_REGEXP_MARK_SPACECOMBINING: 2385 case XML_REGEXP_MARK_ENCLOSING: 2386 return(0); 2387 case XML_REGEXP_NUMBER: 2388 if (type2 <= XML_REGEXP_NUMBER_OTHERS) 2389 return(1); 2390 return(0); 2391 case XML_REGEXP_NUMBER_DECIMAL: 2392 case XML_REGEXP_NUMBER_LETTER: 2393 case XML_REGEXP_NUMBER_OTHERS: 2394 return(0); 2395 case XML_REGEXP_PUNCT: 2396 if (type2 <= XML_REGEXP_PUNCT_OTHERS) 2397 return(1); 2398 return(0); 2399 case XML_REGEXP_PUNCT_CONNECTOR: 2400 case XML_REGEXP_PUNCT_DASH: 2401 case XML_REGEXP_PUNCT_OPEN: 2402 case XML_REGEXP_PUNCT_CLOSE: 2403 case XML_REGEXP_PUNCT_INITQUOTE: 2404 case XML_REGEXP_PUNCT_FINQUOTE: 2405 case XML_REGEXP_PUNCT_OTHERS: 2406 return(0); 2407 case XML_REGEXP_SEPAR: 2408 if (type2 <= XML_REGEXP_SEPAR_PARA) 2409 return(1); 2410 return(0); 2411 case XML_REGEXP_SEPAR_SPACE: 2412 case XML_REGEXP_SEPAR_LINE: 2413 case XML_REGEXP_SEPAR_PARA: 2414 return(0); 2415 case XML_REGEXP_SYMBOL: 2416 if (type2 <= XML_REGEXP_SYMBOL_OTHERS) 2417 return(1); 2418 return(0); 2419 case XML_REGEXP_SYMBOL_MATH: 2420 case XML_REGEXP_SYMBOL_CURRENCY: 2421 case XML_REGEXP_SYMBOL_MODIFIER: 2422 case XML_REGEXP_SYMBOL_OTHERS: 2423 return(0); 2424 case XML_REGEXP_OTHER: 2425 if (type2 <= XML_REGEXP_OTHER_NA) 2426 return(1); 2427 return(0); 2428 case XML_REGEXP_OTHER_CONTROL: 2429 case XML_REGEXP_OTHER_FORMAT: 2430 case XML_REGEXP_OTHER_PRIVATE: 2431 case XML_REGEXP_OTHER_NA: 2432 return(0); 2433 default: 2434 break; 2435 } 2436 return(1); 2437 } 2438 2439 /** 2440 * xmlFAEqualAtoms: 2441 * @atom1: an atom 2442 * @atom2: an atom 2443 * @deep: if not set only compare string pointers 2444 * 2445 * Compares two atoms to check whether they are the same exactly 2446 * this is used to remove equivalent transitions 2447 * 2448 * Returns 1 if same and 0 otherwise 2449 */ 2450 static int 2451 xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) { 2452 int ret = 0; 2453 2454 if (atom1 == atom2) 2455 return(1); 2456 if ((atom1 == NULL) || (atom2 == NULL)) 2457 return(0); 2458 2459 if (atom1->type != atom2->type) 2460 return(0); 2461 switch (atom1->type) { 2462 case XML_REGEXP_EPSILON: 2463 ret = 0; 2464 break; 2465 case XML_REGEXP_STRING: 2466 if (!deep) 2467 ret = (atom1->valuep == atom2->valuep); 2468 else 2469 ret = xmlStrEqual((xmlChar *)atom1->valuep, 2470 (xmlChar *)atom2->valuep); 2471 break; 2472 case XML_REGEXP_CHARVAL: 2473 ret = (atom1->codepoint == atom2->codepoint); 2474 break; 2475 case XML_REGEXP_RANGES: 2476 /* too hard to do in the general case */ 2477 ret = 0; 2478 default: 2479 break; 2480 } 2481 return(ret); 2482 } 2483 2484 /** 2485 * xmlFACompareAtoms: 2486 * @atom1: an atom 2487 * @atom2: an atom 2488 * @deep: if not set only compare string pointers 2489 * 2490 * Compares two atoms to check whether they intersect in some ways, 2491 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only 2492 * 2493 * Returns 1 if yes and 0 otherwise 2494 */ 2495 static int 2496 xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) { 2497 int ret = 1; 2498 2499 if (atom1 == atom2) 2500 return(1); 2501 if ((atom1 == NULL) || (atom2 == NULL)) 2502 return(0); 2503 2504 if ((atom1->type == XML_REGEXP_ANYCHAR) || 2505 (atom2->type == XML_REGEXP_ANYCHAR)) 2506 return(1); 2507 2508 if (atom1->type > atom2->type) { 2509 xmlRegAtomPtr tmp; 2510 tmp = atom1; 2511 atom1 = atom2; 2512 atom2 = tmp; 2513 } 2514 if (atom1->type != atom2->type) { 2515 ret = xmlFACompareAtomTypes(atom1->type, atom2->type); 2516 /* if they can't intersect at the type level break now */ 2517 if (ret == 0) 2518 return(0); 2519 } 2520 switch (atom1->type) { 2521 case XML_REGEXP_STRING: 2522 if (!deep) 2523 ret = (atom1->valuep != atom2->valuep); 2524 else 2525 ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep, 2526 (xmlChar *)atom2->valuep); 2527 break; 2528 case XML_REGEXP_EPSILON: 2529 goto not_determinist; 2530 case XML_REGEXP_CHARVAL: 2531 if (atom2->type == XML_REGEXP_CHARVAL) { 2532 ret = (atom1->codepoint == atom2->codepoint); 2533 } else { 2534 ret = xmlRegCheckCharacter(atom2, atom1->codepoint); 2535 if (ret < 0) 2536 ret = 1; 2537 } 2538 break; 2539 case XML_REGEXP_RANGES: 2540 if (atom2->type == XML_REGEXP_RANGES) { 2541 int i, j, res; 2542 xmlRegRangePtr r1, r2; 2543 2544 /* 2545 * need to check that none of the ranges eventually matches 2546 */ 2547 for (i = 0;i < atom1->nbRanges;i++) { 2548 for (j = 0;j < atom2->nbRanges;j++) { 2549 r1 = atom1->ranges[i]; 2550 r2 = atom2->ranges[j]; 2551 res = xmlFACompareRanges(r1, r2); 2552 if (res == 1) { 2553 ret = 1; 2554 goto done; 2555 } 2556 } 2557 } 2558 ret = 0; 2559 } 2560 break; 2561 default: 2562 goto not_determinist; 2563 } 2564 done: 2565 if (atom1->neg != atom2->neg) { 2566 ret = !ret; 2567 } 2568 if (ret == 0) 2569 return(0); 2570 not_determinist: 2571 return(1); 2572 } 2573 2574 /** 2575 * xmlFARecurseDeterminism: 2576 * @ctxt: a regexp parser context 2577 * 2578 * Check whether the associated regexp is determinist, 2579 * should be called after xmlFAEliminateEpsilonTransitions() 2580 * 2581 */ 2582 static int 2583 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 2584 int to, xmlRegAtomPtr atom) { 2585 int ret = 1; 2586 int res; 2587 int transnr, nbTrans; 2588 xmlRegTransPtr t1; 2589 int deep = 1; 2590 2591 if (state == NULL) 2592 return(ret); 2593 if (state->markd == XML_REGEXP_MARK_VISITED) 2594 return(ret); 2595 2596 if (ctxt->flags & AM_AUTOMATA_RNG) 2597 deep = 0; 2598 2599 /* 2600 * don't recurse on transitions potentially added in the course of 2601 * the elimination. 2602 */ 2603 nbTrans = state->nbTrans; 2604 for (transnr = 0;transnr < nbTrans;transnr++) { 2605 t1 = &(state->trans[transnr]); 2606 /* 2607 * check transitions conflicting with the one looked at 2608 */ 2609 if (t1->atom == NULL) { 2610 if (t1->to < 0) 2611 continue; 2612 state->markd = XML_REGEXP_MARK_VISITED; 2613 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2614 to, atom); 2615 state->markd = 0; 2616 if (res == 0) { 2617 ret = 0; 2618 /* t1->nd = 1; */ 2619 } 2620 continue; 2621 } 2622 if (t1->to != to) 2623 continue; 2624 if (xmlFACompareAtoms(t1->atom, atom, deep)) { 2625 ret = 0; 2626 /* mark the transition as non-deterministic */ 2627 t1->nd = 1; 2628 } 2629 } 2630 return(ret); 2631 } 2632 2633 /** 2634 * xmlFAComputesDeterminism: 2635 * @ctxt: a regexp parser context 2636 * 2637 * Check whether the associated regexp is determinist, 2638 * should be called after xmlFAEliminateEpsilonTransitions() 2639 * 2640 */ 2641 static int 2642 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { 2643 int statenr, transnr; 2644 xmlRegStatePtr state; 2645 xmlRegTransPtr t1, t2, last; 2646 int i; 2647 int ret = 1; 2648 int deep = 1; 2649 2650 #ifdef DEBUG_REGEXP_GRAPH 2651 printf("xmlFAComputesDeterminism\n"); 2652 xmlRegPrintCtxt(stdout, ctxt); 2653 #endif 2654 if (ctxt->determinist != -1) 2655 return(ctxt->determinist); 2656 2657 if (ctxt->flags & AM_AUTOMATA_RNG) 2658 deep = 0; 2659 2660 /* 2661 * First cleanup the automata removing cancelled transitions 2662 */ 2663 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2664 state = ctxt->states[statenr]; 2665 if (state == NULL) 2666 continue; 2667 if (state->nbTrans < 2) 2668 continue; 2669 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2670 t1 = &(state->trans[transnr]); 2671 /* 2672 * Determinism checks in case of counted or all transitions 2673 * will have to be handled separately 2674 */ 2675 if (t1->atom == NULL) { 2676 /* t1->nd = 1; */ 2677 continue; 2678 } 2679 if (t1->to == -1) /* eliminated */ 2680 continue; 2681 for (i = 0;i < transnr;i++) { 2682 t2 = &(state->trans[i]); 2683 if (t2->to == -1) /* eliminated */ 2684 continue; 2685 if (t2->atom != NULL) { 2686 if (t1->to == t2->to) { 2687 /* 2688 * Here we use deep because we want to keep the 2689 * transitions which indicate a conflict 2690 */ 2691 if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) && 2692 (t1->counter == t2->counter) && 2693 (t1->count == t2->count)) 2694 t2->to = -1; /* eliminated */ 2695 } 2696 } 2697 } 2698 } 2699 } 2700 2701 /* 2702 * Check for all states that there aren't 2 transitions 2703 * with the same atom and a different target. 2704 */ 2705 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2706 state = ctxt->states[statenr]; 2707 if (state == NULL) 2708 continue; 2709 if (state->nbTrans < 2) 2710 continue; 2711 last = NULL; 2712 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2713 t1 = &(state->trans[transnr]); 2714 /* 2715 * Determinism checks in case of counted or all transitions 2716 * will have to be handled separately 2717 */ 2718 if (t1->atom == NULL) { 2719 continue; 2720 } 2721 if (t1->to == -1) /* eliminated */ 2722 continue; 2723 for (i = 0;i < transnr;i++) { 2724 t2 = &(state->trans[i]); 2725 if (t2->to == -1) /* eliminated */ 2726 continue; 2727 if (t2->atom != NULL) { 2728 /* 2729 * But here we don't use deep because we want to 2730 * find transitions which indicate a conflict 2731 */ 2732 if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) { 2733 ret = 0; 2734 /* mark the transitions as non-deterministic ones */ 2735 t1->nd = 1; 2736 t2->nd = 1; 2737 last = t1; 2738 } 2739 } else if (t1->to != -1) { 2740 /* 2741 * do the closure in case of remaining specific 2742 * epsilon transitions like choices or all 2743 */ 2744 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2745 t2->to, t2->atom); 2746 /* don't shortcut the computation so all non deterministic 2747 transition get marked down 2748 if (ret == 0) 2749 return(0); 2750 */ 2751 if (ret == 0) { 2752 t1->nd = 1; 2753 /* t2->nd = 1; */ 2754 last = t1; 2755 } 2756 } 2757 } 2758 /* don't shortcut the computation so all non deterministic 2759 transition get marked down 2760 if (ret == 0) 2761 break; */ 2762 } 2763 2764 /* 2765 * mark specifically the last non-deterministic transition 2766 * from a state since there is no need to set-up rollback 2767 * from it 2768 */ 2769 if (last != NULL) { 2770 last->nd = 2; 2771 } 2772 2773 /* don't shortcut the computation so all non deterministic 2774 transition get marked down 2775 if (ret == 0) 2776 break; */ 2777 } 2778 2779 ctxt->determinist = ret; 2780 return(ret); 2781 } 2782 2783 /************************************************************************ 2784 * * 2785 * Routines to check input against transition atoms * 2786 * * 2787 ************************************************************************/ 2788 2789 static int 2790 xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg, 2791 int start, int end, const xmlChar *blockName) { 2792 int ret = 0; 2793 2794 switch (type) { 2795 case XML_REGEXP_STRING: 2796 case XML_REGEXP_SUBREG: 2797 case XML_REGEXP_RANGES: 2798 case XML_REGEXP_EPSILON: 2799 return(-1); 2800 case XML_REGEXP_ANYCHAR: 2801 ret = ((codepoint != '\n') && (codepoint != '\r')); 2802 break; 2803 case XML_REGEXP_CHARVAL: 2804 ret = ((codepoint >= start) && (codepoint <= end)); 2805 break; 2806 case XML_REGEXP_NOTSPACE: 2807 neg = !neg; 2808 case XML_REGEXP_ANYSPACE: 2809 ret = ((codepoint == '\n') || (codepoint == '\r') || 2810 (codepoint == '\t') || (codepoint == ' ')); 2811 break; 2812 case XML_REGEXP_NOTINITNAME: 2813 neg = !neg; 2814 case XML_REGEXP_INITNAME: 2815 ret = (IS_LETTER(codepoint) || 2816 (codepoint == '_') || (codepoint == ':')); 2817 break; 2818 case XML_REGEXP_NOTNAMECHAR: 2819 neg = !neg; 2820 case XML_REGEXP_NAMECHAR: 2821 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) || 2822 (codepoint == '.') || (codepoint == '-') || 2823 (codepoint == '_') || (codepoint == ':') || 2824 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint)); 2825 break; 2826 case XML_REGEXP_NOTDECIMAL: 2827 neg = !neg; 2828 case XML_REGEXP_DECIMAL: 2829 ret = xmlUCSIsCatNd(codepoint); 2830 break; 2831 case XML_REGEXP_REALCHAR: 2832 neg = !neg; 2833 case XML_REGEXP_NOTREALCHAR: 2834 ret = xmlUCSIsCatP(codepoint); 2835 if (ret == 0) 2836 ret = xmlUCSIsCatZ(codepoint); 2837 if (ret == 0) 2838 ret = xmlUCSIsCatC(codepoint); 2839 break; 2840 case XML_REGEXP_LETTER: 2841 ret = xmlUCSIsCatL(codepoint); 2842 break; 2843 case XML_REGEXP_LETTER_UPPERCASE: 2844 ret = xmlUCSIsCatLu(codepoint); 2845 break; 2846 case XML_REGEXP_LETTER_LOWERCASE: 2847 ret = xmlUCSIsCatLl(codepoint); 2848 break; 2849 case XML_REGEXP_LETTER_TITLECASE: 2850 ret = xmlUCSIsCatLt(codepoint); 2851 break; 2852 case XML_REGEXP_LETTER_MODIFIER: 2853 ret = xmlUCSIsCatLm(codepoint); 2854 break; 2855 case XML_REGEXP_LETTER_OTHERS: 2856 ret = xmlUCSIsCatLo(codepoint); 2857 break; 2858 case XML_REGEXP_MARK: 2859 ret = xmlUCSIsCatM(codepoint); 2860 break; 2861 case XML_REGEXP_MARK_NONSPACING: 2862 ret = xmlUCSIsCatMn(codepoint); 2863 break; 2864 case XML_REGEXP_MARK_SPACECOMBINING: 2865 ret = xmlUCSIsCatMc(codepoint); 2866 break; 2867 case XML_REGEXP_MARK_ENCLOSING: 2868 ret = xmlUCSIsCatMe(codepoint); 2869 break; 2870 case XML_REGEXP_NUMBER: 2871 ret = xmlUCSIsCatN(codepoint); 2872 break; 2873 case XML_REGEXP_NUMBER_DECIMAL: 2874 ret = xmlUCSIsCatNd(codepoint); 2875 break; 2876 case XML_REGEXP_NUMBER_LETTER: 2877 ret = xmlUCSIsCatNl(codepoint); 2878 break; 2879 case XML_REGEXP_NUMBER_OTHERS: 2880 ret = xmlUCSIsCatNo(codepoint); 2881 break; 2882 case XML_REGEXP_PUNCT: 2883 ret = xmlUCSIsCatP(codepoint); 2884 break; 2885 case XML_REGEXP_PUNCT_CONNECTOR: 2886 ret = xmlUCSIsCatPc(codepoint); 2887 break; 2888 case XML_REGEXP_PUNCT_DASH: 2889 ret = xmlUCSIsCatPd(codepoint); 2890 break; 2891 case XML_REGEXP_PUNCT_OPEN: 2892 ret = xmlUCSIsCatPs(codepoint); 2893 break; 2894 case XML_REGEXP_PUNCT_CLOSE: 2895 ret = xmlUCSIsCatPe(codepoint); 2896 break; 2897 case XML_REGEXP_PUNCT_INITQUOTE: 2898 ret = xmlUCSIsCatPi(codepoint); 2899 break; 2900 case XML_REGEXP_PUNCT_FINQUOTE: 2901 ret = xmlUCSIsCatPf(codepoint); 2902 break; 2903 case XML_REGEXP_PUNCT_OTHERS: 2904 ret = xmlUCSIsCatPo(codepoint); 2905 break; 2906 case XML_REGEXP_SEPAR: 2907 ret = xmlUCSIsCatZ(codepoint); 2908 break; 2909 case XML_REGEXP_SEPAR_SPACE: 2910 ret = xmlUCSIsCatZs(codepoint); 2911 break; 2912 case XML_REGEXP_SEPAR_LINE: 2913 ret = xmlUCSIsCatZl(codepoint); 2914 break; 2915 case XML_REGEXP_SEPAR_PARA: 2916 ret = xmlUCSIsCatZp(codepoint); 2917 break; 2918 case XML_REGEXP_SYMBOL: 2919 ret = xmlUCSIsCatS(codepoint); 2920 break; 2921 case XML_REGEXP_SYMBOL_MATH: 2922 ret = xmlUCSIsCatSm(codepoint); 2923 break; 2924 case XML_REGEXP_SYMBOL_CURRENCY: 2925 ret = xmlUCSIsCatSc(codepoint); 2926 break; 2927 case XML_REGEXP_SYMBOL_MODIFIER: 2928 ret = xmlUCSIsCatSk(codepoint); 2929 break; 2930 case XML_REGEXP_SYMBOL_OTHERS: 2931 ret = xmlUCSIsCatSo(codepoint); 2932 break; 2933 case XML_REGEXP_OTHER: 2934 ret = xmlUCSIsCatC(codepoint); 2935 break; 2936 case XML_REGEXP_OTHER_CONTROL: 2937 ret = xmlUCSIsCatCc(codepoint); 2938 break; 2939 case XML_REGEXP_OTHER_FORMAT: 2940 ret = xmlUCSIsCatCf(codepoint); 2941 break; 2942 case XML_REGEXP_OTHER_PRIVATE: 2943 ret = xmlUCSIsCatCo(codepoint); 2944 break; 2945 case XML_REGEXP_OTHER_NA: 2946 /* ret = xmlUCSIsCatCn(codepoint); */ 2947 /* Seems it doesn't exist anymore in recent Unicode releases */ 2948 ret = 0; 2949 break; 2950 case XML_REGEXP_BLOCK_NAME: 2951 ret = xmlUCSIsBlock(codepoint, (const char *) blockName); 2952 break; 2953 } 2954 if (neg) 2955 return(!ret); 2956 return(ret); 2957 } 2958 2959 static int 2960 xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) { 2961 int i, ret = 0; 2962 xmlRegRangePtr range; 2963 2964 if ((atom == NULL) || (!IS_CHAR(codepoint))) 2965 return(-1); 2966 2967 switch (atom->type) { 2968 case XML_REGEXP_SUBREG: 2969 case XML_REGEXP_EPSILON: 2970 return(-1); 2971 case XML_REGEXP_CHARVAL: 2972 return(codepoint == atom->codepoint); 2973 case XML_REGEXP_RANGES: { 2974 int accept = 0; 2975 2976 for (i = 0;i < atom->nbRanges;i++) { 2977 range = atom->ranges[i]; 2978 if (range->neg == 2) { 2979 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2980 0, range->start, range->end, 2981 range->blockName); 2982 if (ret != 0) 2983 return(0); /* excluded char */ 2984 } else if (range->neg) { 2985 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2986 0, range->start, range->end, 2987 range->blockName); 2988 if (ret == 0) 2989 accept = 1; 2990 else 2991 return(0); 2992 } else { 2993 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2994 0, range->start, range->end, 2995 range->blockName); 2996 if (ret != 0) 2997 accept = 1; /* might still be excluded */ 2998 } 2999 } 3000 return(accept); 3001 } 3002 case XML_REGEXP_STRING: 3003 printf("TODO: XML_REGEXP_STRING\n"); 3004 return(-1); 3005 case XML_REGEXP_ANYCHAR: 3006 case XML_REGEXP_ANYSPACE: 3007 case XML_REGEXP_NOTSPACE: 3008 case XML_REGEXP_INITNAME: 3009 case XML_REGEXP_NOTINITNAME: 3010 case XML_REGEXP_NAMECHAR: 3011 case XML_REGEXP_NOTNAMECHAR: 3012 case XML_REGEXP_DECIMAL: 3013 case XML_REGEXP_NOTDECIMAL: 3014 case XML_REGEXP_REALCHAR: 3015 case XML_REGEXP_NOTREALCHAR: 3016 case XML_REGEXP_LETTER: 3017 case XML_REGEXP_LETTER_UPPERCASE: 3018 case XML_REGEXP_LETTER_LOWERCASE: 3019 case XML_REGEXP_LETTER_TITLECASE: 3020 case XML_REGEXP_LETTER_MODIFIER: 3021 case XML_REGEXP_LETTER_OTHERS: 3022 case XML_REGEXP_MARK: 3023 case XML_REGEXP_MARK_NONSPACING: 3024 case XML_REGEXP_MARK_SPACECOMBINING: 3025 case XML_REGEXP_MARK_ENCLOSING: 3026 case XML_REGEXP_NUMBER: 3027 case XML_REGEXP_NUMBER_DECIMAL: 3028 case XML_REGEXP_NUMBER_LETTER: 3029 case XML_REGEXP_NUMBER_OTHERS: 3030 case XML_REGEXP_PUNCT: 3031 case XML_REGEXP_PUNCT_CONNECTOR: 3032 case XML_REGEXP_PUNCT_DASH: 3033 case XML_REGEXP_PUNCT_OPEN: 3034 case XML_REGEXP_PUNCT_CLOSE: 3035 case XML_REGEXP_PUNCT_INITQUOTE: 3036 case XML_REGEXP_PUNCT_FINQUOTE: 3037 case XML_REGEXP_PUNCT_OTHERS: 3038 case XML_REGEXP_SEPAR: 3039 case XML_REGEXP_SEPAR_SPACE: 3040 case XML_REGEXP_SEPAR_LINE: 3041 case XML_REGEXP_SEPAR_PARA: 3042 case XML_REGEXP_SYMBOL: 3043 case XML_REGEXP_SYMBOL_MATH: 3044 case XML_REGEXP_SYMBOL_CURRENCY: 3045 case XML_REGEXP_SYMBOL_MODIFIER: 3046 case XML_REGEXP_SYMBOL_OTHERS: 3047 case XML_REGEXP_OTHER: 3048 case XML_REGEXP_OTHER_CONTROL: 3049 case XML_REGEXP_OTHER_FORMAT: 3050 case XML_REGEXP_OTHER_PRIVATE: 3051 case XML_REGEXP_OTHER_NA: 3052 case XML_REGEXP_BLOCK_NAME: 3053 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0, 3054 (const xmlChar *)atom->valuep); 3055 if (atom->neg) 3056 ret = !ret; 3057 break; 3058 } 3059 return(ret); 3060 } 3061 3062 /************************************************************************ 3063 * * 3064 * Saving and restoring state of an execution context * 3065 * * 3066 ************************************************************************/ 3067 3068 #ifdef DEBUG_REGEXP_EXEC 3069 static void 3070 xmlFARegDebugExec(xmlRegExecCtxtPtr exec) { 3071 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index); 3072 if (exec->inputStack != NULL) { 3073 int i; 3074 printf(": "); 3075 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) 3076 printf("%s ", (const char *) 3077 exec->inputStack[exec->inputStackNr - (i + 1)].value); 3078 } else { 3079 printf(": %s", &(exec->inputString[exec->index])); 3080 } 3081 printf("\n"); 3082 } 3083 #endif 3084 3085 static void 3086 xmlFARegExecSave(xmlRegExecCtxtPtr exec) { 3087 #ifdef DEBUG_REGEXP_EXEC 3088 printf("saving "); 3089 exec->transno++; 3090 xmlFARegDebugExec(exec); 3091 exec->transno--; 3092 #endif 3093 #ifdef MAX_PUSH 3094 if (exec->nbPush > MAX_PUSH) { 3095 return; 3096 } 3097 exec->nbPush++; 3098 #endif 3099 3100 if (exec->maxRollbacks == 0) { 3101 exec->maxRollbacks = 4; 3102 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks * 3103 sizeof(xmlRegExecRollback)); 3104 if (exec->rollbacks == NULL) { 3105 xmlRegexpErrMemory(NULL, "saving regexp"); 3106 exec->maxRollbacks = 0; 3107 return; 3108 } 3109 memset(exec->rollbacks, 0, 3110 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 3111 } else if (exec->nbRollbacks >= exec->maxRollbacks) { 3112 xmlRegExecRollback *tmp; 3113 int len = exec->maxRollbacks; 3114 3115 exec->maxRollbacks *= 2; 3116 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks, 3117 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 3118 if (tmp == NULL) { 3119 xmlRegexpErrMemory(NULL, "saving regexp"); 3120 exec->maxRollbacks /= 2; 3121 return; 3122 } 3123 exec->rollbacks = tmp; 3124 tmp = &exec->rollbacks[len]; 3125 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback)); 3126 } 3127 exec->rollbacks[exec->nbRollbacks].state = exec->state; 3128 exec->rollbacks[exec->nbRollbacks].index = exec->index; 3129 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1; 3130 if (exec->comp->nbCounters > 0) { 3131 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3132 exec->rollbacks[exec->nbRollbacks].counts = (int *) 3133 xmlMalloc(exec->comp->nbCounters * sizeof(int)); 3134 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3135 xmlRegexpErrMemory(NULL, "saving regexp"); 3136 exec->status = -5; 3137 return; 3138 } 3139 } 3140 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts, 3141 exec->comp->nbCounters * sizeof(int)); 3142 } 3143 exec->nbRollbacks++; 3144 } 3145 3146 static void 3147 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) { 3148 if (exec->nbRollbacks <= 0) { 3149 exec->status = -1; 3150 #ifdef DEBUG_REGEXP_EXEC 3151 printf("rollback failed on empty stack\n"); 3152 #endif 3153 return; 3154 } 3155 exec->nbRollbacks--; 3156 exec->state = exec->rollbacks[exec->nbRollbacks].state; 3157 exec->index = exec->rollbacks[exec->nbRollbacks].index; 3158 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch; 3159 if (exec->comp->nbCounters > 0) { 3160 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3161 fprintf(stderr, "exec save: allocation failed"); 3162 exec->status = -6; 3163 return; 3164 } 3165 if (exec->counts) { 3166 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts, 3167 exec->comp->nbCounters * sizeof(int)); 3168 } 3169 } 3170 3171 #ifdef DEBUG_REGEXP_EXEC 3172 printf("restored "); 3173 xmlFARegDebugExec(exec); 3174 #endif 3175 } 3176 3177 /************************************************************************ 3178 * * 3179 * Verifier, running an input against a compiled regexp * 3180 * * 3181 ************************************************************************/ 3182 3183 static int 3184 xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { 3185 xmlRegExecCtxt execval; 3186 xmlRegExecCtxtPtr exec = &execval; 3187 int ret, codepoint = 0, len, deter; 3188 3189 exec->inputString = content; 3190 exec->index = 0; 3191 exec->nbPush = 0; 3192 exec->determinist = 1; 3193 exec->maxRollbacks = 0; 3194 exec->nbRollbacks = 0; 3195 exec->rollbacks = NULL; 3196 exec->status = 0; 3197 exec->comp = comp; 3198 exec->state = comp->states[0]; 3199 exec->transno = 0; 3200 exec->transcount = 0; 3201 exec->inputStack = NULL; 3202 exec->inputStackMax = 0; 3203 if (comp->nbCounters > 0) { 3204 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 3205 if (exec->counts == NULL) { 3206 xmlRegexpErrMemory(NULL, "running regexp"); 3207 return(-1); 3208 } 3209 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 3210 } else 3211 exec->counts = NULL; 3212 while ((exec->status == 0) && (exec->state != NULL) && 3213 ((exec->inputString[exec->index] != 0) || 3214 ((exec->state != NULL) && 3215 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3216 xmlRegTransPtr trans; 3217 xmlRegAtomPtr atom; 3218 3219 /* 3220 * If end of input on non-terminal state, rollback, however we may 3221 * still have epsilon like transition for counted transitions 3222 * on counters, in that case don't break too early. Additionally, 3223 * if we are working on a range like "AB{0,2}", where B is not present, 3224 * we don't want to break. 3225 */ 3226 len = 1; 3227 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) { 3228 /* 3229 * if there is a transition, we must check if 3230 * atom allows minOccurs of 0 3231 */ 3232 if (exec->transno < exec->state->nbTrans) { 3233 trans = &exec->state->trans[exec->transno]; 3234 if (trans->to >=0) { 3235 atom = trans->atom; 3236 if (!((atom->min == 0) && (atom->max > 0))) 3237 goto rollback; 3238 } 3239 } else 3240 goto rollback; 3241 } 3242 3243 exec->transcount = 0; 3244 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3245 trans = &exec->state->trans[exec->transno]; 3246 if (trans->to < 0) 3247 continue; 3248 atom = trans->atom; 3249 ret = 0; 3250 deter = 1; 3251 if (trans->count >= 0) { 3252 int count; 3253 xmlRegCounterPtr counter; 3254 3255 if (exec->counts == NULL) { 3256 exec->status = -1; 3257 goto error; 3258 } 3259 /* 3260 * A counted transition. 3261 */ 3262 3263 count = exec->counts[trans->count]; 3264 counter = &exec->comp->counters[trans->count]; 3265 #ifdef DEBUG_REGEXP_EXEC 3266 printf("testing count %d: val %d, min %d, max %d\n", 3267 trans->count, count, counter->min, counter->max); 3268 #endif 3269 ret = ((count >= counter->min) && (count <= counter->max)); 3270 if ((ret) && (counter->min != counter->max)) 3271 deter = 0; 3272 } else if (atom == NULL) { 3273 fprintf(stderr, "epsilon transition left at runtime\n"); 3274 exec->status = -2; 3275 break; 3276 } else if (exec->inputString[exec->index] != 0) { 3277 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 3278 ret = xmlRegCheckCharacter(atom, codepoint); 3279 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) { 3280 xmlRegStatePtr to = comp->states[trans->to]; 3281 3282 /* 3283 * this is a multiple input sequence 3284 * If there is a counter associated increment it now. 3285 * before potentially saving and rollback 3286 * do not increment if the counter is already over the 3287 * maximum limit in which case get to next transition 3288 */ 3289 if (trans->counter >= 0) { 3290 xmlRegCounterPtr counter; 3291 3292 if ((exec->counts == NULL) || 3293 (exec->comp == NULL) || 3294 (exec->comp->counters == NULL)) { 3295 exec->status = -1; 3296 goto error; 3297 } 3298 counter = &exec->comp->counters[trans->counter]; 3299 if (exec->counts[trans->counter] >= counter->max) 3300 continue; /* for loop on transitions */ 3301 3302 #ifdef DEBUG_REGEXP_EXEC 3303 printf("Increasing count %d\n", trans->counter); 3304 #endif 3305 exec->counts[trans->counter]++; 3306 } 3307 if (exec->state->nbTrans > exec->transno + 1) { 3308 xmlFARegExecSave(exec); 3309 } 3310 exec->transcount = 1; 3311 do { 3312 /* 3313 * Try to progress as much as possible on the input 3314 */ 3315 if (exec->transcount == atom->max) { 3316 break; 3317 } 3318 exec->index += len; 3319 /* 3320 * End of input: stop here 3321 */ 3322 if (exec->inputString[exec->index] == 0) { 3323 exec->index -= len; 3324 break; 3325 } 3326 if (exec->transcount >= atom->min) { 3327 int transno = exec->transno; 3328 xmlRegStatePtr state = exec->state; 3329 3330 /* 3331 * The transition is acceptable save it 3332 */ 3333 exec->transno = -1; /* trick */ 3334 exec->state = to; 3335 xmlFARegExecSave(exec); 3336 exec->transno = transno; 3337 exec->state = state; 3338 } 3339 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 3340 len); 3341 ret = xmlRegCheckCharacter(atom, codepoint); 3342 exec->transcount++; 3343 } while (ret == 1); 3344 if (exec->transcount < atom->min) 3345 ret = 0; 3346 3347 /* 3348 * If the last check failed but one transition was found 3349 * possible, rollback 3350 */ 3351 if (ret < 0) 3352 ret = 0; 3353 if (ret == 0) { 3354 goto rollback; 3355 } 3356 if (trans->counter >= 0) { 3357 if (exec->counts == NULL) { 3358 exec->status = -1; 3359 goto error; 3360 } 3361 #ifdef DEBUG_REGEXP_EXEC 3362 printf("Decreasing count %d\n", trans->counter); 3363 #endif 3364 exec->counts[trans->counter]--; 3365 } 3366 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) { 3367 /* 3368 * we don't match on the codepoint, but minOccurs of 0 3369 * says that's ok. Setting len to 0 inhibits stepping 3370 * over the codepoint. 3371 */ 3372 exec->transcount = 1; 3373 len = 0; 3374 ret = 1; 3375 } 3376 } else if ((atom->min == 0) && (atom->max > 0)) { 3377 /* another spot to match when minOccurs is 0 */ 3378 exec->transcount = 1; 3379 len = 0; 3380 ret = 1; 3381 } 3382 if (ret == 1) { 3383 if ((trans->nd == 1) || 3384 ((trans->count >= 0) && (deter == 0) && 3385 (exec->state->nbTrans > exec->transno + 1))) { 3386 #ifdef DEBUG_REGEXP_EXEC 3387 if (trans->nd == 1) 3388 printf("Saving on nd transition atom %d for %c at %d\n", 3389 trans->atom->no, codepoint, exec->index); 3390 else 3391 printf("Saving on counted transition count %d for %c at %d\n", 3392 trans->count, codepoint, exec->index); 3393 #endif 3394 xmlFARegExecSave(exec); 3395 } 3396 if (trans->counter >= 0) { 3397 xmlRegCounterPtr counter; 3398 3399 /* make sure we don't go over the counter maximum value */ 3400 if ((exec->counts == NULL) || 3401 (exec->comp == NULL) || 3402 (exec->comp->counters == NULL)) { 3403 exec->status = -1; 3404 goto error; 3405 } 3406 counter = &exec->comp->counters[trans->counter]; 3407 if (exec->counts[trans->counter] >= counter->max) 3408 continue; /* for loop on transitions */ 3409 #ifdef DEBUG_REGEXP_EXEC 3410 printf("Increasing count %d\n", trans->counter); 3411 #endif 3412 exec->counts[trans->counter]++; 3413 } 3414 if ((trans->count >= 0) && 3415 (trans->count < REGEXP_ALL_COUNTER)) { 3416 if (exec->counts == NULL) { 3417 exec->status = -1; 3418 goto error; 3419 } 3420 #ifdef DEBUG_REGEXP_EXEC 3421 printf("resetting count %d on transition\n", 3422 trans->count); 3423 #endif 3424 exec->counts[trans->count] = 0; 3425 } 3426 #ifdef DEBUG_REGEXP_EXEC 3427 printf("entering state %d\n", trans->to); 3428 #endif 3429 exec->state = comp->states[trans->to]; 3430 exec->transno = 0; 3431 if (trans->atom != NULL) { 3432 exec->index += len; 3433 } 3434 goto progress; 3435 } else if (ret < 0) { 3436 exec->status = -4; 3437 break; 3438 } 3439 } 3440 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 3441 rollback: 3442 /* 3443 * Failed to find a way out 3444 */ 3445 exec->determinist = 0; 3446 #ifdef DEBUG_REGEXP_EXEC 3447 printf("rollback from state %d on %d:%c\n", exec->state->no, 3448 codepoint,codepoint); 3449 #endif 3450 xmlFARegExecRollBack(exec); 3451 } 3452 progress: 3453 continue; 3454 } 3455 error: 3456 if (exec->rollbacks != NULL) { 3457 if (exec->counts != NULL) { 3458 int i; 3459 3460 for (i = 0;i < exec->maxRollbacks;i++) 3461 if (exec->rollbacks[i].counts != NULL) 3462 xmlFree(exec->rollbacks[i].counts); 3463 } 3464 xmlFree(exec->rollbacks); 3465 } 3466 if (exec->state == NULL) 3467 return(-1); 3468 if (exec->counts != NULL) 3469 xmlFree(exec->counts); 3470 if (exec->status == 0) 3471 return(1); 3472 if (exec->status == -1) { 3473 if (exec->nbPush > MAX_PUSH) 3474 return(-1); 3475 return(0); 3476 } 3477 return(exec->status); 3478 } 3479 3480 /************************************************************************ 3481 * * 3482 * Progressive interface to the verifier one atom at a time * 3483 * * 3484 ************************************************************************/ 3485 #ifdef DEBUG_ERR 3486 static void testerr(xmlRegExecCtxtPtr exec); 3487 #endif 3488 3489 /** 3490 * xmlRegNewExecCtxt: 3491 * @comp: a precompiled regular expression 3492 * @callback: a callback function used for handling progresses in the 3493 * automata matching phase 3494 * @data: the context data associated to the callback in this context 3495 * 3496 * Build a context used for progressive evaluation of a regexp. 3497 * 3498 * Returns the new context 3499 */ 3500 xmlRegExecCtxtPtr 3501 xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) { 3502 xmlRegExecCtxtPtr exec; 3503 3504 if (comp == NULL) 3505 return(NULL); 3506 if ((comp->compact == NULL) && (comp->states == NULL)) 3507 return(NULL); 3508 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt)); 3509 if (exec == NULL) { 3510 xmlRegexpErrMemory(NULL, "creating execution context"); 3511 return(NULL); 3512 } 3513 memset(exec, 0, sizeof(xmlRegExecCtxt)); 3514 exec->inputString = NULL; 3515 exec->index = 0; 3516 exec->determinist = 1; 3517 exec->maxRollbacks = 0; 3518 exec->nbRollbacks = 0; 3519 exec->rollbacks = NULL; 3520 exec->status = 0; 3521 exec->comp = comp; 3522 if (comp->compact == NULL) 3523 exec->state = comp->states[0]; 3524 exec->transno = 0; 3525 exec->transcount = 0; 3526 exec->callback = callback; 3527 exec->data = data; 3528 if (comp->nbCounters > 0) { 3529 /* 3530 * For error handling, exec->counts is allocated twice the size 3531 * the second half is used to store the data in case of rollback 3532 */ 3533 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int) 3534 * 2); 3535 if (exec->counts == NULL) { 3536 xmlRegexpErrMemory(NULL, "creating execution context"); 3537 xmlFree(exec); 3538 return(NULL); 3539 } 3540 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2); 3541 exec->errCounts = &exec->counts[comp->nbCounters]; 3542 } else { 3543 exec->counts = NULL; 3544 exec->errCounts = NULL; 3545 } 3546 exec->inputStackMax = 0; 3547 exec->inputStackNr = 0; 3548 exec->inputStack = NULL; 3549 exec->errStateNo = -1; 3550 exec->errString = NULL; 3551 exec->nbPush = 0; 3552 return(exec); 3553 } 3554 3555 /** 3556 * xmlRegFreeExecCtxt: 3557 * @exec: a regular expression evaulation context 3558 * 3559 * Free the structures associated to a regular expression evaulation context. 3560 */ 3561 void 3562 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) { 3563 if (exec == NULL) 3564 return; 3565 3566 if (exec->rollbacks != NULL) { 3567 if (exec->counts != NULL) { 3568 int i; 3569 3570 for (i = 0;i < exec->maxRollbacks;i++) 3571 if (exec->rollbacks[i].counts != NULL) 3572 xmlFree(exec->rollbacks[i].counts); 3573 } 3574 xmlFree(exec->rollbacks); 3575 } 3576 if (exec->counts != NULL) 3577 xmlFree(exec->counts); 3578 if (exec->inputStack != NULL) { 3579 int i; 3580 3581 for (i = 0;i < exec->inputStackNr;i++) { 3582 if (exec->inputStack[i].value != NULL) 3583 xmlFree(exec->inputStack[i].value); 3584 } 3585 xmlFree(exec->inputStack); 3586 } 3587 if (exec->errString != NULL) 3588 xmlFree(exec->errString); 3589 xmlFree(exec); 3590 } 3591 3592 static void 3593 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, 3594 void *data) { 3595 #ifdef DEBUG_PUSH 3596 printf("saving value: %d:%s\n", exec->inputStackNr, value); 3597 #endif 3598 if (exec->inputStackMax == 0) { 3599 exec->inputStackMax = 4; 3600 exec->inputStack = (xmlRegInputTokenPtr) 3601 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken)); 3602 if (exec->inputStack == NULL) { 3603 xmlRegexpErrMemory(NULL, "pushing input string"); 3604 exec->inputStackMax = 0; 3605 return; 3606 } 3607 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) { 3608 xmlRegInputTokenPtr tmp; 3609 3610 exec->inputStackMax *= 2; 3611 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack, 3612 exec->inputStackMax * sizeof(xmlRegInputToken)); 3613 if (tmp == NULL) { 3614 xmlRegexpErrMemory(NULL, "pushing input string"); 3615 exec->inputStackMax /= 2; 3616 return; 3617 } 3618 exec->inputStack = tmp; 3619 } 3620 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value); 3621 exec->inputStack[exec->inputStackNr].data = data; 3622 exec->inputStackNr++; 3623 exec->inputStack[exec->inputStackNr].value = NULL; 3624 exec->inputStack[exec->inputStackNr].data = NULL; 3625 } 3626 3627 /** 3628 * xmlRegStrEqualWildcard: 3629 * @expStr: the string to be evaluated 3630 * @valStr: the validation string 3631 * 3632 * Checks if both strings are equal or have the same content. "*" 3633 * can be used as a wildcard in @valStr; "|" is used as a seperator of 3634 * substrings in both @expStr and @valStr. 3635 * 3636 * Returns 1 if the comparison is satisfied and the number of substrings 3637 * is equal, 0 otherwise. 3638 */ 3639 3640 static int 3641 xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) { 3642 if (expStr == valStr) return(1); 3643 if (expStr == NULL) return(0); 3644 if (valStr == NULL) return(0); 3645 do { 3646 /* 3647 * Eval if we have a wildcard for the current item. 3648 */ 3649 if (*expStr != *valStr) { 3650 /* if one of them starts with a wildcard make valStr be it */ 3651 if (*valStr == '*') { 3652 const xmlChar *tmp; 3653 3654 tmp = valStr; 3655 valStr = expStr; 3656 expStr = tmp; 3657 } 3658 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) { 3659 do { 3660 if (*valStr == XML_REG_STRING_SEPARATOR) 3661 break; 3662 valStr++; 3663 } while (*valStr != 0); 3664 continue; 3665 } else 3666 return(0); 3667 } 3668 expStr++; 3669 valStr++; 3670 } while (*valStr != 0); 3671 if (*expStr != 0) 3672 return (0); 3673 else 3674 return (1); 3675 } 3676 3677 /** 3678 * xmlRegCompactPushString: 3679 * @exec: a regexp execution context 3680 * @comp: the precompiled exec with a compact table 3681 * @value: a string token input 3682 * @data: data associated to the token to reuse in callbacks 3683 * 3684 * Push one input token in the execution context 3685 * 3686 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3687 * a negative value in case of error. 3688 */ 3689 static int 3690 xmlRegCompactPushString(xmlRegExecCtxtPtr exec, 3691 xmlRegexpPtr comp, 3692 const xmlChar *value, 3693 void *data) { 3694 int state = exec->index; 3695 int i, target; 3696 3697 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL)) 3698 return(-1); 3699 3700 if (value == NULL) { 3701 /* 3702 * are we at a final state ? 3703 */ 3704 if (comp->compact[state * (comp->nbstrings + 1)] == 3705 XML_REGEXP_FINAL_STATE) 3706 return(1); 3707 return(0); 3708 } 3709 3710 #ifdef DEBUG_PUSH 3711 printf("value pushed: %s\n", value); 3712 #endif 3713 3714 /* 3715 * Examine all outside transitions from current state 3716 */ 3717 for (i = 0;i < comp->nbstrings;i++) { 3718 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 3719 if ((target > 0) && (target <= comp->nbstates)) { 3720 target--; /* to avoid 0 */ 3721 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) { 3722 exec->index = target; 3723 if ((exec->callback != NULL) && (comp->transdata != NULL)) { 3724 exec->callback(exec->data, value, 3725 comp->transdata[state * comp->nbstrings + i], data); 3726 } 3727 #ifdef DEBUG_PUSH 3728 printf("entering state %d\n", target); 3729 #endif 3730 if (comp->compact[target * (comp->nbstrings + 1)] == 3731 XML_REGEXP_SINK_STATE) 3732 goto error; 3733 3734 if (comp->compact[target * (comp->nbstrings + 1)] == 3735 XML_REGEXP_FINAL_STATE) 3736 return(1); 3737 return(0); 3738 } 3739 } 3740 } 3741 /* 3742 * Failed to find an exit transition out from current state for the 3743 * current token 3744 */ 3745 #ifdef DEBUG_PUSH 3746 printf("failed to find a transition for %s on state %d\n", value, state); 3747 #endif 3748 error: 3749 if (exec->errString != NULL) 3750 xmlFree(exec->errString); 3751 exec->errString = xmlStrdup(value); 3752 exec->errStateNo = state; 3753 exec->status = -1; 3754 #ifdef DEBUG_ERR 3755 testerr(exec); 3756 #endif 3757 return(-1); 3758 } 3759 3760 /** 3761 * xmlRegExecPushStringInternal: 3762 * @exec: a regexp execution context or NULL to indicate the end 3763 * @value: a string token input 3764 * @data: data associated to the token to reuse in callbacks 3765 * @compound: value was assembled from 2 strings 3766 * 3767 * Push one input token in the execution context 3768 * 3769 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3770 * a negative value in case of error. 3771 */ 3772 static int 3773 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value, 3774 void *data, int compound) { 3775 xmlRegTransPtr trans; 3776 xmlRegAtomPtr atom; 3777 int ret; 3778 int final = 0; 3779 int progress = 1; 3780 3781 if (exec == NULL) 3782 return(-1); 3783 if (exec->comp == NULL) 3784 return(-1); 3785 if (exec->status != 0) 3786 return(exec->status); 3787 3788 if (exec->comp->compact != NULL) 3789 return(xmlRegCompactPushString(exec, exec->comp, value, data)); 3790 3791 if (value == NULL) { 3792 if (exec->state->type == XML_REGEXP_FINAL_STATE) 3793 return(1); 3794 final = 1; 3795 } 3796 3797 #ifdef DEBUG_PUSH 3798 printf("value pushed: %s\n", value); 3799 #endif 3800 /* 3801 * If we have an active rollback stack push the new value there 3802 * and get back to where we were left 3803 */ 3804 if ((value != NULL) && (exec->inputStackNr > 0)) { 3805 xmlFARegExecSaveInputString(exec, value, data); 3806 value = exec->inputStack[exec->index].value; 3807 data = exec->inputStack[exec->index].data; 3808 #ifdef DEBUG_PUSH 3809 printf("value loaded: %s\n", value); 3810 #endif 3811 } 3812 3813 while ((exec->status == 0) && 3814 ((value != NULL) || 3815 ((final == 1) && 3816 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3817 3818 /* 3819 * End of input on non-terminal state, rollback, however we may 3820 * still have epsilon like transition for counted transitions 3821 * on counters, in that case don't break too early. 3822 */ 3823 if ((value == NULL) && (exec->counts == NULL)) 3824 goto rollback; 3825 3826 exec->transcount = 0; 3827 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3828 trans = &exec->state->trans[exec->transno]; 3829 if (trans->to < 0) 3830 continue; 3831 atom = trans->atom; 3832 ret = 0; 3833 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 3834 int i; 3835 int count; 3836 xmlRegTransPtr t; 3837 xmlRegCounterPtr counter; 3838 3839 ret = 0; 3840 3841 #ifdef DEBUG_PUSH 3842 printf("testing all lax %d\n", trans->count); 3843 #endif 3844 /* 3845 * Check all counted transitions from the current state 3846 */ 3847 if ((value == NULL) && (final)) { 3848 ret = 1; 3849 } else if (value != NULL) { 3850 for (i = 0;i < exec->state->nbTrans;i++) { 3851 t = &exec->state->trans[i]; 3852 if ((t->counter < 0) || (t == trans)) 3853 continue; 3854 counter = &exec->comp->counters[t->counter]; 3855 count = exec->counts[t->counter]; 3856 if ((count < counter->max) && 3857 (t->atom != NULL) && 3858 (xmlStrEqual(value, t->atom->valuep))) { 3859 ret = 0; 3860 break; 3861 } 3862 if ((count >= counter->min) && 3863 (count < counter->max) && 3864 (t->atom != NULL) && 3865 (xmlStrEqual(value, t->atom->valuep))) { 3866 ret = 1; 3867 break; 3868 } 3869 } 3870 } 3871 } else if (trans->count == REGEXP_ALL_COUNTER) { 3872 int i; 3873 int count; 3874 xmlRegTransPtr t; 3875 xmlRegCounterPtr counter; 3876 3877 ret = 1; 3878 3879 #ifdef DEBUG_PUSH 3880 printf("testing all %d\n", trans->count); 3881 #endif 3882 /* 3883 * Check all counted transitions from the current state 3884 */ 3885 for (i = 0;i < exec->state->nbTrans;i++) { 3886 t = &exec->state->trans[i]; 3887 if ((t->counter < 0) || (t == trans)) 3888 continue; 3889 counter = &exec->comp->counters[t->counter]; 3890 count = exec->counts[t->counter]; 3891 if ((count < counter->min) || (count > counter->max)) { 3892 ret = 0; 3893 break; 3894 } 3895 } 3896 } else if (trans->count >= 0) { 3897 int count; 3898 xmlRegCounterPtr counter; 3899 3900 /* 3901 * A counted transition. 3902 */ 3903 3904 count = exec->counts[trans->count]; 3905 counter = &exec->comp->counters[trans->count]; 3906 #ifdef DEBUG_PUSH 3907 printf("testing count %d: val %d, min %d, max %d\n", 3908 trans->count, count, counter->min, counter->max); 3909 #endif 3910 ret = ((count >= counter->min) && (count <= counter->max)); 3911 } else if (atom == NULL) { 3912 fprintf(stderr, "epsilon transition left at runtime\n"); 3913 exec->status = -2; 3914 break; 3915 } else if (value != NULL) { 3916 ret = xmlRegStrEqualWildcard(atom->valuep, value); 3917 if (atom->neg) { 3918 ret = !ret; 3919 if (!compound) 3920 ret = 0; 3921 } 3922 if ((ret == 1) && (trans->counter >= 0)) { 3923 xmlRegCounterPtr counter; 3924 int count; 3925 3926 count = exec->counts[trans->counter]; 3927 counter = &exec->comp->counters[trans->counter]; 3928 if (count >= counter->max) 3929 ret = 0; 3930 } 3931 3932 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 3933 xmlRegStatePtr to = exec->comp->states[trans->to]; 3934 3935 /* 3936 * this is a multiple input sequence 3937 */ 3938 if (exec->state->nbTrans > exec->transno + 1) { 3939 if (exec->inputStackNr <= 0) { 3940 xmlFARegExecSaveInputString(exec, value, data); 3941 } 3942 xmlFARegExecSave(exec); 3943 } 3944 exec->transcount = 1; 3945 do { 3946 /* 3947 * Try to progress as much as possible on the input 3948 */ 3949 if (exec->transcount == atom->max) { 3950 break; 3951 } 3952 exec->index++; 3953 value = exec->inputStack[exec->index].value; 3954 data = exec->inputStack[exec->index].data; 3955 #ifdef DEBUG_PUSH 3956 printf("value loaded: %s\n", value); 3957 #endif 3958 3959 /* 3960 * End of input: stop here 3961 */ 3962 if (value == NULL) { 3963 exec->index --; 3964 break; 3965 } 3966 if (exec->transcount >= atom->min) { 3967 int transno = exec->transno; 3968 xmlRegStatePtr state = exec->state; 3969 3970 /* 3971 * The transition is acceptable save it 3972 */ 3973 exec->transno = -1; /* trick */ 3974 exec->state = to; 3975 if (exec->inputStackNr <= 0) { 3976 xmlFARegExecSaveInputString(exec, value, data); 3977 } 3978 xmlFARegExecSave(exec); 3979 exec->transno = transno; 3980 exec->state = state; 3981 } 3982 ret = xmlStrEqual(value, atom->valuep); 3983 exec->transcount++; 3984 } while (ret == 1); 3985 if (exec->transcount < atom->min) 3986 ret = 0; 3987 3988 /* 3989 * If the last check failed but one transition was found 3990 * possible, rollback 3991 */ 3992 if (ret < 0) 3993 ret = 0; 3994 if (ret == 0) { 3995 goto rollback; 3996 } 3997 } 3998 } 3999 if (ret == 1) { 4000 if ((exec->callback != NULL) && (atom != NULL) && 4001 (data != NULL)) { 4002 exec->callback(exec->data, atom->valuep, 4003 atom->data, data); 4004 } 4005 if (exec->state->nbTrans > exec->transno + 1) { 4006 if (exec->inputStackNr <= 0) { 4007 xmlFARegExecSaveInputString(exec, value, data); 4008 } 4009 xmlFARegExecSave(exec); 4010 } 4011 if (trans->counter >= 0) { 4012 #ifdef DEBUG_PUSH 4013 printf("Increasing count %d\n", trans->counter); 4014 #endif 4015 exec->counts[trans->counter]++; 4016 } 4017 if ((trans->count >= 0) && 4018 (trans->count < REGEXP_ALL_COUNTER)) { 4019 #ifdef DEBUG_REGEXP_EXEC 4020 printf("resetting count %d on transition\n", 4021 trans->count); 4022 #endif 4023 exec->counts[trans->count] = 0; 4024 } 4025 #ifdef DEBUG_PUSH 4026 printf("entering state %d\n", trans->to); 4027 #endif 4028 if ((exec->comp->states[trans->to] != NULL) && 4029 (exec->comp->states[trans->to]->type == 4030 XML_REGEXP_SINK_STATE)) { 4031 /* 4032 * entering a sink state, save the current state as error 4033 * state. 4034 */ 4035 if (exec->errString != NULL) 4036 xmlFree(exec->errString); 4037 exec->errString = xmlStrdup(value); 4038 exec->errState = exec->state; 4039 memcpy(exec->errCounts, exec->counts, 4040 exec->comp->nbCounters * sizeof(int)); 4041 } 4042 exec->state = exec->comp->states[trans->to]; 4043 exec->transno = 0; 4044 if (trans->atom != NULL) { 4045 if (exec->inputStack != NULL) { 4046 exec->index++; 4047 if (exec->index < exec->inputStackNr) { 4048 value = exec->inputStack[exec->index].value; 4049 data = exec->inputStack[exec->index].data; 4050 #ifdef DEBUG_PUSH 4051 printf("value loaded: %s\n", value); 4052 #endif 4053 } else { 4054 value = NULL; 4055 data = NULL; 4056 #ifdef DEBUG_PUSH 4057 printf("end of input\n"); 4058 #endif 4059 } 4060 } else { 4061 value = NULL; 4062 data = NULL; 4063 #ifdef DEBUG_PUSH 4064 printf("end of input\n"); 4065 #endif 4066 } 4067 } 4068 goto progress; 4069 } else if (ret < 0) { 4070 exec->status = -4; 4071 break; 4072 } 4073 } 4074 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4075 rollback: 4076 /* 4077 * if we didn't yet rollback on the current input 4078 * store the current state as the error state. 4079 */ 4080 if ((progress) && (exec->state != NULL) && 4081 (exec->state->type != XML_REGEXP_SINK_STATE)) { 4082 progress = 0; 4083 if (exec->errString != NULL) 4084 xmlFree(exec->errString); 4085 exec->errString = xmlStrdup(value); 4086 exec->errState = exec->state; 4087 memcpy(exec->errCounts, exec->counts, 4088 exec->comp->nbCounters * sizeof(int)); 4089 } 4090 4091 /* 4092 * Failed to find a way out 4093 */ 4094 exec->determinist = 0; 4095 xmlFARegExecRollBack(exec); 4096 if ((exec->inputStack != NULL ) && (exec->status == 0)) { 4097 value = exec->inputStack[exec->index].value; 4098 data = exec->inputStack[exec->index].data; 4099 #ifdef DEBUG_PUSH 4100 printf("value loaded: %s\n", value); 4101 #endif 4102 } 4103 } 4104 continue; 4105 progress: 4106 progress = 1; 4107 continue; 4108 } 4109 if (exec->status == 0) { 4110 return(exec->state->type == XML_REGEXP_FINAL_STATE); 4111 } 4112 #ifdef DEBUG_ERR 4113 if (exec->status < 0) { 4114 testerr(exec); 4115 } 4116 #endif 4117 return(exec->status); 4118 } 4119 4120 /** 4121 * xmlRegExecPushString: 4122 * @exec: a regexp execution context or NULL to indicate the end 4123 * @value: a string token input 4124 * @data: data associated to the token to reuse in callbacks 4125 * 4126 * Push one input token in the execution context 4127 * 4128 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 4129 * a negative value in case of error. 4130 */ 4131 int 4132 xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, 4133 void *data) { 4134 return(xmlRegExecPushStringInternal(exec, value, data, 0)); 4135 } 4136 4137 /** 4138 * xmlRegExecPushString2: 4139 * @exec: a regexp execution context or NULL to indicate the end 4140 * @value: the first string token input 4141 * @value2: the second string token input 4142 * @data: data associated to the token to reuse in callbacks 4143 * 4144 * Push one input token in the execution context 4145 * 4146 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 4147 * a negative value in case of error. 4148 */ 4149 int 4150 xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value, 4151 const xmlChar *value2, void *data) { 4152 xmlChar buf[150]; 4153 int lenn, lenp, ret; 4154 xmlChar *str; 4155 4156 if (exec == NULL) 4157 return(-1); 4158 if (exec->comp == NULL) 4159 return(-1); 4160 if (exec->status != 0) 4161 return(exec->status); 4162 4163 if (value2 == NULL) 4164 return(xmlRegExecPushString(exec, value, data)); 4165 4166 lenn = strlen((char *) value2); 4167 lenp = strlen((char *) value); 4168 4169 if (150 < lenn + lenp + 2) { 4170 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 4171 if (str == NULL) { 4172 exec->status = -1; 4173 return(-1); 4174 } 4175 } else { 4176 str = buf; 4177 } 4178 memcpy(&str[0], value, lenp); 4179 str[lenp] = XML_REG_STRING_SEPARATOR; 4180 memcpy(&str[lenp + 1], value2, lenn); 4181 str[lenn + lenp + 1] = 0; 4182 4183 if (exec->comp->compact != NULL) 4184 ret = xmlRegCompactPushString(exec, exec->comp, str, data); 4185 else 4186 ret = xmlRegExecPushStringInternal(exec, str, data, 1); 4187 4188 if (str != buf) 4189 xmlFree(str); 4190 return(ret); 4191 } 4192 4193 /** 4194 * xmlRegExecGetValues: 4195 * @exec: a regexp execution context 4196 * @err: error extraction or normal one 4197 * @nbval: pointer to the number of accepted values IN/OUT 4198 * @nbneg: return number of negative transitions 4199 * @values: pointer to the array of acceptable values 4200 * @terminal: return value if this was a terminal state 4201 * 4202 * Extract informations from the regexp execution, internal routine to 4203 * implement xmlRegExecNextValues() and xmlRegExecErrInfo() 4204 * 4205 * Returns: 0 in case of success or -1 in case of error. 4206 */ 4207 static int 4208 xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err, 4209 int *nbval, int *nbneg, 4210 xmlChar **values, int *terminal) { 4211 int maxval; 4212 int nb = 0; 4213 4214 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) || 4215 (values == NULL) || (*nbval <= 0)) 4216 return(-1); 4217 4218 maxval = *nbval; 4219 *nbval = 0; 4220 *nbneg = 0; 4221 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) { 4222 xmlRegexpPtr comp; 4223 int target, i, state; 4224 4225 comp = exec->comp; 4226 4227 if (err) { 4228 if (exec->errStateNo == -1) return(-1); 4229 state = exec->errStateNo; 4230 } else { 4231 state = exec->index; 4232 } 4233 if (terminal != NULL) { 4234 if (comp->compact[state * (comp->nbstrings + 1)] == 4235 XML_REGEXP_FINAL_STATE) 4236 *terminal = 1; 4237 else 4238 *terminal = 0; 4239 } 4240 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4241 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4242 if ((target > 0) && (target <= comp->nbstates) && 4243 (comp->compact[(target - 1) * (comp->nbstrings + 1)] != 4244 XML_REGEXP_SINK_STATE)) { 4245 values[nb++] = comp->stringMap[i]; 4246 (*nbval)++; 4247 } 4248 } 4249 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4250 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4251 if ((target > 0) && (target <= comp->nbstates) && 4252 (comp->compact[(target - 1) * (comp->nbstrings + 1)] == 4253 XML_REGEXP_SINK_STATE)) { 4254 values[nb++] = comp->stringMap[i]; 4255 (*nbneg)++; 4256 } 4257 } 4258 } else { 4259 int transno; 4260 xmlRegTransPtr trans; 4261 xmlRegAtomPtr atom; 4262 xmlRegStatePtr state; 4263 4264 if (terminal != NULL) { 4265 if (exec->state->type == XML_REGEXP_FINAL_STATE) 4266 *terminal = 1; 4267 else 4268 *terminal = 0; 4269 } 4270 4271 if (err) { 4272 if (exec->errState == NULL) return(-1); 4273 state = exec->errState; 4274 } else { 4275 if (exec->state == NULL) return(-1); 4276 state = exec->state; 4277 } 4278 for (transno = 0; 4279 (transno < state->nbTrans) && (nb < maxval); 4280 transno++) { 4281 trans = &state->trans[transno]; 4282 if (trans->to < 0) 4283 continue; 4284 atom = trans->atom; 4285 if ((atom == NULL) || (atom->valuep == NULL)) 4286 continue; 4287 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4288 /* this should not be reached but ... */ 4289 TODO; 4290 } else if (trans->count == REGEXP_ALL_COUNTER) { 4291 /* this should not be reached but ... */ 4292 TODO; 4293 } else if (trans->counter >= 0) { 4294 xmlRegCounterPtr counter = NULL; 4295 int count; 4296 4297 if (err) 4298 count = exec->errCounts[trans->counter]; 4299 else 4300 count = exec->counts[trans->counter]; 4301 if (exec->comp != NULL) 4302 counter = &exec->comp->counters[trans->counter]; 4303 if ((counter == NULL) || (count < counter->max)) { 4304 if (atom->neg) 4305 values[nb++] = (xmlChar *) atom->valuep2; 4306 else 4307 values[nb++] = (xmlChar *) atom->valuep; 4308 (*nbval)++; 4309 } 4310 } else { 4311 if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) && 4312 (exec->comp->states[trans->to]->type != 4313 XML_REGEXP_SINK_STATE)) { 4314 if (atom->neg) 4315 values[nb++] = (xmlChar *) atom->valuep2; 4316 else 4317 values[nb++] = (xmlChar *) atom->valuep; 4318 (*nbval)++; 4319 } 4320 } 4321 } 4322 for (transno = 0; 4323 (transno < state->nbTrans) && (nb < maxval); 4324 transno++) { 4325 trans = &state->trans[transno]; 4326 if (trans->to < 0) 4327 continue; 4328 atom = trans->atom; 4329 if ((atom == NULL) || (atom->valuep == NULL)) 4330 continue; 4331 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4332 continue; 4333 } else if (trans->count == REGEXP_ALL_COUNTER) { 4334 continue; 4335 } else if (trans->counter >= 0) { 4336 continue; 4337 } else { 4338 if ((exec->comp->states[trans->to] != NULL) && 4339 (exec->comp->states[trans->to]->type == 4340 XML_REGEXP_SINK_STATE)) { 4341 if (atom->neg) 4342 values[nb++] = (xmlChar *) atom->valuep2; 4343 else 4344 values[nb++] = (xmlChar *) atom->valuep; 4345 (*nbneg)++; 4346 } 4347 } 4348 } 4349 } 4350 return(0); 4351 } 4352 4353 /** 4354 * xmlRegExecNextValues: 4355 * @exec: a regexp execution context 4356 * @nbval: pointer to the number of accepted values IN/OUT 4357 * @nbneg: return number of negative transitions 4358 * @values: pointer to the array of acceptable values 4359 * @terminal: return value if this was a terminal state 4360 * 4361 * Extract informations from the regexp execution, 4362 * the parameter @values must point to an array of @nbval string pointers 4363 * on return nbval will contain the number of possible strings in that 4364 * state and the @values array will be updated with them. The string values 4365 * returned will be freed with the @exec context and don't need to be 4366 * deallocated. 4367 * 4368 * Returns: 0 in case of success or -1 in case of error. 4369 */ 4370 int 4371 xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg, 4372 xmlChar **values, int *terminal) { 4373 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal)); 4374 } 4375 4376 /** 4377 * xmlRegExecErrInfo: 4378 * @exec: a regexp execution context generating an error 4379 * @string: return value for the error string 4380 * @nbval: pointer to the number of accepted values IN/OUT 4381 * @nbneg: return number of negative transitions 4382 * @values: pointer to the array of acceptable values 4383 * @terminal: return value if this was a terminal state 4384 * 4385 * Extract error informations from the regexp execution, the parameter 4386 * @string will be updated with the value pushed and not accepted, 4387 * the parameter @values must point to an array of @nbval string pointers 4388 * on return nbval will contain the number of possible strings in that 4389 * state and the @values array will be updated with them. The string values 4390 * returned will be freed with the @exec context and don't need to be 4391 * deallocated. 4392 * 4393 * Returns: 0 in case of success or -1 in case of error. 4394 */ 4395 int 4396 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string, 4397 int *nbval, int *nbneg, xmlChar **values, int *terminal) { 4398 if (exec == NULL) 4399 return(-1); 4400 if (string != NULL) { 4401 if (exec->status != 0) 4402 *string = exec->errString; 4403 else 4404 *string = NULL; 4405 } 4406 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal)); 4407 } 4408 4409 #ifdef DEBUG_ERR 4410 static void testerr(xmlRegExecCtxtPtr exec) { 4411 const xmlChar *string; 4412 xmlChar *values[5]; 4413 int nb = 5; 4414 int nbneg; 4415 int terminal; 4416 xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal); 4417 } 4418 #endif 4419 4420 #if 0 4421 static int 4422 xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) { 4423 xmlRegTransPtr trans; 4424 xmlRegAtomPtr atom; 4425 int ret; 4426 int codepoint, len; 4427 4428 if (exec == NULL) 4429 return(-1); 4430 if (exec->status != 0) 4431 return(exec->status); 4432 4433 while ((exec->status == 0) && 4434 ((exec->inputString[exec->index] != 0) || 4435 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 4436 4437 /* 4438 * End of input on non-terminal state, rollback, however we may 4439 * still have epsilon like transition for counted transitions 4440 * on counters, in that case don't break too early. 4441 */ 4442 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 4443 goto rollback; 4444 4445 exec->transcount = 0; 4446 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 4447 trans = &exec->state->trans[exec->transno]; 4448 if (trans->to < 0) 4449 continue; 4450 atom = trans->atom; 4451 ret = 0; 4452 if (trans->count >= 0) { 4453 int count; 4454 xmlRegCounterPtr counter; 4455 4456 /* 4457 * A counted transition. 4458 */ 4459 4460 count = exec->counts[trans->count]; 4461 counter = &exec->comp->counters[trans->count]; 4462 #ifdef DEBUG_REGEXP_EXEC 4463 printf("testing count %d: val %d, min %d, max %d\n", 4464 trans->count, count, counter->min, counter->max); 4465 #endif 4466 ret = ((count >= counter->min) && (count <= counter->max)); 4467 } else if (atom == NULL) { 4468 fprintf(stderr, "epsilon transition left at runtime\n"); 4469 exec->status = -2; 4470 break; 4471 } else if (exec->inputString[exec->index] != 0) { 4472 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 4473 ret = xmlRegCheckCharacter(atom, codepoint); 4474 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 4475 xmlRegStatePtr to = exec->comp->states[trans->to]; 4476 4477 /* 4478 * this is a multiple input sequence 4479 */ 4480 if (exec->state->nbTrans > exec->transno + 1) { 4481 xmlFARegExecSave(exec); 4482 } 4483 exec->transcount = 1; 4484 do { 4485 /* 4486 * Try to progress as much as possible on the input 4487 */ 4488 if (exec->transcount == atom->max) { 4489 break; 4490 } 4491 exec->index += len; 4492 /* 4493 * End of input: stop here 4494 */ 4495 if (exec->inputString[exec->index] == 0) { 4496 exec->index -= len; 4497 break; 4498 } 4499 if (exec->transcount >= atom->min) { 4500 int transno = exec->transno; 4501 xmlRegStatePtr state = exec->state; 4502 4503 /* 4504 * The transition is acceptable save it 4505 */ 4506 exec->transno = -1; /* trick */ 4507 exec->state = to; 4508 xmlFARegExecSave(exec); 4509 exec->transno = transno; 4510 exec->state = state; 4511 } 4512 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 4513 len); 4514 ret = xmlRegCheckCharacter(atom, codepoint); 4515 exec->transcount++; 4516 } while (ret == 1); 4517 if (exec->transcount < atom->min) 4518 ret = 0; 4519 4520 /* 4521 * If the last check failed but one transition was found 4522 * possible, rollback 4523 */ 4524 if (ret < 0) 4525 ret = 0; 4526 if (ret == 0) { 4527 goto rollback; 4528 } 4529 } 4530 } 4531 if (ret == 1) { 4532 if (exec->state->nbTrans > exec->transno + 1) { 4533 xmlFARegExecSave(exec); 4534 } 4535 /* 4536 * restart count for expressions like this ((abc){2})* 4537 */ 4538 if (trans->count >= 0) { 4539 #ifdef DEBUG_REGEXP_EXEC 4540 printf("Reset count %d\n", trans->count); 4541 #endif 4542 exec->counts[trans->count] = 0; 4543 } 4544 if (trans->counter >= 0) { 4545 #ifdef DEBUG_REGEXP_EXEC 4546 printf("Increasing count %d\n", trans->counter); 4547 #endif 4548 exec->counts[trans->counter]++; 4549 } 4550 #ifdef DEBUG_REGEXP_EXEC 4551 printf("entering state %d\n", trans->to); 4552 #endif 4553 exec->state = exec->comp->states[trans->to]; 4554 exec->transno = 0; 4555 if (trans->atom != NULL) { 4556 exec->index += len; 4557 } 4558 goto progress; 4559 } else if (ret < 0) { 4560 exec->status = -4; 4561 break; 4562 } 4563 } 4564 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4565 rollback: 4566 /* 4567 * Failed to find a way out 4568 */ 4569 exec->determinist = 0; 4570 xmlFARegExecRollBack(exec); 4571 } 4572 progress: 4573 continue; 4574 } 4575 } 4576 #endif 4577 /************************************************************************ 4578 * * 4579 * Parser for the Schemas Datatype Regular Expressions * 4580 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs * 4581 * * 4582 ************************************************************************/ 4583 4584 /** 4585 * xmlFAIsChar: 4586 * @ctxt: a regexp parser context 4587 * 4588 * [10] Char ::= [^.\?*+()|#x5B#x5D] 4589 */ 4590 static int 4591 xmlFAIsChar(xmlRegParserCtxtPtr ctxt) { 4592 int cur; 4593 int len; 4594 4595 cur = CUR_SCHAR(ctxt->cur, len); 4596 if ((cur == '.') || (cur == '\\') || (cur == '?') || 4597 (cur == '*') || (cur == '+') || (cur == '(') || 4598 (cur == ')') || (cur == '|') || (cur == 0x5B) || 4599 (cur == 0x5D) || (cur == 0)) 4600 return(-1); 4601 return(cur); 4602 } 4603 4604 /** 4605 * xmlFAParseCharProp: 4606 * @ctxt: a regexp parser context 4607 * 4608 * [27] charProp ::= IsCategory | IsBlock 4609 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation | 4610 * Separators | Symbols | Others 4611 * [29] Letters ::= 'L' [ultmo]? 4612 * [30] Marks ::= 'M' [nce]? 4613 * [31] Numbers ::= 'N' [dlo]? 4614 * [32] Punctuation ::= 'P' [cdseifo]? 4615 * [33] Separators ::= 'Z' [slp]? 4616 * [34] Symbols ::= 'S' [mcko]? 4617 * [35] Others ::= 'C' [cfon]? 4618 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+ 4619 */ 4620 static void 4621 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { 4622 int cur; 4623 xmlRegAtomType type = (xmlRegAtomType) 0; 4624 xmlChar *blockName = NULL; 4625 4626 cur = CUR; 4627 if (cur == 'L') { 4628 NEXT; 4629 cur = CUR; 4630 if (cur == 'u') { 4631 NEXT; 4632 type = XML_REGEXP_LETTER_UPPERCASE; 4633 } else if (cur == 'l') { 4634 NEXT; 4635 type = XML_REGEXP_LETTER_LOWERCASE; 4636 } else if (cur == 't') { 4637 NEXT; 4638 type = XML_REGEXP_LETTER_TITLECASE; 4639 } else if (cur == 'm') { 4640 NEXT; 4641 type = XML_REGEXP_LETTER_MODIFIER; 4642 } else if (cur == 'o') { 4643 NEXT; 4644 type = XML_REGEXP_LETTER_OTHERS; 4645 } else { 4646 type = XML_REGEXP_LETTER; 4647 } 4648 } else if (cur == 'M') { 4649 NEXT; 4650 cur = CUR; 4651 if (cur == 'n') { 4652 NEXT; 4653 /* nonspacing */ 4654 type = XML_REGEXP_MARK_NONSPACING; 4655 } else if (cur == 'c') { 4656 NEXT; 4657 /* spacing combining */ 4658 type = XML_REGEXP_MARK_SPACECOMBINING; 4659 } else if (cur == 'e') { 4660 NEXT; 4661 /* enclosing */ 4662 type = XML_REGEXP_MARK_ENCLOSING; 4663 } else { 4664 /* all marks */ 4665 type = XML_REGEXP_MARK; 4666 } 4667 } else if (cur == 'N') { 4668 NEXT; 4669 cur = CUR; 4670 if (cur == 'd') { 4671 NEXT; 4672 /* digital */ 4673 type = XML_REGEXP_NUMBER_DECIMAL; 4674 } else if (cur == 'l') { 4675 NEXT; 4676 /* letter */ 4677 type = XML_REGEXP_NUMBER_LETTER; 4678 } else if (cur == 'o') { 4679 NEXT; 4680 /* other */ 4681 type = XML_REGEXP_NUMBER_OTHERS; 4682 } else { 4683 /* all numbers */ 4684 type = XML_REGEXP_NUMBER; 4685 } 4686 } else if (cur == 'P') { 4687 NEXT; 4688 cur = CUR; 4689 if (cur == 'c') { 4690 NEXT; 4691 /* connector */ 4692 type = XML_REGEXP_PUNCT_CONNECTOR; 4693 } else if (cur == 'd') { 4694 NEXT; 4695 /* dash */ 4696 type = XML_REGEXP_PUNCT_DASH; 4697 } else if (cur == 's') { 4698 NEXT; 4699 /* open */ 4700 type = XML_REGEXP_PUNCT_OPEN; 4701 } else if (cur == 'e') { 4702 NEXT; 4703 /* close */ 4704 type = XML_REGEXP_PUNCT_CLOSE; 4705 } else if (cur == 'i') { 4706 NEXT; 4707 /* initial quote */ 4708 type = XML_REGEXP_PUNCT_INITQUOTE; 4709 } else if (cur == 'f') { 4710 NEXT; 4711 /* final quote */ 4712 type = XML_REGEXP_PUNCT_FINQUOTE; 4713 } else if (cur == 'o') { 4714 NEXT; 4715 /* other */ 4716 type = XML_REGEXP_PUNCT_OTHERS; 4717 } else { 4718 /* all punctuation */ 4719 type = XML_REGEXP_PUNCT; 4720 } 4721 } else if (cur == 'Z') { 4722 NEXT; 4723 cur = CUR; 4724 if (cur == 's') { 4725 NEXT; 4726 /* space */ 4727 type = XML_REGEXP_SEPAR_SPACE; 4728 } else if (cur == 'l') { 4729 NEXT; 4730 /* line */ 4731 type = XML_REGEXP_SEPAR_LINE; 4732 } else if (cur == 'p') { 4733 NEXT; 4734 /* paragraph */ 4735 type = XML_REGEXP_SEPAR_PARA; 4736 } else { 4737 /* all separators */ 4738 type = XML_REGEXP_SEPAR; 4739 } 4740 } else if (cur == 'S') { 4741 NEXT; 4742 cur = CUR; 4743 if (cur == 'm') { 4744 NEXT; 4745 type = XML_REGEXP_SYMBOL_MATH; 4746 /* math */ 4747 } else if (cur == 'c') { 4748 NEXT; 4749 type = XML_REGEXP_SYMBOL_CURRENCY; 4750 /* currency */ 4751 } else if (cur == 'k') { 4752 NEXT; 4753 type = XML_REGEXP_SYMBOL_MODIFIER; 4754 /* modifiers */ 4755 } else if (cur == 'o') { 4756 NEXT; 4757 type = XML_REGEXP_SYMBOL_OTHERS; 4758 /* other */ 4759 } else { 4760 /* all symbols */ 4761 type = XML_REGEXP_SYMBOL; 4762 } 4763 } else if (cur == 'C') { 4764 NEXT; 4765 cur = CUR; 4766 if (cur == 'c') { 4767 NEXT; 4768 /* control */ 4769 type = XML_REGEXP_OTHER_CONTROL; 4770 } else if (cur == 'f') { 4771 NEXT; 4772 /* format */ 4773 type = XML_REGEXP_OTHER_FORMAT; 4774 } else if (cur == 'o') { 4775 NEXT; 4776 /* private use */ 4777 type = XML_REGEXP_OTHER_PRIVATE; 4778 } else if (cur == 'n') { 4779 NEXT; 4780 /* not assigned */ 4781 type = XML_REGEXP_OTHER_NA; 4782 } else { 4783 /* all others */ 4784 type = XML_REGEXP_OTHER; 4785 } 4786 } else if (cur == 'I') { 4787 const xmlChar *start; 4788 NEXT; 4789 cur = CUR; 4790 if (cur != 's') { 4791 ERROR("IsXXXX expected"); 4792 return; 4793 } 4794 NEXT; 4795 start = ctxt->cur; 4796 cur = CUR; 4797 if (((cur >= 'a') && (cur <= 'z')) || 4798 ((cur >= 'A') && (cur <= 'Z')) || 4799 ((cur >= '0') && (cur <= '9')) || 4800 (cur == 0x2D)) { 4801 NEXT; 4802 cur = CUR; 4803 while (((cur >= 'a') && (cur <= 'z')) || 4804 ((cur >= 'A') && (cur <= 'Z')) || 4805 ((cur >= '0') && (cur <= '9')) || 4806 (cur == 0x2D)) { 4807 NEXT; 4808 cur = CUR; 4809 } 4810 } 4811 type = XML_REGEXP_BLOCK_NAME; 4812 blockName = xmlStrndup(start, ctxt->cur - start); 4813 } else { 4814 ERROR("Unknown char property"); 4815 return; 4816 } 4817 if (ctxt->atom == NULL) { 4818 ctxt->atom = xmlRegNewAtom(ctxt, type); 4819 if (ctxt->atom != NULL) 4820 ctxt->atom->valuep = blockName; 4821 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4822 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4823 type, 0, 0, blockName); 4824 } 4825 } 4826 4827 /** 4828 * xmlFAParseCharClassEsc: 4829 * @ctxt: a regexp parser context 4830 * 4831 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc ) 4832 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] 4833 * [25] catEsc ::= '\p{' charProp '}' 4834 * [26] complEsc ::= '\P{' charProp '}' 4835 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW]) 4836 */ 4837 static void 4838 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { 4839 int cur; 4840 4841 if (CUR == '.') { 4842 if (ctxt->atom == NULL) { 4843 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR); 4844 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4845 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4846 XML_REGEXP_ANYCHAR, 0, 0, NULL); 4847 } 4848 NEXT; 4849 return; 4850 } 4851 if (CUR != '\\') { 4852 ERROR("Escaped sequence: expecting \\"); 4853 return; 4854 } 4855 NEXT; 4856 cur = CUR; 4857 if (cur == 'p') { 4858 NEXT; 4859 if (CUR != '{') { 4860 ERROR("Expecting '{'"); 4861 return; 4862 } 4863 NEXT; 4864 xmlFAParseCharProp(ctxt); 4865 if (CUR != '}') { 4866 ERROR("Expecting '}'"); 4867 return; 4868 } 4869 NEXT; 4870 } else if (cur == 'P') { 4871 NEXT; 4872 if (CUR != '{') { 4873 ERROR("Expecting '{'"); 4874 return; 4875 } 4876 NEXT; 4877 xmlFAParseCharProp(ctxt); 4878 ctxt->atom->neg = 1; 4879 if (CUR != '}') { 4880 ERROR("Expecting '}'"); 4881 return; 4882 } 4883 NEXT; 4884 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') || 4885 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') || 4886 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') || 4887 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) || 4888 (cur == 0x5E)) { 4889 if (ctxt->atom == NULL) { 4890 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 4891 if (ctxt->atom != NULL) { 4892 switch (cur) { 4893 case 'n': 4894 ctxt->atom->codepoint = '\n'; 4895 break; 4896 case 'r': 4897 ctxt->atom->codepoint = '\r'; 4898 break; 4899 case 't': 4900 ctxt->atom->codepoint = '\t'; 4901 break; 4902 default: 4903 ctxt->atom->codepoint = cur; 4904 } 4905 } 4906 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4907 switch (cur) { 4908 case 'n': 4909 cur = '\n'; 4910 break; 4911 case 'r': 4912 cur = '\r'; 4913 break; 4914 case 't': 4915 cur = '\t'; 4916 break; 4917 } 4918 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4919 XML_REGEXP_CHARVAL, cur, cur, NULL); 4920 } 4921 NEXT; 4922 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') || 4923 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') || 4924 (cur == 'w') || (cur == 'W')) { 4925 xmlRegAtomType type = XML_REGEXP_ANYSPACE; 4926 4927 switch (cur) { 4928 case 's': 4929 type = XML_REGEXP_ANYSPACE; 4930 break; 4931 case 'S': 4932 type = XML_REGEXP_NOTSPACE; 4933 break; 4934 case 'i': 4935 type = XML_REGEXP_INITNAME; 4936 break; 4937 case 'I': 4938 type = XML_REGEXP_NOTINITNAME; 4939 break; 4940 case 'c': 4941 type = XML_REGEXP_NAMECHAR; 4942 break; 4943 case 'C': 4944 type = XML_REGEXP_NOTNAMECHAR; 4945 break; 4946 case 'd': 4947 type = XML_REGEXP_DECIMAL; 4948 break; 4949 case 'D': 4950 type = XML_REGEXP_NOTDECIMAL; 4951 break; 4952 case 'w': 4953 type = XML_REGEXP_REALCHAR; 4954 break; 4955 case 'W': 4956 type = XML_REGEXP_NOTREALCHAR; 4957 break; 4958 } 4959 NEXT; 4960 if (ctxt->atom == NULL) { 4961 ctxt->atom = xmlRegNewAtom(ctxt, type); 4962 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4963 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4964 type, 0, 0, NULL); 4965 } 4966 } else { 4967 ERROR("Wrong escape sequence, misuse of character '\\'"); 4968 } 4969 } 4970 4971 /** 4972 * xmlFAParseCharRange: 4973 * @ctxt: a regexp parser context 4974 * 4975 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash 4976 * [18] seRange ::= charOrEsc '-' charOrEsc 4977 * [20] charOrEsc ::= XmlChar | SingleCharEsc 4978 * [21] XmlChar ::= [^\#x2D#x5B#x5D] 4979 * [22] XmlCharIncDash ::= [^\#x5B#x5D] 4980 */ 4981 static void 4982 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { 4983 int cur, len; 4984 int start = -1; 4985 int end = -1; 4986 4987 if (CUR == '\0') { 4988 ERROR("Expecting ']'"); 4989 return; 4990 } 4991 4992 cur = CUR; 4993 if (cur == '\\') { 4994 NEXT; 4995 cur = CUR; 4996 switch (cur) { 4997 case 'n': start = 0xA; break; 4998 case 'r': start = 0xD; break; 4999 case 't': start = 0x9; break; 5000 case '\\': case '|': case '.': case '-': case '^': case '?': 5001 case '*': case '+': case '{': case '}': case '(': case ')': 5002 case '[': case ']': 5003 start = cur; break; 5004 default: 5005 ERROR("Invalid escape value"); 5006 return; 5007 } 5008 end = start; 5009 len = 1; 5010 } else if ((cur != 0x5B) && (cur != 0x5D)) { 5011 end = start = CUR_SCHAR(ctxt->cur, len); 5012 } else { 5013 ERROR("Expecting a char range"); 5014 return; 5015 } 5016 /* 5017 * Since we are "inside" a range, we can assume ctxt->cur is past 5018 * the start of ctxt->string, and PREV should be safe 5019 */ 5020 if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) { 5021 NEXTL(len); 5022 return; 5023 } 5024 NEXTL(len); 5025 cur = CUR; 5026 if ((cur != '-') || (NXT(1) == ']')) { 5027 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 5028 XML_REGEXP_CHARVAL, start, end, NULL); 5029 return; 5030 } 5031 NEXT; 5032 cur = CUR; 5033 if (cur == '\\') { 5034 NEXT; 5035 cur = CUR; 5036 switch (cur) { 5037 case 'n': end = 0xA; break; 5038 case 'r': end = 0xD; break; 5039 case 't': end = 0x9; break; 5040 case '\\': case '|': case '.': case '-': case '^': case '?': 5041 case '*': case '+': case '{': case '}': case '(': case ')': 5042 case '[': case ']': 5043 end = cur; break; 5044 default: 5045 ERROR("Invalid escape value"); 5046 return; 5047 } 5048 len = 1; 5049 } else if ((cur != 0x5B) && (cur != 0x5D)) { 5050 end = CUR_SCHAR(ctxt->cur, len); 5051 } else { 5052 ERROR("Expecting the end of a char range"); 5053 return; 5054 } 5055 NEXTL(len); 5056 /* TODO check that the values are acceptable character ranges for XML */ 5057 if (end < start) { 5058 ERROR("End of range is before start of range"); 5059 } else { 5060 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 5061 XML_REGEXP_CHARVAL, start, end, NULL); 5062 } 5063 return; 5064 } 5065 5066 /** 5067 * xmlFAParsePosCharGroup: 5068 * @ctxt: a regexp parser context 5069 * 5070 * [14] posCharGroup ::= ( charRange | charClassEsc )+ 5071 */ 5072 static void 5073 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) { 5074 do { 5075 if (CUR == '\\') { 5076 xmlFAParseCharClassEsc(ctxt); 5077 } else { 5078 xmlFAParseCharRange(ctxt); 5079 } 5080 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') && 5081 (CUR != 0) && (ctxt->error == 0)); 5082 } 5083 5084 /** 5085 * xmlFAParseCharGroup: 5086 * @ctxt: a regexp parser context 5087 * 5088 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub 5089 * [15] negCharGroup ::= '^' posCharGroup 5090 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr 5091 * [12] charClassExpr ::= '[' charGroup ']' 5092 */ 5093 static void 5094 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) { 5095 int n = ctxt->neg; 5096 while ((CUR != ']') && (ctxt->error == 0)) { 5097 if (CUR == '^') { 5098 int neg = ctxt->neg; 5099 5100 NEXT; 5101 ctxt->neg = !ctxt->neg; 5102 xmlFAParsePosCharGroup(ctxt); 5103 ctxt->neg = neg; 5104 } else if ((CUR == '-') && (NXT(1) == '[')) { 5105 int neg = ctxt->neg; 5106 ctxt->neg = 2; 5107 NEXT; /* eat the '-' */ 5108 NEXT; /* eat the '[' */ 5109 xmlFAParseCharGroup(ctxt); 5110 if (CUR == ']') { 5111 NEXT; 5112 } else { 5113 ERROR("charClassExpr: ']' expected"); 5114 break; 5115 } 5116 ctxt->neg = neg; 5117 break; 5118 } else if (CUR != ']') { 5119 xmlFAParsePosCharGroup(ctxt); 5120 } 5121 } 5122 ctxt->neg = n; 5123 } 5124 5125 /** 5126 * xmlFAParseCharClass: 5127 * @ctxt: a regexp parser context 5128 * 5129 * [11] charClass ::= charClassEsc | charClassExpr 5130 * [12] charClassExpr ::= '[' charGroup ']' 5131 */ 5132 static void 5133 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) { 5134 if (CUR == '[') { 5135 NEXT; 5136 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES); 5137 if (ctxt->atom == NULL) 5138 return; 5139 xmlFAParseCharGroup(ctxt); 5140 if (CUR == ']') { 5141 NEXT; 5142 } else { 5143 ERROR("xmlFAParseCharClass: ']' expected"); 5144 } 5145 } else { 5146 xmlFAParseCharClassEsc(ctxt); 5147 } 5148 } 5149 5150 /** 5151 * xmlFAParseQuantExact: 5152 * @ctxt: a regexp parser context 5153 * 5154 * [8] QuantExact ::= [0-9]+ 5155 * 5156 * Returns 0 if success or -1 in case of error 5157 */ 5158 static int 5159 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) { 5160 int ret = 0; 5161 int ok = 0; 5162 5163 while ((CUR >= '0') && (CUR <= '9')) { 5164 ret = ret * 10 + (CUR - '0'); 5165 ok = 1; 5166 NEXT; 5167 } 5168 if (ok != 1) { 5169 return(-1); 5170 } 5171 return(ret); 5172 } 5173 5174 /** 5175 * xmlFAParseQuantifier: 5176 * @ctxt: a regexp parser context 5177 * 5178 * [4] quantifier ::= [?*+] | ( '{' quantity '}' ) 5179 * [5] quantity ::= quantRange | quantMin | QuantExact 5180 * [6] quantRange ::= QuantExact ',' QuantExact 5181 * [7] quantMin ::= QuantExact ',' 5182 * [8] QuantExact ::= [0-9]+ 5183 */ 5184 static int 5185 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) { 5186 int cur; 5187 5188 cur = CUR; 5189 if ((cur == '?') || (cur == '*') || (cur == '+')) { 5190 if (ctxt->atom != NULL) { 5191 if (cur == '?') 5192 ctxt->atom->quant = XML_REGEXP_QUANT_OPT; 5193 else if (cur == '*') 5194 ctxt->atom->quant = XML_REGEXP_QUANT_MULT; 5195 else if (cur == '+') 5196 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS; 5197 } 5198 NEXT; 5199 return(1); 5200 } 5201 if (cur == '{') { 5202 int min = 0, max = 0; 5203 5204 NEXT; 5205 cur = xmlFAParseQuantExact(ctxt); 5206 if (cur >= 0) 5207 min = cur; 5208 if (CUR == ',') { 5209 NEXT; 5210 if (CUR == '}') 5211 max = INT_MAX; 5212 else { 5213 cur = xmlFAParseQuantExact(ctxt); 5214 if (cur >= 0) 5215 max = cur; 5216 else { 5217 ERROR("Improper quantifier"); 5218 } 5219 } 5220 } 5221 if (CUR == '}') { 5222 NEXT; 5223 } else { 5224 ERROR("Unterminated quantifier"); 5225 } 5226 if (max == 0) 5227 max = min; 5228 if (ctxt->atom != NULL) { 5229 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE; 5230 ctxt->atom->min = min; 5231 ctxt->atom->max = max; 5232 } 5233 return(1); 5234 } 5235 return(0); 5236 } 5237 5238 /** 5239 * xmlFAParseAtom: 5240 * @ctxt: a regexp parser context 5241 * 5242 * [9] atom ::= Char | charClass | ( '(' regExp ')' ) 5243 */ 5244 static int 5245 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) { 5246 int codepoint, len; 5247 5248 codepoint = xmlFAIsChar(ctxt); 5249 if (codepoint > 0) { 5250 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 5251 if (ctxt->atom == NULL) 5252 return(-1); 5253 codepoint = CUR_SCHAR(ctxt->cur, len); 5254 ctxt->atom->codepoint = codepoint; 5255 NEXTL(len); 5256 return(1); 5257 } else if (CUR == '|') { 5258 return(0); 5259 } else if (CUR == 0) { 5260 return(0); 5261 } else if (CUR == ')') { 5262 return(0); 5263 } else if (CUR == '(') { 5264 xmlRegStatePtr start, oldend, start0; 5265 5266 NEXT; 5267 /* 5268 * this extra Epsilon transition is needed if we count with 0 allowed 5269 * unfortunately this can't be known at that point 5270 */ 5271 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5272 start0 = ctxt->state; 5273 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5274 start = ctxt->state; 5275 oldend = ctxt->end; 5276 ctxt->end = NULL; 5277 ctxt->atom = NULL; 5278 xmlFAParseRegExp(ctxt, 0); 5279 if (CUR == ')') { 5280 NEXT; 5281 } else { 5282 ERROR("xmlFAParseAtom: expecting ')'"); 5283 } 5284 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG); 5285 if (ctxt->atom == NULL) 5286 return(-1); 5287 ctxt->atom->start = start; 5288 ctxt->atom->start0 = start0; 5289 ctxt->atom->stop = ctxt->state; 5290 ctxt->end = oldend; 5291 return(1); 5292 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) { 5293 xmlFAParseCharClass(ctxt); 5294 return(1); 5295 } 5296 return(0); 5297 } 5298 5299 /** 5300 * xmlFAParsePiece: 5301 * @ctxt: a regexp parser context 5302 * 5303 * [3] piece ::= atom quantifier? 5304 */ 5305 static int 5306 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) { 5307 int ret; 5308 5309 ctxt->atom = NULL; 5310 ret = xmlFAParseAtom(ctxt); 5311 if (ret == 0) 5312 return(0); 5313 if (ctxt->atom == NULL) { 5314 ERROR("internal: no atom generated"); 5315 } 5316 xmlFAParseQuantifier(ctxt); 5317 return(1); 5318 } 5319 5320 /** 5321 * xmlFAParseBranch: 5322 * @ctxt: a regexp parser context 5323 * @to: optional target to the end of the branch 5324 * 5325 * @to is used to optimize by removing duplicate path in automata 5326 * in expressions like (a|b)(c|d) 5327 * 5328 * [2] branch ::= piece* 5329 */ 5330 static int 5331 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) { 5332 xmlRegStatePtr previous; 5333 int ret; 5334 5335 previous = ctxt->state; 5336 ret = xmlFAParsePiece(ctxt); 5337 if (ret != 0) { 5338 if (xmlFAGenerateTransitions(ctxt, previous, 5339 (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) 5340 return(-1); 5341 previous = ctxt->state; 5342 ctxt->atom = NULL; 5343 } 5344 while ((ret != 0) && (ctxt->error == 0)) { 5345 ret = xmlFAParsePiece(ctxt); 5346 if (ret != 0) { 5347 if (xmlFAGenerateTransitions(ctxt, previous, 5348 (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) 5349 return(-1); 5350 previous = ctxt->state; 5351 ctxt->atom = NULL; 5352 } 5353 } 5354 return(0); 5355 } 5356 5357 /** 5358 * xmlFAParseRegExp: 5359 * @ctxt: a regexp parser context 5360 * @top: is this the top-level expression ? 5361 * 5362 * [1] regExp ::= branch ( '|' branch )* 5363 */ 5364 static void 5365 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { 5366 xmlRegStatePtr start, end; 5367 5368 /* if not top start should have been generated by an epsilon trans */ 5369 start = ctxt->state; 5370 ctxt->end = NULL; 5371 xmlFAParseBranch(ctxt, NULL); 5372 if (top) { 5373 #ifdef DEBUG_REGEXP_GRAPH 5374 printf("State %d is final\n", ctxt->state->no); 5375 #endif 5376 ctxt->state->type = XML_REGEXP_FINAL_STATE; 5377 } 5378 if (CUR != '|') { 5379 ctxt->end = ctxt->state; 5380 return; 5381 } 5382 end = ctxt->state; 5383 while ((CUR == '|') && (ctxt->error == 0)) { 5384 NEXT; 5385 if (CUR == 0) { 5386 ERROR("expecting a branch after |") 5387 return; 5388 } 5389 ctxt->state = start; 5390 ctxt->end = NULL; 5391 xmlFAParseBranch(ctxt, end); 5392 } 5393 if (!top) { 5394 ctxt->state = end; 5395 ctxt->end = end; 5396 } 5397 } 5398 5399 /************************************************************************ 5400 * * 5401 * The basic API * 5402 * * 5403 ************************************************************************/ 5404 5405 /** 5406 * xmlRegexpPrint: 5407 * @output: the file for the output debug 5408 * @regexp: the compiled regexp 5409 * 5410 * Print the content of the compiled regular expression 5411 */ 5412 void 5413 xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) { 5414 int i; 5415 5416 if (output == NULL) 5417 return; 5418 fprintf(output, " regexp: "); 5419 if (regexp == NULL) { 5420 fprintf(output, "NULL\n"); 5421 return; 5422 } 5423 fprintf(output, "'%s' ", regexp->string); 5424 fprintf(output, "\n"); 5425 fprintf(output, "%d atoms:\n", regexp->nbAtoms); 5426 for (i = 0;i < regexp->nbAtoms; i++) { 5427 fprintf(output, " %02d ", i); 5428 xmlRegPrintAtom(output, regexp->atoms[i]); 5429 } 5430 fprintf(output, "%d states:", regexp->nbStates); 5431 fprintf(output, "\n"); 5432 for (i = 0;i < regexp->nbStates; i++) { 5433 xmlRegPrintState(output, regexp->states[i]); 5434 } 5435 fprintf(output, "%d counters:\n", regexp->nbCounters); 5436 for (i = 0;i < regexp->nbCounters; i++) { 5437 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min, 5438 regexp->counters[i].max); 5439 } 5440 } 5441 5442 /** 5443 * xmlRegexpCompile: 5444 * @regexp: a regular expression string 5445 * 5446 * Parses a regular expression conforming to XML Schemas Part 2 Datatype 5447 * Appendix F and builds an automata suitable for testing strings against 5448 * that regular expression 5449 * 5450 * Returns the compiled expression or NULL in case of error 5451 */ 5452 xmlRegexpPtr 5453 xmlRegexpCompile(const xmlChar *regexp) { 5454 xmlRegexpPtr ret; 5455 xmlRegParserCtxtPtr ctxt; 5456 5457 ctxt = xmlRegNewParserCtxt(regexp); 5458 if (ctxt == NULL) 5459 return(NULL); 5460 5461 /* initialize the parser */ 5462 ctxt->end = NULL; 5463 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5464 xmlRegStatePush(ctxt, ctxt->start); 5465 5466 /* parse the expression building an automata */ 5467 xmlFAParseRegExp(ctxt, 1); 5468 if (CUR != 0) { 5469 ERROR("xmlFAParseRegExp: extra characters"); 5470 } 5471 if (ctxt->error != 0) { 5472 xmlRegFreeParserCtxt(ctxt); 5473 return(NULL); 5474 } 5475 ctxt->end = ctxt->state; 5476 ctxt->start->type = XML_REGEXP_START_STATE; 5477 ctxt->end->type = XML_REGEXP_FINAL_STATE; 5478 5479 /* remove the Epsilon except for counted transitions */ 5480 xmlFAEliminateEpsilonTransitions(ctxt); 5481 5482 5483 if (ctxt->error != 0) { 5484 xmlRegFreeParserCtxt(ctxt); 5485 return(NULL); 5486 } 5487 ret = xmlRegEpxFromParse(ctxt); 5488 xmlRegFreeParserCtxt(ctxt); 5489 return(ret); 5490 } 5491 5492 /** 5493 * xmlRegexpExec: 5494 * @comp: the compiled regular expression 5495 * @content: the value to check against the regular expression 5496 * 5497 * Check if the regular expression generates the value 5498 * 5499 * Returns 1 if it matches, 0 if not and a negative value in case of error 5500 */ 5501 int 5502 xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) { 5503 if ((comp == NULL) || (content == NULL)) 5504 return(-1); 5505 return(xmlFARegExec(comp, content)); 5506 } 5507 5508 /** 5509 * xmlRegexpIsDeterminist: 5510 * @comp: the compiled regular expression 5511 * 5512 * Check if the regular expression is determinist 5513 * 5514 * Returns 1 if it yes, 0 if not and a negative value in case of error 5515 */ 5516 int 5517 xmlRegexpIsDeterminist(xmlRegexpPtr comp) { 5518 xmlAutomataPtr am; 5519 int ret; 5520 5521 if (comp == NULL) 5522 return(-1); 5523 if (comp->determinist != -1) 5524 return(comp->determinist); 5525 5526 am = xmlNewAutomata(); 5527 if (am->states != NULL) { 5528 int i; 5529 5530 for (i = 0;i < am->nbStates;i++) 5531 xmlRegFreeState(am->states[i]); 5532 xmlFree(am->states); 5533 } 5534 am->nbAtoms = comp->nbAtoms; 5535 am->atoms = comp->atoms; 5536 am->nbStates = comp->nbStates; 5537 am->states = comp->states; 5538 am->determinist = -1; 5539 am->flags = comp->flags; 5540 ret = xmlFAComputesDeterminism(am); 5541 am->atoms = NULL; 5542 am->states = NULL; 5543 xmlFreeAutomata(am); 5544 comp->determinist = ret; 5545 return(ret); 5546 } 5547 5548 /** 5549 * xmlRegFreeRegexp: 5550 * @regexp: the regexp 5551 * 5552 * Free a regexp 5553 */ 5554 void 5555 xmlRegFreeRegexp(xmlRegexpPtr regexp) { 5556 int i; 5557 if (regexp == NULL) 5558 return; 5559 5560 if (regexp->string != NULL) 5561 xmlFree(regexp->string); 5562 if (regexp->states != NULL) { 5563 for (i = 0;i < regexp->nbStates;i++) 5564 xmlRegFreeState(regexp->states[i]); 5565 xmlFree(regexp->states); 5566 } 5567 if (regexp->atoms != NULL) { 5568 for (i = 0;i < regexp->nbAtoms;i++) 5569 xmlRegFreeAtom(regexp->atoms[i]); 5570 xmlFree(regexp->atoms); 5571 } 5572 if (regexp->counters != NULL) 5573 xmlFree(regexp->counters); 5574 if (regexp->compact != NULL) 5575 xmlFree(regexp->compact); 5576 if (regexp->transdata != NULL) 5577 xmlFree(regexp->transdata); 5578 if (regexp->stringMap != NULL) { 5579 for (i = 0; i < regexp->nbstrings;i++) 5580 xmlFree(regexp->stringMap[i]); 5581 xmlFree(regexp->stringMap); 5582 } 5583 5584 xmlFree(regexp); 5585 } 5586 5587 #ifdef LIBXML_AUTOMATA_ENABLED 5588 /************************************************************************ 5589 * * 5590 * The Automata interface * 5591 * * 5592 ************************************************************************/ 5593 5594 /** 5595 * xmlNewAutomata: 5596 * 5597 * Create a new automata 5598 * 5599 * Returns the new object or NULL in case of failure 5600 */ 5601 xmlAutomataPtr 5602 xmlNewAutomata(void) { 5603 xmlAutomataPtr ctxt; 5604 5605 ctxt = xmlRegNewParserCtxt(NULL); 5606 if (ctxt == NULL) 5607 return(NULL); 5608 5609 /* initialize the parser */ 5610 ctxt->end = NULL; 5611 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5612 if (ctxt->start == NULL) { 5613 xmlFreeAutomata(ctxt); 5614 return(NULL); 5615 } 5616 ctxt->start->type = XML_REGEXP_START_STATE; 5617 if (xmlRegStatePush(ctxt, ctxt->start) < 0) { 5618 xmlRegFreeState(ctxt->start); 5619 xmlFreeAutomata(ctxt); 5620 return(NULL); 5621 } 5622 ctxt->flags = 0; 5623 5624 return(ctxt); 5625 } 5626 5627 /** 5628 * xmlFreeAutomata: 5629 * @am: an automata 5630 * 5631 * Free an automata 5632 */ 5633 void 5634 xmlFreeAutomata(xmlAutomataPtr am) { 5635 if (am == NULL) 5636 return; 5637 xmlRegFreeParserCtxt(am); 5638 } 5639 5640 /** 5641 * xmlAutomataSetFlags: 5642 * @am: an automata 5643 * @flags: a set of internal flags 5644 * 5645 * Set some flags on the automata 5646 */ 5647 void 5648 xmlAutomataSetFlags(xmlAutomataPtr am, int flags) { 5649 if (am == NULL) 5650 return; 5651 am->flags |= flags; 5652 } 5653 5654 /** 5655 * xmlAutomataGetInitState: 5656 * @am: an automata 5657 * 5658 * Initial state lookup 5659 * 5660 * Returns the initial state of the automata 5661 */ 5662 xmlAutomataStatePtr 5663 xmlAutomataGetInitState(xmlAutomataPtr am) { 5664 if (am == NULL) 5665 return(NULL); 5666 return(am->start); 5667 } 5668 5669 /** 5670 * xmlAutomataSetFinalState: 5671 * @am: an automata 5672 * @state: a state in this automata 5673 * 5674 * Makes that state a final state 5675 * 5676 * Returns 0 or -1 in case of error 5677 */ 5678 int 5679 xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) { 5680 if ((am == NULL) || (state == NULL)) 5681 return(-1); 5682 state->type = XML_REGEXP_FINAL_STATE; 5683 return(0); 5684 } 5685 5686 /** 5687 * xmlAutomataNewTransition: 5688 * @am: an automata 5689 * @from: the starting point of the transition 5690 * @to: the target point of the transition or NULL 5691 * @token: the input string associated to that transition 5692 * @data: data passed to the callback function if the transition is activated 5693 * 5694 * If @to is NULL, this creates first a new target state in the automata 5695 * and then adds a transition from the @from state to the target state 5696 * activated by the value of @token 5697 * 5698 * Returns the target state or NULL in case of error 5699 */ 5700 xmlAutomataStatePtr 5701 xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from, 5702 xmlAutomataStatePtr to, const xmlChar *token, 5703 void *data) { 5704 xmlRegAtomPtr atom; 5705 5706 if ((am == NULL) || (from == NULL) || (token == NULL)) 5707 return(NULL); 5708 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5709 if (atom == NULL) 5710 return(NULL); 5711 atom->data = data; 5712 atom->valuep = xmlStrdup(token); 5713 5714 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5715 xmlRegFreeAtom(atom); 5716 return(NULL); 5717 } 5718 if (to == NULL) 5719 return(am->state); 5720 return(to); 5721 } 5722 5723 /** 5724 * xmlAutomataNewTransition2: 5725 * @am: an automata 5726 * @from: the starting point of the transition 5727 * @to: the target point of the transition or NULL 5728 * @token: the first input string associated to that transition 5729 * @token2: the second input string associated to that transition 5730 * @data: data passed to the callback function if the transition is activated 5731 * 5732 * If @to is NULL, this creates first a new target state in the automata 5733 * and then adds a transition from the @from state to the target state 5734 * activated by the value of @token 5735 * 5736 * Returns the target state or NULL in case of error 5737 */ 5738 xmlAutomataStatePtr 5739 xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5740 xmlAutomataStatePtr to, const xmlChar *token, 5741 const xmlChar *token2, void *data) { 5742 xmlRegAtomPtr atom; 5743 5744 if ((am == NULL) || (from == NULL) || (token == NULL)) 5745 return(NULL); 5746 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5747 if (atom == NULL) 5748 return(NULL); 5749 atom->data = data; 5750 if ((token2 == NULL) || (*token2 == 0)) { 5751 atom->valuep = xmlStrdup(token); 5752 } else { 5753 int lenn, lenp; 5754 xmlChar *str; 5755 5756 lenn = strlen((char *) token2); 5757 lenp = strlen((char *) token); 5758 5759 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5760 if (str == NULL) { 5761 xmlRegFreeAtom(atom); 5762 return(NULL); 5763 } 5764 memcpy(&str[0], token, lenp); 5765 str[lenp] = '|'; 5766 memcpy(&str[lenp + 1], token2, lenn); 5767 str[lenn + lenp + 1] = 0; 5768 5769 atom->valuep = str; 5770 } 5771 5772 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5773 xmlRegFreeAtom(atom); 5774 return(NULL); 5775 } 5776 if (to == NULL) 5777 return(am->state); 5778 return(to); 5779 } 5780 5781 /** 5782 * xmlAutomataNewNegTrans: 5783 * @am: an automata 5784 * @from: the starting point of the transition 5785 * @to: the target point of the transition or NULL 5786 * @token: the first input string associated to that transition 5787 * @token2: the second input string associated to that transition 5788 * @data: data passed to the callback function if the transition is activated 5789 * 5790 * If @to is NULL, this creates first a new target state in the automata 5791 * and then adds a transition from the @from state to the target state 5792 * activated by any value except (@token,@token2) 5793 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow 5794 # the semantic of XSD ##other 5795 * 5796 * Returns the target state or NULL in case of error 5797 */ 5798 xmlAutomataStatePtr 5799 xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5800 xmlAutomataStatePtr to, const xmlChar *token, 5801 const xmlChar *token2, void *data) { 5802 xmlRegAtomPtr atom; 5803 xmlChar err_msg[200]; 5804 5805 if ((am == NULL) || (from == NULL) || (token == NULL)) 5806 return(NULL); 5807 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5808 if (atom == NULL) 5809 return(NULL); 5810 atom->data = data; 5811 atom->neg = 1; 5812 if ((token2 == NULL) || (*token2 == 0)) { 5813 atom->valuep = xmlStrdup(token); 5814 } else { 5815 int lenn, lenp; 5816 xmlChar *str; 5817 5818 lenn = strlen((char *) token2); 5819 lenp = strlen((char *) token); 5820 5821 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5822 if (str == NULL) { 5823 xmlRegFreeAtom(atom); 5824 return(NULL); 5825 } 5826 memcpy(&str[0], token, lenp); 5827 str[lenp] = '|'; 5828 memcpy(&str[lenp + 1], token2, lenn); 5829 str[lenn + lenp + 1] = 0; 5830 5831 atom->valuep = str; 5832 } 5833 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep); 5834 err_msg[199] = 0; 5835 atom->valuep2 = xmlStrdup(err_msg); 5836 5837 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5838 xmlRegFreeAtom(atom); 5839 return(NULL); 5840 } 5841 am->negs++; 5842 if (to == NULL) 5843 return(am->state); 5844 return(to); 5845 } 5846 5847 /** 5848 * xmlAutomataNewCountTrans2: 5849 * @am: an automata 5850 * @from: the starting point of the transition 5851 * @to: the target point of the transition or NULL 5852 * @token: the input string associated to that transition 5853 * @token2: the second input string associated to that transition 5854 * @min: the minimum successive occurences of token 5855 * @max: the maximum successive occurences of token 5856 * @data: data associated to the transition 5857 * 5858 * If @to is NULL, this creates first a new target state in the automata 5859 * and then adds a transition from the @from state to the target state 5860 * activated by a succession of input of value @token and @token2 and 5861 * whose number is between @min and @max 5862 * 5863 * Returns the target state or NULL in case of error 5864 */ 5865 xmlAutomataStatePtr 5866 xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5867 xmlAutomataStatePtr to, const xmlChar *token, 5868 const xmlChar *token2, 5869 int min, int max, void *data) { 5870 xmlRegAtomPtr atom; 5871 int counter; 5872 5873 if ((am == NULL) || (from == NULL) || (token == NULL)) 5874 return(NULL); 5875 if (min < 0) 5876 return(NULL); 5877 if ((max < min) || (max < 1)) 5878 return(NULL); 5879 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5880 if (atom == NULL) 5881 return(NULL); 5882 if ((token2 == NULL) || (*token2 == 0)) { 5883 atom->valuep = xmlStrdup(token); 5884 } else { 5885 int lenn, lenp; 5886 xmlChar *str; 5887 5888 lenn = strlen((char *) token2); 5889 lenp = strlen((char *) token); 5890 5891 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5892 if (str == NULL) { 5893 xmlRegFreeAtom(atom); 5894 return(NULL); 5895 } 5896 memcpy(&str[0], token, lenp); 5897 str[lenp] = '|'; 5898 memcpy(&str[lenp + 1], token2, lenn); 5899 str[lenn + lenp + 1] = 0; 5900 5901 atom->valuep = str; 5902 } 5903 atom->data = data; 5904 if (min == 0) 5905 atom->min = 1; 5906 else 5907 atom->min = min; 5908 atom->max = max; 5909 5910 /* 5911 * associate a counter to the transition. 5912 */ 5913 counter = xmlRegGetCounter(am); 5914 am->counters[counter].min = min; 5915 am->counters[counter].max = max; 5916 5917 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5918 if (to == NULL) { 5919 to = xmlRegNewState(am); 5920 xmlRegStatePush(am, to); 5921 } 5922 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5923 xmlRegAtomPush(am, atom); 5924 am->state = to; 5925 5926 if (to == NULL) 5927 to = am->state; 5928 if (to == NULL) 5929 return(NULL); 5930 if (min == 0) 5931 xmlFAGenerateEpsilonTransition(am, from, to); 5932 return(to); 5933 } 5934 5935 /** 5936 * xmlAutomataNewCountTrans: 5937 * @am: an automata 5938 * @from: the starting point of the transition 5939 * @to: the target point of the transition or NULL 5940 * @token: the input string associated to that transition 5941 * @min: the minimum successive occurences of token 5942 * @max: the maximum successive occurences of token 5943 * @data: data associated to the transition 5944 * 5945 * If @to is NULL, this creates first a new target state in the automata 5946 * and then adds a transition from the @from state to the target state 5947 * activated by a succession of input of value @token and whose number 5948 * is between @min and @max 5949 * 5950 * Returns the target state or NULL in case of error 5951 */ 5952 xmlAutomataStatePtr 5953 xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5954 xmlAutomataStatePtr to, const xmlChar *token, 5955 int min, int max, void *data) { 5956 xmlRegAtomPtr atom; 5957 int counter; 5958 5959 if ((am == NULL) || (from == NULL) || (token == NULL)) 5960 return(NULL); 5961 if (min < 0) 5962 return(NULL); 5963 if ((max < min) || (max < 1)) 5964 return(NULL); 5965 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5966 if (atom == NULL) 5967 return(NULL); 5968 atom->valuep = xmlStrdup(token); 5969 atom->data = data; 5970 if (min == 0) 5971 atom->min = 1; 5972 else 5973 atom->min = min; 5974 atom->max = max; 5975 5976 /* 5977 * associate a counter to the transition. 5978 */ 5979 counter = xmlRegGetCounter(am); 5980 am->counters[counter].min = min; 5981 am->counters[counter].max = max; 5982 5983 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5984 if (to == NULL) { 5985 to = xmlRegNewState(am); 5986 xmlRegStatePush(am, to); 5987 } 5988 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5989 xmlRegAtomPush(am, atom); 5990 am->state = to; 5991 5992 if (to == NULL) 5993 to = am->state; 5994 if (to == NULL) 5995 return(NULL); 5996 if (min == 0) 5997 xmlFAGenerateEpsilonTransition(am, from, to); 5998 return(to); 5999 } 6000 6001 /** 6002 * xmlAutomataNewOnceTrans2: 6003 * @am: an automata 6004 * @from: the starting point of the transition 6005 * @to: the target point of the transition or NULL 6006 * @token: the input string associated to that transition 6007 * @token2: the second input string associated to that transition 6008 * @min: the minimum successive occurences of token 6009 * @max: the maximum successive occurences of token 6010 * @data: data associated to the transition 6011 * 6012 * If @to is NULL, this creates first a new target state in the automata 6013 * and then adds a transition from the @from state to the target state 6014 * activated by a succession of input of value @token and @token2 and whose 6015 * number is between @min and @max, moreover that transition can only be 6016 * crossed once. 6017 * 6018 * Returns the target state or NULL in case of error 6019 */ 6020 xmlAutomataStatePtr 6021 xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 6022 xmlAutomataStatePtr to, const xmlChar *token, 6023 const xmlChar *token2, 6024 int min, int max, void *data) { 6025 xmlRegAtomPtr atom; 6026 int counter; 6027 6028 if ((am == NULL) || (from == NULL) || (token == NULL)) 6029 return(NULL); 6030 if (min < 1) 6031 return(NULL); 6032 if ((max < min) || (max < 1)) 6033 return(NULL); 6034 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 6035 if (atom == NULL) 6036 return(NULL); 6037 if ((token2 == NULL) || (*token2 == 0)) { 6038 atom->valuep = xmlStrdup(token); 6039 } else { 6040 int lenn, lenp; 6041 xmlChar *str; 6042 6043 lenn = strlen((char *) token2); 6044 lenp = strlen((char *) token); 6045 6046 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 6047 if (str == NULL) { 6048 xmlRegFreeAtom(atom); 6049 return(NULL); 6050 } 6051 memcpy(&str[0], token, lenp); 6052 str[lenp] = '|'; 6053 memcpy(&str[lenp + 1], token2, lenn); 6054 str[lenn + lenp + 1] = 0; 6055 6056 atom->valuep = str; 6057 } 6058 atom->data = data; 6059 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 6060 atom->min = min; 6061 atom->max = max; 6062 /* 6063 * associate a counter to the transition. 6064 */ 6065 counter = xmlRegGetCounter(am); 6066 am->counters[counter].min = 1; 6067 am->counters[counter].max = 1; 6068 6069 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6070 if (to == NULL) { 6071 to = xmlRegNewState(am); 6072 xmlRegStatePush(am, to); 6073 } 6074 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6075 xmlRegAtomPush(am, atom); 6076 am->state = to; 6077 return(to); 6078 } 6079 6080 6081 6082 /** 6083 * xmlAutomataNewOnceTrans: 6084 * @am: an automata 6085 * @from: the starting point of the transition 6086 * @to: the target point of the transition or NULL 6087 * @token: the input string associated to that transition 6088 * @min: the minimum successive occurences of token 6089 * @max: the maximum successive occurences of token 6090 * @data: data associated to the transition 6091 * 6092 * If @to is NULL, this creates first a new target state in the automata 6093 * and then adds a transition from the @from state to the target state 6094 * activated by a succession of input of value @token and whose number 6095 * is between @min and @max, moreover that transition can only be crossed 6096 * once. 6097 * 6098 * Returns the target state or NULL in case of error 6099 */ 6100 xmlAutomataStatePtr 6101 xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6102 xmlAutomataStatePtr to, const xmlChar *token, 6103 int min, int max, void *data) { 6104 xmlRegAtomPtr atom; 6105 int counter; 6106 6107 if ((am == NULL) || (from == NULL) || (token == NULL)) 6108 return(NULL); 6109 if (min < 1) 6110 return(NULL); 6111 if ((max < min) || (max < 1)) 6112 return(NULL); 6113 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 6114 if (atom == NULL) 6115 return(NULL); 6116 atom->valuep = xmlStrdup(token); 6117 atom->data = data; 6118 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 6119 atom->min = min; 6120 atom->max = max; 6121 /* 6122 * associate a counter to the transition. 6123 */ 6124 counter = xmlRegGetCounter(am); 6125 am->counters[counter].min = 1; 6126 am->counters[counter].max = 1; 6127 6128 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6129 if (to == NULL) { 6130 to = xmlRegNewState(am); 6131 xmlRegStatePush(am, to); 6132 } 6133 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6134 xmlRegAtomPush(am, atom); 6135 am->state = to; 6136 return(to); 6137 } 6138 6139 /** 6140 * xmlAutomataNewState: 6141 * @am: an automata 6142 * 6143 * Create a new disconnected state in the automata 6144 * 6145 * Returns the new state or NULL in case of error 6146 */ 6147 xmlAutomataStatePtr 6148 xmlAutomataNewState(xmlAutomataPtr am) { 6149 xmlAutomataStatePtr to; 6150 6151 if (am == NULL) 6152 return(NULL); 6153 to = xmlRegNewState(am); 6154 xmlRegStatePush(am, to); 6155 return(to); 6156 } 6157 6158 /** 6159 * xmlAutomataNewEpsilon: 6160 * @am: an automata 6161 * @from: the starting point of the transition 6162 * @to: the target point of the transition or NULL 6163 * 6164 * If @to is NULL, this creates first a new target state in the automata 6165 * and then adds an epsilon transition from the @from state to the 6166 * target state 6167 * 6168 * Returns the target state or NULL in case of error 6169 */ 6170 xmlAutomataStatePtr 6171 xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from, 6172 xmlAutomataStatePtr to) { 6173 if ((am == NULL) || (from == NULL)) 6174 return(NULL); 6175 xmlFAGenerateEpsilonTransition(am, from, to); 6176 if (to == NULL) 6177 return(am->state); 6178 return(to); 6179 } 6180 6181 /** 6182 * xmlAutomataNewAllTrans: 6183 * @am: an automata 6184 * @from: the starting point of the transition 6185 * @to: the target point of the transition or NULL 6186 * @lax: allow to transition if not all all transitions have been activated 6187 * 6188 * If @to is NULL, this creates first a new target state in the automata 6189 * and then adds a an ALL transition from the @from state to the 6190 * target state. That transition is an epsilon transition allowed only when 6191 * all transitions from the @from node have been activated. 6192 * 6193 * Returns the target state or NULL in case of error 6194 */ 6195 xmlAutomataStatePtr 6196 xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6197 xmlAutomataStatePtr to, int lax) { 6198 if ((am == NULL) || (from == NULL)) 6199 return(NULL); 6200 xmlFAGenerateAllTransition(am, from, to, lax); 6201 if (to == NULL) 6202 return(am->state); 6203 return(to); 6204 } 6205 6206 /** 6207 * xmlAutomataNewCounter: 6208 * @am: an automata 6209 * @min: the minimal value on the counter 6210 * @max: the maximal value on the counter 6211 * 6212 * Create a new counter 6213 * 6214 * Returns the counter number or -1 in case of error 6215 */ 6216 int 6217 xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) { 6218 int ret; 6219 6220 if (am == NULL) 6221 return(-1); 6222 6223 ret = xmlRegGetCounter(am); 6224 if (ret < 0) 6225 return(-1); 6226 am->counters[ret].min = min; 6227 am->counters[ret].max = max; 6228 return(ret); 6229 } 6230 6231 /** 6232 * xmlAutomataNewCountedTrans: 6233 * @am: an automata 6234 * @from: the starting point of the transition 6235 * @to: the target point of the transition or NULL 6236 * @counter: the counter associated to that transition 6237 * 6238 * If @to is NULL, this creates first a new target state in the automata 6239 * and then adds an epsilon transition from the @from state to the target state 6240 * which will increment the counter provided 6241 * 6242 * Returns the target state or NULL in case of error 6243 */ 6244 xmlAutomataStatePtr 6245 xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6246 xmlAutomataStatePtr to, int counter) { 6247 if ((am == NULL) || (from == NULL) || (counter < 0)) 6248 return(NULL); 6249 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter); 6250 if (to == NULL) 6251 return(am->state); 6252 return(to); 6253 } 6254 6255 /** 6256 * xmlAutomataNewCounterTrans: 6257 * @am: an automata 6258 * @from: the starting point of the transition 6259 * @to: the target point of the transition or NULL 6260 * @counter: the counter associated to that transition 6261 * 6262 * If @to is NULL, this creates first a new target state in the automata 6263 * and then adds an epsilon transition from the @from state to the target state 6264 * which will be allowed only if the counter is within the right range. 6265 * 6266 * Returns the target state or NULL in case of error 6267 */ 6268 xmlAutomataStatePtr 6269 xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6270 xmlAutomataStatePtr to, int counter) { 6271 if ((am == NULL) || (from == NULL) || (counter < 0)) 6272 return(NULL); 6273 xmlFAGenerateCountedTransition(am, from, to, counter); 6274 if (to == NULL) 6275 return(am->state); 6276 return(to); 6277 } 6278 6279 /** 6280 * xmlAutomataCompile: 6281 * @am: an automata 6282 * 6283 * Compile the automata into a Reg Exp ready for being executed. 6284 * The automata should be free after this point. 6285 * 6286 * Returns the compiled regexp or NULL in case of error 6287 */ 6288 xmlRegexpPtr 6289 xmlAutomataCompile(xmlAutomataPtr am) { 6290 xmlRegexpPtr ret; 6291 6292 if ((am == NULL) || (am->error != 0)) return(NULL); 6293 xmlFAEliminateEpsilonTransitions(am); 6294 /* xmlFAComputesDeterminism(am); */ 6295 ret = xmlRegEpxFromParse(am); 6296 6297 return(ret); 6298 } 6299 6300 /** 6301 * xmlAutomataIsDeterminist: 6302 * @am: an automata 6303 * 6304 * Checks if an automata is determinist. 6305 * 6306 * Returns 1 if true, 0 if not, and -1 in case of error 6307 */ 6308 int 6309 xmlAutomataIsDeterminist(xmlAutomataPtr am) { 6310 int ret; 6311 6312 if (am == NULL) 6313 return(-1); 6314 6315 ret = xmlFAComputesDeterminism(am); 6316 return(ret); 6317 } 6318 #endif /* LIBXML_AUTOMATA_ENABLED */ 6319 6320 #ifdef LIBXML_EXPR_ENABLED 6321 /************************************************************************ 6322 * * 6323 * Formal Expression handling code * 6324 * * 6325 ************************************************************************/ 6326 /************************************************************************ 6327 * * 6328 * Expression handling context * 6329 * * 6330 ************************************************************************/ 6331 6332 struct _xmlExpCtxt { 6333 xmlDictPtr dict; 6334 xmlExpNodePtr *table; 6335 int size; 6336 int nbElems; 6337 int nb_nodes; 6338 int maxNodes; 6339 const char *expr; 6340 const char *cur; 6341 int nb_cons; 6342 int tabSize; 6343 }; 6344 6345 /** 6346 * xmlExpNewCtxt: 6347 * @maxNodes: the maximum number of nodes 6348 * @dict: optional dictionnary to use internally 6349 * 6350 * Creates a new context for manipulating expressions 6351 * 6352 * Returns the context or NULL in case of error 6353 */ 6354 xmlExpCtxtPtr 6355 xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) { 6356 xmlExpCtxtPtr ret; 6357 int size = 256; 6358 6359 if (maxNodes <= 4096) 6360 maxNodes = 4096; 6361 6362 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt)); 6363 if (ret == NULL) 6364 return(NULL); 6365 memset(ret, 0, sizeof(xmlExpCtxt)); 6366 ret->size = size; 6367 ret->nbElems = 0; 6368 ret->maxNodes = maxNodes; 6369 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr)); 6370 if (ret->table == NULL) { 6371 xmlFree(ret); 6372 return(NULL); 6373 } 6374 memset(ret->table, 0, size * sizeof(xmlExpNodePtr)); 6375 if (dict == NULL) { 6376 ret->dict = xmlDictCreate(); 6377 if (ret->dict == NULL) { 6378 xmlFree(ret->table); 6379 xmlFree(ret); 6380 return(NULL); 6381 } 6382 } else { 6383 ret->dict = dict; 6384 xmlDictReference(ret->dict); 6385 } 6386 return(ret); 6387 } 6388 6389 /** 6390 * xmlExpFreeCtxt: 6391 * @ctxt: an expression context 6392 * 6393 * Free an expression context 6394 */ 6395 void 6396 xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) { 6397 if (ctxt == NULL) 6398 return; 6399 xmlDictFree(ctxt->dict); 6400 if (ctxt->table != NULL) 6401 xmlFree(ctxt->table); 6402 xmlFree(ctxt); 6403 } 6404 6405 /************************************************************************ 6406 * * 6407 * Structure associated to an expression node * 6408 * * 6409 ************************************************************************/ 6410 #define MAX_NODES 10000 6411 6412 /* #define DEBUG_DERIV */ 6413 6414 /* 6415 * TODO: 6416 * - Wildcards 6417 * - public API for creation 6418 * 6419 * Started 6420 * - regression testing 6421 * 6422 * Done 6423 * - split into module and test tool 6424 * - memleaks 6425 */ 6426 6427 typedef enum { 6428 XML_EXP_NILABLE = (1 << 0) 6429 } xmlExpNodeInfo; 6430 6431 #define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE) 6432 6433 struct _xmlExpNode { 6434 unsigned char type;/* xmlExpNodeType */ 6435 unsigned char info;/* OR of xmlExpNodeInfo */ 6436 unsigned short key; /* the hash key */ 6437 unsigned int ref; /* The number of references */ 6438 int c_max; /* the maximum length it can consume */ 6439 xmlExpNodePtr exp_left; 6440 xmlExpNodePtr next;/* the next node in the hash table or free list */ 6441 union { 6442 struct { 6443 int f_min; 6444 int f_max; 6445 } count; 6446 struct { 6447 xmlExpNodePtr f_right; 6448 } children; 6449 const xmlChar *f_str; 6450 } field; 6451 }; 6452 6453 #define exp_min field.count.f_min 6454 #define exp_max field.count.f_max 6455 /* #define exp_left field.children.f_left */ 6456 #define exp_right field.children.f_right 6457 #define exp_str field.f_str 6458 6459 static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type); 6460 static xmlExpNode forbiddenExpNode = { 6461 XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6462 }; 6463 xmlExpNodePtr forbiddenExp = &forbiddenExpNode; 6464 static xmlExpNode emptyExpNode = { 6465 XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6466 }; 6467 xmlExpNodePtr emptyExp = &emptyExpNode; 6468 6469 /************************************************************************ 6470 * * 6471 * The custom hash table for unicity and canonicalization * 6472 * of sub-expressions pointers * 6473 * * 6474 ************************************************************************/ 6475 /* 6476 * xmlExpHashNameComputeKey: 6477 * Calculate the hash key for a token 6478 */ 6479 static unsigned short 6480 xmlExpHashNameComputeKey(const xmlChar *name) { 6481 unsigned short value = 0L; 6482 char ch; 6483 6484 if (name != NULL) { 6485 value += 30 * (*name); 6486 while ((ch = *name++) != 0) { 6487 value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch); 6488 } 6489 } 6490 return (value); 6491 } 6492 6493 /* 6494 * xmlExpHashComputeKey: 6495 * Calculate the hash key for a compound expression 6496 */ 6497 static unsigned short 6498 xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left, 6499 xmlExpNodePtr right) { 6500 unsigned long value; 6501 unsigned short ret; 6502 6503 switch (type) { 6504 case XML_EXP_SEQ: 6505 value = left->key; 6506 value += right->key; 6507 value *= 3; 6508 ret = (unsigned short) value; 6509 break; 6510 case XML_EXP_OR: 6511 value = left->key; 6512 value += right->key; 6513 value *= 7; 6514 ret = (unsigned short) value; 6515 break; 6516 case XML_EXP_COUNT: 6517 value = left->key; 6518 value += right->key; 6519 ret = (unsigned short) value; 6520 break; 6521 default: 6522 ret = 0; 6523 } 6524 return(ret); 6525 } 6526 6527 6528 static xmlExpNodePtr 6529 xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) { 6530 xmlExpNodePtr ret; 6531 6532 if (ctxt->nb_nodes >= MAX_NODES) 6533 return(NULL); 6534 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode)); 6535 if (ret == NULL) 6536 return(NULL); 6537 memset(ret, 0, sizeof(xmlExpNode)); 6538 ret->type = type; 6539 ret->next = NULL; 6540 ctxt->nb_nodes++; 6541 ctxt->nb_cons++; 6542 return(ret); 6543 } 6544 6545 /** 6546 * xmlExpHashGetEntry: 6547 * @table: the hash table 6548 * 6549 * Get the unique entry from the hash table. The entry is created if 6550 * needed. @left and @right are consumed, i.e. their ref count will 6551 * be decremented by the operation. 6552 * 6553 * Returns the pointer or NULL in case of error 6554 */ 6555 static xmlExpNodePtr 6556 xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type, 6557 xmlExpNodePtr left, xmlExpNodePtr right, 6558 const xmlChar *name, int min, int max) { 6559 unsigned short kbase, key; 6560 xmlExpNodePtr entry; 6561 xmlExpNodePtr insert; 6562 6563 if (ctxt == NULL) 6564 return(NULL); 6565 6566 /* 6567 * Check for duplicate and insertion location. 6568 */ 6569 if (type == XML_EXP_ATOM) { 6570 kbase = xmlExpHashNameComputeKey(name); 6571 } else if (type == XML_EXP_COUNT) { 6572 /* COUNT reduction rule 1 */ 6573 /* a{1} -> a */ 6574 if (min == max) { 6575 if (min == 1) { 6576 return(left); 6577 } 6578 if (min == 0) { 6579 xmlExpFree(ctxt, left); 6580 return(emptyExp); 6581 } 6582 } 6583 if (min < 0) { 6584 xmlExpFree(ctxt, left); 6585 return(forbiddenExp); 6586 } 6587 if (max == -1) 6588 kbase = min + 79; 6589 else 6590 kbase = max - min; 6591 kbase += left->key; 6592 } else if (type == XML_EXP_OR) { 6593 /* Forbid reduction rules */ 6594 if (left->type == XML_EXP_FORBID) { 6595 xmlExpFree(ctxt, left); 6596 return(right); 6597 } 6598 if (right->type == XML_EXP_FORBID) { 6599 xmlExpFree(ctxt, right); 6600 return(left); 6601 } 6602 6603 /* OR reduction rule 1 */ 6604 /* a | a reduced to a */ 6605 if (left == right) { 6606 left->ref--; 6607 return(left); 6608 } 6609 /* OR canonicalization rule 1 */ 6610 /* linearize (a | b) | c into a | (b | c) */ 6611 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) { 6612 xmlExpNodePtr tmp = left; 6613 left = right; 6614 right = tmp; 6615 } 6616 /* OR reduction rule 2 */ 6617 /* a | (a | b) and b | (a | b) are reduced to a | b */ 6618 if (right->type == XML_EXP_OR) { 6619 if ((left == right->exp_left) || 6620 (left == right->exp_right)) { 6621 xmlExpFree(ctxt, left); 6622 return(right); 6623 } 6624 } 6625 /* OR canonicalization rule 2 */ 6626 /* linearize (a | b) | c into a | (b | c) */ 6627 if (left->type == XML_EXP_OR) { 6628 xmlExpNodePtr tmp; 6629 6630 /* OR canonicalization rule 2 */ 6631 if ((left->exp_right->type != XML_EXP_OR) && 6632 (left->exp_right->key < left->exp_left->key)) { 6633 tmp = left->exp_right; 6634 left->exp_right = left->exp_left; 6635 left->exp_left = tmp; 6636 } 6637 left->exp_right->ref++; 6638 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right, 6639 NULL, 0, 0); 6640 left->exp_left->ref++; 6641 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp, 6642 NULL, 0, 0); 6643 6644 xmlExpFree(ctxt, left); 6645 return(tmp); 6646 } 6647 if (right->type == XML_EXP_OR) { 6648 /* Ordering in the tree */ 6649 /* C | (A | B) -> A | (B | C) */ 6650 if (left->key > right->exp_right->key) { 6651 xmlExpNodePtr tmp; 6652 right->exp_right->ref++; 6653 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right, 6654 left, NULL, 0, 0); 6655 right->exp_left->ref++; 6656 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6657 tmp, NULL, 0, 0); 6658 xmlExpFree(ctxt, right); 6659 return(tmp); 6660 } 6661 /* Ordering in the tree */ 6662 /* B | (A | C) -> A | (B | C) */ 6663 if (left->key > right->exp_left->key) { 6664 xmlExpNodePtr tmp; 6665 right->exp_right->ref++; 6666 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, 6667 right->exp_right, NULL, 0, 0); 6668 right->exp_left->ref++; 6669 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6670 tmp, NULL, 0, 0); 6671 xmlExpFree(ctxt, right); 6672 return(tmp); 6673 } 6674 } 6675 /* we know both types are != XML_EXP_OR here */ 6676 else if (left->key > right->key) { 6677 xmlExpNodePtr tmp = left; 6678 left = right; 6679 right = tmp; 6680 } 6681 kbase = xmlExpHashComputeKey(type, left, right); 6682 } else if (type == XML_EXP_SEQ) { 6683 /* Forbid reduction rules */ 6684 if (left->type == XML_EXP_FORBID) { 6685 xmlExpFree(ctxt, right); 6686 return(left); 6687 } 6688 if (right->type == XML_EXP_FORBID) { 6689 xmlExpFree(ctxt, left); 6690 return(right); 6691 } 6692 /* Empty reduction rules */ 6693 if (right->type == XML_EXP_EMPTY) { 6694 return(left); 6695 } 6696 if (left->type == XML_EXP_EMPTY) { 6697 return(right); 6698 } 6699 kbase = xmlExpHashComputeKey(type, left, right); 6700 } else 6701 return(NULL); 6702 6703 key = kbase % ctxt->size; 6704 if (ctxt->table[key] != NULL) { 6705 for (insert = ctxt->table[key]; insert != NULL; 6706 insert = insert->next) { 6707 if ((insert->key == kbase) && 6708 (insert->type == type)) { 6709 if (type == XML_EXP_ATOM) { 6710 if (name == insert->exp_str) { 6711 insert->ref++; 6712 return(insert); 6713 } 6714 } else if (type == XML_EXP_COUNT) { 6715 if ((insert->exp_min == min) && (insert->exp_max == max) && 6716 (insert->exp_left == left)) { 6717 insert->ref++; 6718 left->ref--; 6719 return(insert); 6720 } 6721 } else if ((insert->exp_left == left) && 6722 (insert->exp_right == right)) { 6723 insert->ref++; 6724 left->ref--; 6725 right->ref--; 6726 return(insert); 6727 } 6728 } 6729 } 6730 } 6731 6732 entry = xmlExpNewNode(ctxt, type); 6733 if (entry == NULL) 6734 return(NULL); 6735 entry->key = kbase; 6736 if (type == XML_EXP_ATOM) { 6737 entry->exp_str = name; 6738 entry->c_max = 1; 6739 } else if (type == XML_EXP_COUNT) { 6740 entry->exp_min = min; 6741 entry->exp_max = max; 6742 entry->exp_left = left; 6743 if ((min == 0) || (IS_NILLABLE(left))) 6744 entry->info |= XML_EXP_NILABLE; 6745 if (max < 0) 6746 entry->c_max = -1; 6747 else 6748 entry->c_max = max * entry->exp_left->c_max; 6749 } else { 6750 entry->exp_left = left; 6751 entry->exp_right = right; 6752 if (type == XML_EXP_OR) { 6753 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right))) 6754 entry->info |= XML_EXP_NILABLE; 6755 if ((entry->exp_left->c_max == -1) || 6756 (entry->exp_right->c_max == -1)) 6757 entry->c_max = -1; 6758 else if (entry->exp_left->c_max > entry->exp_right->c_max) 6759 entry->c_max = entry->exp_left->c_max; 6760 else 6761 entry->c_max = entry->exp_right->c_max; 6762 } else { 6763 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right))) 6764 entry->info |= XML_EXP_NILABLE; 6765 if ((entry->exp_left->c_max == -1) || 6766 (entry->exp_right->c_max == -1)) 6767 entry->c_max = -1; 6768 else 6769 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max; 6770 } 6771 } 6772 entry->ref = 1; 6773 if (ctxt->table[key] != NULL) 6774 entry->next = ctxt->table[key]; 6775 6776 ctxt->table[key] = entry; 6777 ctxt->nbElems++; 6778 6779 return(entry); 6780 } 6781 6782 /** 6783 * xmlExpFree: 6784 * @ctxt: the expression context 6785 * @exp: the expression 6786 * 6787 * Dereference the expression 6788 */ 6789 void 6790 xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) { 6791 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp)) 6792 return; 6793 exp->ref--; 6794 if (exp->ref == 0) { 6795 unsigned short key; 6796 6797 /* Unlink it first from the hash table */ 6798 key = exp->key % ctxt->size; 6799 if (ctxt->table[key] == exp) { 6800 ctxt->table[key] = exp->next; 6801 } else { 6802 xmlExpNodePtr tmp; 6803 6804 tmp = ctxt->table[key]; 6805 while (tmp != NULL) { 6806 if (tmp->next == exp) { 6807 tmp->next = exp->next; 6808 break; 6809 } 6810 tmp = tmp->next; 6811 } 6812 } 6813 6814 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) { 6815 xmlExpFree(ctxt, exp->exp_left); 6816 xmlExpFree(ctxt, exp->exp_right); 6817 } else if (exp->type == XML_EXP_COUNT) { 6818 xmlExpFree(ctxt, exp->exp_left); 6819 } 6820 xmlFree(exp); 6821 ctxt->nb_nodes--; 6822 } 6823 } 6824 6825 /** 6826 * xmlExpRef: 6827 * @exp: the expression 6828 * 6829 * Increase the reference count of the expression 6830 */ 6831 void 6832 xmlExpRef(xmlExpNodePtr exp) { 6833 if (exp != NULL) 6834 exp->ref++; 6835 } 6836 6837 /** 6838 * xmlExpNewAtom: 6839 * @ctxt: the expression context 6840 * @name: the atom name 6841 * @len: the atom name length in byte (or -1); 6842 * 6843 * Get the atom associated to this name from that context 6844 * 6845 * Returns the node or NULL in case of error 6846 */ 6847 xmlExpNodePtr 6848 xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) { 6849 if ((ctxt == NULL) || (name == NULL)) 6850 return(NULL); 6851 name = xmlDictLookup(ctxt->dict, name, len); 6852 if (name == NULL) 6853 return(NULL); 6854 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0)); 6855 } 6856 6857 /** 6858 * xmlExpNewOr: 6859 * @ctxt: the expression context 6860 * @left: left expression 6861 * @right: right expression 6862 * 6863 * Get the atom associated to the choice @left | @right 6864 * Note that @left and @right are consumed in the operation, to keep 6865 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6866 * this is true even in case of failure (unless ctxt == NULL). 6867 * 6868 * Returns the node or NULL in case of error 6869 */ 6870 xmlExpNodePtr 6871 xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6872 if (ctxt == NULL) 6873 return(NULL); 6874 if ((left == NULL) || (right == NULL)) { 6875 xmlExpFree(ctxt, left); 6876 xmlExpFree(ctxt, right); 6877 return(NULL); 6878 } 6879 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0)); 6880 } 6881 6882 /** 6883 * xmlExpNewSeq: 6884 * @ctxt: the expression context 6885 * @left: left expression 6886 * @right: right expression 6887 * 6888 * Get the atom associated to the sequence @left , @right 6889 * Note that @left and @right are consumed in the operation, to keep 6890 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6891 * this is true even in case of failure (unless ctxt == NULL). 6892 * 6893 * Returns the node or NULL in case of error 6894 */ 6895 xmlExpNodePtr 6896 xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6897 if (ctxt == NULL) 6898 return(NULL); 6899 if ((left == NULL) || (right == NULL)) { 6900 xmlExpFree(ctxt, left); 6901 xmlExpFree(ctxt, right); 6902 return(NULL); 6903 } 6904 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0)); 6905 } 6906 6907 /** 6908 * xmlExpNewRange: 6909 * @ctxt: the expression context 6910 * @subset: the expression to be repeated 6911 * @min: the lower bound for the repetition 6912 * @max: the upper bound for the repetition, -1 means infinite 6913 * 6914 * Get the atom associated to the range (@subset){@min, @max} 6915 * Note that @subset is consumed in the operation, to keep 6916 * an handle on it use xmlExpRef() and use xmlExpFree() to release it, 6917 * this is true even in case of failure (unless ctxt == NULL). 6918 * 6919 * Returns the node or NULL in case of error 6920 */ 6921 xmlExpNodePtr 6922 xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) { 6923 if (ctxt == NULL) 6924 return(NULL); 6925 if ((subset == NULL) || (min < 0) || (max < -1) || 6926 ((max >= 0) && (min > max))) { 6927 xmlExpFree(ctxt, subset); 6928 return(NULL); 6929 } 6930 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset, 6931 NULL, NULL, min, max)); 6932 } 6933 6934 /************************************************************************ 6935 * * 6936 * Public API for operations on expressions * 6937 * * 6938 ************************************************************************/ 6939 6940 static int 6941 xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6942 const xmlChar**list, int len, int nb) { 6943 int tmp, tmp2; 6944 tail: 6945 switch (exp->type) { 6946 case XML_EXP_EMPTY: 6947 return(0); 6948 case XML_EXP_ATOM: 6949 for (tmp = 0;tmp < nb;tmp++) 6950 if (list[tmp] == exp->exp_str) 6951 return(0); 6952 if (nb >= len) 6953 return(-2); 6954 list[nb] = exp->exp_str; 6955 return(1); 6956 case XML_EXP_COUNT: 6957 exp = exp->exp_left; 6958 goto tail; 6959 case XML_EXP_SEQ: 6960 case XML_EXP_OR: 6961 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb); 6962 if (tmp < 0) 6963 return(tmp); 6964 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len, 6965 nb + tmp); 6966 if (tmp2 < 0) 6967 return(tmp2); 6968 return(tmp + tmp2); 6969 } 6970 return(-1); 6971 } 6972 6973 /** 6974 * xmlExpGetLanguage: 6975 * @ctxt: the expression context 6976 * @exp: the expression 6977 * @langList: where to store the tokens 6978 * @len: the allocated length of @list 6979 * 6980 * Find all the strings used in @exp and store them in @list 6981 * 6982 * Returns the number of unique strings found, -1 in case of errors and 6983 * -2 if there is more than @len strings 6984 */ 6985 int 6986 xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6987 const xmlChar**langList, int len) { 6988 if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0)) 6989 return(-1); 6990 return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0)); 6991 } 6992 6993 static int 6994 xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6995 const xmlChar**list, int len, int nb) { 6996 int tmp, tmp2; 6997 tail: 6998 switch (exp->type) { 6999 case XML_EXP_FORBID: 7000 return(0); 7001 case XML_EXP_EMPTY: 7002 return(0); 7003 case XML_EXP_ATOM: 7004 for (tmp = 0;tmp < nb;tmp++) 7005 if (list[tmp] == exp->exp_str) 7006 return(0); 7007 if (nb >= len) 7008 return(-2); 7009 list[nb] = exp->exp_str; 7010 return(1); 7011 case XML_EXP_COUNT: 7012 exp = exp->exp_left; 7013 goto tail; 7014 case XML_EXP_SEQ: 7015 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 7016 if (tmp < 0) 7017 return(tmp); 7018 if (IS_NILLABLE(exp->exp_left)) { 7019 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 7020 nb + tmp); 7021 if (tmp2 < 0) 7022 return(tmp2); 7023 tmp += tmp2; 7024 } 7025 return(tmp); 7026 case XML_EXP_OR: 7027 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 7028 if (tmp < 0) 7029 return(tmp); 7030 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 7031 nb + tmp); 7032 if (tmp2 < 0) 7033 return(tmp2); 7034 return(tmp + tmp2); 7035 } 7036 return(-1); 7037 } 7038 7039 /** 7040 * xmlExpGetStart: 7041 * @ctxt: the expression context 7042 * @exp: the expression 7043 * @tokList: where to store the tokens 7044 * @len: the allocated length of @list 7045 * 7046 * Find all the strings that appears at the start of the languages 7047 * accepted by @exp and store them in @list. E.g. for (a, b) | c 7048 * it will return the list [a, c] 7049 * 7050 * Returns the number of unique strings found, -1 in case of errors and 7051 * -2 if there is more than @len strings 7052 */ 7053 int 7054 xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7055 const xmlChar**tokList, int len) { 7056 if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0)) 7057 return(-1); 7058 return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0)); 7059 } 7060 7061 /** 7062 * xmlExpIsNillable: 7063 * @exp: the expression 7064 * 7065 * Finds if the expression is nillable, i.e. if it accepts the empty sequqnce 7066 * 7067 * Returns 1 if nillable, 0 if not and -1 in case of error 7068 */ 7069 int 7070 xmlExpIsNillable(xmlExpNodePtr exp) { 7071 if (exp == NULL) 7072 return(-1); 7073 return(IS_NILLABLE(exp) != 0); 7074 } 7075 7076 static xmlExpNodePtr 7077 xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str) 7078 { 7079 xmlExpNodePtr ret; 7080 7081 switch (exp->type) { 7082 case XML_EXP_EMPTY: 7083 return(forbiddenExp); 7084 case XML_EXP_FORBID: 7085 return(forbiddenExp); 7086 case XML_EXP_ATOM: 7087 if (exp->exp_str == str) { 7088 #ifdef DEBUG_DERIV 7089 printf("deriv atom: equal => Empty\n"); 7090 #endif 7091 ret = emptyExp; 7092 } else { 7093 #ifdef DEBUG_DERIV 7094 printf("deriv atom: mismatch => forbid\n"); 7095 #endif 7096 /* TODO wildcards here */ 7097 ret = forbiddenExp; 7098 } 7099 return(ret); 7100 case XML_EXP_OR: { 7101 xmlExpNodePtr tmp; 7102 7103 #ifdef DEBUG_DERIV 7104 printf("deriv or: => or(derivs)\n"); 7105 #endif 7106 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7107 if (tmp == NULL) { 7108 return(NULL); 7109 } 7110 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 7111 if (ret == NULL) { 7112 xmlExpFree(ctxt, tmp); 7113 return(NULL); 7114 } 7115 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, 7116 NULL, 0, 0); 7117 return(ret); 7118 } 7119 case XML_EXP_SEQ: 7120 #ifdef DEBUG_DERIV 7121 printf("deriv seq: starting with left\n"); 7122 #endif 7123 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7124 if (ret == NULL) { 7125 return(NULL); 7126 } else if (ret == forbiddenExp) { 7127 if (IS_NILLABLE(exp->exp_left)) { 7128 #ifdef DEBUG_DERIV 7129 printf("deriv seq: left failed but nillable\n"); 7130 #endif 7131 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 7132 } 7133 } else { 7134 #ifdef DEBUG_DERIV 7135 printf("deriv seq: left match => sequence\n"); 7136 #endif 7137 exp->exp_right->ref++; 7138 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right, 7139 NULL, 0, 0); 7140 } 7141 return(ret); 7142 case XML_EXP_COUNT: { 7143 int min, max; 7144 xmlExpNodePtr tmp; 7145 7146 if (exp->exp_max == 0) 7147 return(forbiddenExp); 7148 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7149 if (ret == NULL) 7150 return(NULL); 7151 if (ret == forbiddenExp) { 7152 #ifdef DEBUG_DERIV 7153 printf("deriv count: pattern mismatch => forbid\n"); 7154 #endif 7155 return(ret); 7156 } 7157 if (exp->exp_max == 1) 7158 return(ret); 7159 if (exp->exp_max < 0) /* unbounded */ 7160 max = -1; 7161 else 7162 max = exp->exp_max - 1; 7163 if (exp->exp_min > 0) 7164 min = exp->exp_min - 1; 7165 else 7166 min = 0; 7167 exp->exp_left->ref++; 7168 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL, 7169 NULL, min, max); 7170 if (ret == emptyExp) { 7171 #ifdef DEBUG_DERIV 7172 printf("deriv count: match to empty => new count\n"); 7173 #endif 7174 return(tmp); 7175 } 7176 #ifdef DEBUG_DERIV 7177 printf("deriv count: match => sequence with new count\n"); 7178 #endif 7179 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp, 7180 NULL, 0, 0)); 7181 } 7182 } 7183 return(NULL); 7184 } 7185 7186 /** 7187 * xmlExpStringDerive: 7188 * @ctxt: the expression context 7189 * @exp: the expression 7190 * @str: the string 7191 * @len: the string len in bytes if available 7192 * 7193 * Do one step of Brzozowski derivation of the expression @exp with 7194 * respect to the input string 7195 * 7196 * Returns the resulting expression or NULL in case of internal error 7197 */ 7198 xmlExpNodePtr 7199 xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7200 const xmlChar *str, int len) { 7201 const xmlChar *input; 7202 7203 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) { 7204 return(NULL); 7205 } 7206 /* 7207 * check the string is in the dictionnary, if yes use an interned 7208 * copy, otherwise we know it's not an acceptable input 7209 */ 7210 input = xmlDictExists(ctxt->dict, str, len); 7211 if (input == NULL) { 7212 return(forbiddenExp); 7213 } 7214 return(xmlExpStringDeriveInt(ctxt, exp, input)); 7215 } 7216 7217 static int 7218 xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) { 7219 int ret = 1; 7220 7221 if (sub->c_max == -1) { 7222 if (exp->c_max != -1) 7223 ret = 0; 7224 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) { 7225 ret = 0; 7226 } 7227 #if 0 7228 if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp))) 7229 ret = 0; 7230 #endif 7231 return(ret); 7232 } 7233 7234 static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7235 xmlExpNodePtr sub); 7236 /** 7237 * xmlExpDivide: 7238 * @ctxt: the expressions context 7239 * @exp: the englobing expression 7240 * @sub: the subexpression 7241 * @mult: the multiple expression 7242 * @remain: the remain from the derivation of the multiple 7243 * 7244 * Check if exp is a multiple of sub, i.e. if there is a finite number n 7245 * so that sub{n} subsume exp 7246 * 7247 * Returns the multiple value if successful, 0 if it is not a multiple 7248 * and -1 in case of internel error. 7249 */ 7250 7251 static int 7252 xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub, 7253 xmlExpNodePtr *mult, xmlExpNodePtr *remain) { 7254 int i; 7255 xmlExpNodePtr tmp, tmp2; 7256 7257 if (mult != NULL) *mult = NULL; 7258 if (remain != NULL) *remain = NULL; 7259 if (exp->c_max == -1) return(0); 7260 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0); 7261 7262 for (i = 1;i <= exp->c_max;i++) { 7263 sub->ref++; 7264 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7265 sub, NULL, NULL, i, i); 7266 if (tmp == NULL) { 7267 return(-1); 7268 } 7269 if (!xmlExpCheckCard(tmp, exp)) { 7270 xmlExpFree(ctxt, tmp); 7271 continue; 7272 } 7273 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp); 7274 if (tmp2 == NULL) { 7275 xmlExpFree(ctxt, tmp); 7276 return(-1); 7277 } 7278 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) { 7279 if (remain != NULL) 7280 *remain = tmp2; 7281 else 7282 xmlExpFree(ctxt, tmp2); 7283 if (mult != NULL) 7284 *mult = tmp; 7285 else 7286 xmlExpFree(ctxt, tmp); 7287 #ifdef DEBUG_DERIV 7288 printf("Divide succeeded %d\n", i); 7289 #endif 7290 return(i); 7291 } 7292 xmlExpFree(ctxt, tmp); 7293 xmlExpFree(ctxt, tmp2); 7294 } 7295 #ifdef DEBUG_DERIV 7296 printf("Divide failed\n"); 7297 #endif 7298 return(0); 7299 } 7300 7301 /** 7302 * xmlExpExpDeriveInt: 7303 * @ctxt: the expressions context 7304 * @exp: the englobing expression 7305 * @sub: the subexpression 7306 * 7307 * Try to do a step of Brzozowski derivation but at a higher level 7308 * the input being a subexpression. 7309 * 7310 * Returns the resulting expression or NULL in case of internal error 7311 */ 7312 static xmlExpNodePtr 7313 xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7314 xmlExpNodePtr ret, tmp, tmp2, tmp3; 7315 const xmlChar **tab; 7316 int len, i; 7317 7318 /* 7319 * In case of equality and if the expression can only consume a finite 7320 * amount, then the derivation is empty 7321 */ 7322 if ((exp == sub) && (exp->c_max >= 0)) { 7323 #ifdef DEBUG_DERIV 7324 printf("Equal(exp, sub) and finite -> Empty\n"); 7325 #endif 7326 return(emptyExp); 7327 } 7328 /* 7329 * decompose sub sequence first 7330 */ 7331 if (sub->type == XML_EXP_EMPTY) { 7332 #ifdef DEBUG_DERIV 7333 printf("Empty(sub) -> Empty\n"); 7334 #endif 7335 exp->ref++; 7336 return(exp); 7337 } 7338 if (sub->type == XML_EXP_SEQ) { 7339 #ifdef DEBUG_DERIV 7340 printf("Seq(sub) -> decompose\n"); 7341 #endif 7342 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7343 if (tmp == NULL) 7344 return(NULL); 7345 if (tmp == forbiddenExp) 7346 return(tmp); 7347 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right); 7348 xmlExpFree(ctxt, tmp); 7349 return(ret); 7350 } 7351 if (sub->type == XML_EXP_OR) { 7352 #ifdef DEBUG_DERIV 7353 printf("Or(sub) -> decompose\n"); 7354 #endif 7355 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7356 if (tmp == forbiddenExp) 7357 return(tmp); 7358 if (tmp == NULL) 7359 return(NULL); 7360 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right); 7361 if ((ret == NULL) || (ret == forbiddenExp)) { 7362 xmlExpFree(ctxt, tmp); 7363 return(ret); 7364 } 7365 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0)); 7366 } 7367 if (!xmlExpCheckCard(exp, sub)) { 7368 #ifdef DEBUG_DERIV 7369 printf("CheckCard(exp, sub) failed -> Forbid\n"); 7370 #endif 7371 return(forbiddenExp); 7372 } 7373 switch (exp->type) { 7374 case XML_EXP_EMPTY: 7375 if (sub == emptyExp) 7376 return(emptyExp); 7377 #ifdef DEBUG_DERIV 7378 printf("Empty(exp) -> Forbid\n"); 7379 #endif 7380 return(forbiddenExp); 7381 case XML_EXP_FORBID: 7382 #ifdef DEBUG_DERIV 7383 printf("Forbid(exp) -> Forbid\n"); 7384 #endif 7385 return(forbiddenExp); 7386 case XML_EXP_ATOM: 7387 if (sub->type == XML_EXP_ATOM) { 7388 /* TODO: handle wildcards */ 7389 if (exp->exp_str == sub->exp_str) { 7390 #ifdef DEBUG_DERIV 7391 printf("Atom match -> Empty\n"); 7392 #endif 7393 return(emptyExp); 7394 } 7395 #ifdef DEBUG_DERIV 7396 printf("Atom mismatch -> Forbid\n"); 7397 #endif 7398 return(forbiddenExp); 7399 } 7400 if ((sub->type == XML_EXP_COUNT) && 7401 (sub->exp_max == 1) && 7402 (sub->exp_left->type == XML_EXP_ATOM)) { 7403 /* TODO: handle wildcards */ 7404 if (exp->exp_str == sub->exp_left->exp_str) { 7405 #ifdef DEBUG_DERIV 7406 printf("Atom match -> Empty\n"); 7407 #endif 7408 return(emptyExp); 7409 } 7410 #ifdef DEBUG_DERIV 7411 printf("Atom mismatch -> Forbid\n"); 7412 #endif 7413 return(forbiddenExp); 7414 } 7415 #ifdef DEBUG_DERIV 7416 printf("Compex exp vs Atom -> Forbid\n"); 7417 #endif 7418 return(forbiddenExp); 7419 case XML_EXP_SEQ: 7420 /* try to get the sequence consumed only if possible */ 7421 if (xmlExpCheckCard(exp->exp_left, sub)) { 7422 /* See if the sequence can be consumed directly */ 7423 #ifdef DEBUG_DERIV 7424 printf("Seq trying left only\n"); 7425 #endif 7426 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7427 if ((ret != forbiddenExp) && (ret != NULL)) { 7428 #ifdef DEBUG_DERIV 7429 printf("Seq trying left only worked\n"); 7430 #endif 7431 /* 7432 * TODO: assumption here that we are determinist 7433 * i.e. we won't get to a nillable exp left 7434 * subset which could be matched by the right 7435 * part too. 7436 * e.g.: (a | b)+,(a | c) and 'a+,a' 7437 */ 7438 exp->exp_right->ref++; 7439 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7440 exp->exp_right, NULL, 0, 0)); 7441 } 7442 #ifdef DEBUG_DERIV 7443 } else { 7444 printf("Seq: left too short\n"); 7445 #endif 7446 } 7447 /* Try instead to decompose */ 7448 if (sub->type == XML_EXP_COUNT) { 7449 int min, max; 7450 7451 #ifdef DEBUG_DERIV 7452 printf("Seq: sub is a count\n"); 7453 #endif 7454 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7455 if (ret == NULL) 7456 return(NULL); 7457 if (ret != forbiddenExp) { 7458 #ifdef DEBUG_DERIV 7459 printf("Seq , Count match on left\n"); 7460 #endif 7461 if (sub->exp_max < 0) 7462 max = -1; 7463 else 7464 max = sub->exp_max -1; 7465 if (sub->exp_min > 0) 7466 min = sub->exp_min -1; 7467 else 7468 min = 0; 7469 exp->exp_right->ref++; 7470 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7471 exp->exp_right, NULL, 0, 0); 7472 if (tmp == NULL) 7473 return(NULL); 7474 7475 sub->exp_left->ref++; 7476 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7477 sub->exp_left, NULL, NULL, min, max); 7478 if (tmp2 == NULL) { 7479 xmlExpFree(ctxt, tmp); 7480 return(NULL); 7481 } 7482 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7483 xmlExpFree(ctxt, tmp); 7484 xmlExpFree(ctxt, tmp2); 7485 return(ret); 7486 } 7487 } 7488 /* we made no progress on structured operations */ 7489 break; 7490 case XML_EXP_OR: 7491 #ifdef DEBUG_DERIV 7492 printf("Or , trying both side\n"); 7493 #endif 7494 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7495 if (ret == NULL) 7496 return(NULL); 7497 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub); 7498 if (tmp == NULL) { 7499 xmlExpFree(ctxt, ret); 7500 return(NULL); 7501 } 7502 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0)); 7503 case XML_EXP_COUNT: { 7504 int min, max; 7505 7506 if (sub->type == XML_EXP_COUNT) { 7507 /* 7508 * Try to see if the loop is completely subsumed 7509 */ 7510 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7511 if (tmp == NULL) 7512 return(NULL); 7513 if (tmp == forbiddenExp) { 7514 int mult; 7515 7516 #ifdef DEBUG_DERIV 7517 printf("Count, Count inner don't subsume\n"); 7518 #endif 7519 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left, 7520 NULL, &tmp); 7521 if (mult <= 0) { 7522 #ifdef DEBUG_DERIV 7523 printf("Count, Count not multiple => forbidden\n"); 7524 #endif 7525 return(forbiddenExp); 7526 } 7527 if (sub->exp_max == -1) { 7528 max = -1; 7529 if (exp->exp_max == -1) { 7530 if (exp->exp_min <= sub->exp_min * mult) 7531 min = 0; 7532 else 7533 min = exp->exp_min - sub->exp_min * mult; 7534 } else { 7535 #ifdef DEBUG_DERIV 7536 printf("Count, Count finite can't subsume infinite\n"); 7537 #endif 7538 xmlExpFree(ctxt, tmp); 7539 return(forbiddenExp); 7540 } 7541 } else { 7542 if (exp->exp_max == -1) { 7543 #ifdef DEBUG_DERIV 7544 printf("Infinite loop consume mult finite loop\n"); 7545 #endif 7546 if (exp->exp_min > sub->exp_min * mult) { 7547 max = -1; 7548 min = exp->exp_min - sub->exp_min * mult; 7549 } else { 7550 max = -1; 7551 min = 0; 7552 } 7553 } else { 7554 if (exp->exp_max < sub->exp_max * mult) { 7555 #ifdef DEBUG_DERIV 7556 printf("loops max mult mismatch => forbidden\n"); 7557 #endif 7558 xmlExpFree(ctxt, tmp); 7559 return(forbiddenExp); 7560 } 7561 if (sub->exp_max * mult > exp->exp_min) 7562 min = 0; 7563 else 7564 min = exp->exp_min - sub->exp_max * mult; 7565 max = exp->exp_max - sub->exp_max * mult; 7566 } 7567 } 7568 } else if (!IS_NILLABLE(tmp)) { 7569 /* 7570 * TODO: loop here to try to grow if working on finite 7571 * blocks. 7572 */ 7573 #ifdef DEBUG_DERIV 7574 printf("Count, Count remain not nillable => forbidden\n"); 7575 #endif 7576 xmlExpFree(ctxt, tmp); 7577 return(forbiddenExp); 7578 } else if (sub->exp_max == -1) { 7579 if (exp->exp_max == -1) { 7580 if (exp->exp_min <= sub->exp_min) { 7581 #ifdef DEBUG_DERIV 7582 printf("Infinite loops Okay => COUNT(0,Inf)\n"); 7583 #endif 7584 max = -1; 7585 min = 0; 7586 } else { 7587 #ifdef DEBUG_DERIV 7588 printf("Infinite loops min => Count(X,Inf)\n"); 7589 #endif 7590 max = -1; 7591 min = exp->exp_min - sub->exp_min; 7592 } 7593 } else if (exp->exp_min > sub->exp_min) { 7594 #ifdef DEBUG_DERIV 7595 printf("loops min mismatch 1 => forbidden ???\n"); 7596 #endif 7597 xmlExpFree(ctxt, tmp); 7598 return(forbiddenExp); 7599 } else { 7600 max = -1; 7601 min = 0; 7602 } 7603 } else { 7604 if (exp->exp_max == -1) { 7605 #ifdef DEBUG_DERIV 7606 printf("Infinite loop consume finite loop\n"); 7607 #endif 7608 if (exp->exp_min > sub->exp_min) { 7609 max = -1; 7610 min = exp->exp_min - sub->exp_min; 7611 } else { 7612 max = -1; 7613 min = 0; 7614 } 7615 } else { 7616 if (exp->exp_max < sub->exp_max) { 7617 #ifdef DEBUG_DERIV 7618 printf("loops max mismatch => forbidden\n"); 7619 #endif 7620 xmlExpFree(ctxt, tmp); 7621 return(forbiddenExp); 7622 } 7623 if (sub->exp_max > exp->exp_min) 7624 min = 0; 7625 else 7626 min = exp->exp_min - sub->exp_max; 7627 max = exp->exp_max - sub->exp_max; 7628 } 7629 } 7630 #ifdef DEBUG_DERIV 7631 printf("loops match => SEQ(COUNT())\n"); 7632 #endif 7633 exp->exp_left->ref++; 7634 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7635 NULL, NULL, min, max); 7636 if (tmp2 == NULL) { 7637 return(NULL); 7638 } 7639 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7640 NULL, 0, 0); 7641 return(ret); 7642 } 7643 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7644 if (tmp == NULL) 7645 return(NULL); 7646 if (tmp == forbiddenExp) { 7647 #ifdef DEBUG_DERIV 7648 printf("loop mismatch => forbidden\n"); 7649 #endif 7650 return(forbiddenExp); 7651 } 7652 if (exp->exp_min > 0) 7653 min = exp->exp_min - 1; 7654 else 7655 min = 0; 7656 if (exp->exp_max < 0) 7657 max = -1; 7658 else 7659 max = exp->exp_max - 1; 7660 7661 #ifdef DEBUG_DERIV 7662 printf("loop match => SEQ(COUNT())\n"); 7663 #endif 7664 exp->exp_left->ref++; 7665 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7666 NULL, NULL, min, max); 7667 if (tmp2 == NULL) 7668 return(NULL); 7669 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7670 NULL, 0, 0); 7671 return(ret); 7672 } 7673 } 7674 7675 #ifdef DEBUG_DERIV 7676 printf("Fallback to derivative\n"); 7677 #endif 7678 if (IS_NILLABLE(sub)) { 7679 if (!(IS_NILLABLE(exp))) 7680 return(forbiddenExp); 7681 else 7682 ret = emptyExp; 7683 } else 7684 ret = NULL; 7685 /* 7686 * here the structured derivation made no progress so 7687 * we use the default token based derivation to force one more step 7688 */ 7689 if (ctxt->tabSize == 0) 7690 ctxt->tabSize = 40; 7691 7692 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize * 7693 sizeof(const xmlChar *)); 7694 if (tab == NULL) { 7695 return(NULL); 7696 } 7697 7698 /* 7699 * collect all the strings accepted by the subexpression on input 7700 */ 7701 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7702 while (len < 0) { 7703 const xmlChar **temp; 7704 temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 * 7705 sizeof(const xmlChar *)); 7706 if (temp == NULL) { 7707 xmlFree((xmlChar **) tab); 7708 return(NULL); 7709 } 7710 tab = temp; 7711 ctxt->tabSize *= 2; 7712 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7713 } 7714 for (i = 0;i < len;i++) { 7715 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]); 7716 if ((tmp == NULL) || (tmp == forbiddenExp)) { 7717 xmlExpFree(ctxt, ret); 7718 xmlFree((xmlChar **) tab); 7719 return(tmp); 7720 } 7721 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]); 7722 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) { 7723 xmlExpFree(ctxt, tmp); 7724 xmlExpFree(ctxt, ret); 7725 xmlFree((xmlChar **) tab); 7726 return(tmp); 7727 } 7728 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7729 xmlExpFree(ctxt, tmp); 7730 xmlExpFree(ctxt, tmp2); 7731 7732 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) { 7733 xmlExpFree(ctxt, ret); 7734 xmlFree((xmlChar **) tab); 7735 return(tmp3); 7736 } 7737 7738 if (ret == NULL) 7739 ret = tmp3; 7740 else { 7741 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0); 7742 if (ret == NULL) { 7743 xmlFree((xmlChar **) tab); 7744 return(NULL); 7745 } 7746 } 7747 } 7748 xmlFree((xmlChar **) tab); 7749 return(ret); 7750 } 7751 7752 /** 7753 * xmlExpExpDerive: 7754 * @ctxt: the expressions context 7755 * @exp: the englobing expression 7756 * @sub: the subexpression 7757 * 7758 * Evaluates the expression resulting from @exp consuming a sub expression @sub 7759 * Based on algebraic derivation and sometimes direct Brzozowski derivation 7760 * it usually tatkes less than linear time and can handle expressions generating 7761 * infinite languages. 7762 * 7763 * Returns the resulting expression or NULL in case of internal error, the 7764 * result must be freed 7765 */ 7766 xmlExpNodePtr 7767 xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7768 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7769 return(NULL); 7770 7771 /* 7772 * O(1) speedups 7773 */ 7774 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7775 #ifdef DEBUG_DERIV 7776 printf("Sub nillable and not exp : can't subsume\n"); 7777 #endif 7778 return(forbiddenExp); 7779 } 7780 if (xmlExpCheckCard(exp, sub) == 0) { 7781 #ifdef DEBUG_DERIV 7782 printf("sub generate longuer sequances than exp : can't subsume\n"); 7783 #endif 7784 return(forbiddenExp); 7785 } 7786 return(xmlExpExpDeriveInt(ctxt, exp, sub)); 7787 } 7788 7789 /** 7790 * xmlExpSubsume: 7791 * @ctxt: the expressions context 7792 * @exp: the englobing expression 7793 * @sub: the subexpression 7794 * 7795 * Check whether @exp accepts all the languages accexpted by @sub 7796 * the input being a subexpression. 7797 * 7798 * Returns 1 if true 0 if false and -1 in case of failure. 7799 */ 7800 int 7801 xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7802 xmlExpNodePtr tmp; 7803 7804 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7805 return(-1); 7806 7807 /* 7808 * TODO: speedup by checking the language of sub is a subset of the 7809 * language of exp 7810 */ 7811 /* 7812 * O(1) speedups 7813 */ 7814 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7815 #ifdef DEBUG_DERIV 7816 printf("Sub nillable and not exp : can't subsume\n"); 7817 #endif 7818 return(0); 7819 } 7820 if (xmlExpCheckCard(exp, sub) == 0) { 7821 #ifdef DEBUG_DERIV 7822 printf("sub generate longuer sequances than exp : can't subsume\n"); 7823 #endif 7824 return(0); 7825 } 7826 tmp = xmlExpExpDeriveInt(ctxt, exp, sub); 7827 #ifdef DEBUG_DERIV 7828 printf("Result derivation :\n"); 7829 PRINT_EXP(tmp); 7830 #endif 7831 if (tmp == NULL) 7832 return(-1); 7833 if (tmp == forbiddenExp) 7834 return(0); 7835 if (tmp == emptyExp) 7836 return(1); 7837 if ((tmp != NULL) && (IS_NILLABLE(tmp))) { 7838 xmlExpFree(ctxt, tmp); 7839 return(1); 7840 } 7841 xmlExpFree(ctxt, tmp); 7842 return(0); 7843 } 7844 7845 /************************************************************************ 7846 * * 7847 * Parsing expression * 7848 * * 7849 ************************************************************************/ 7850 7851 static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt); 7852 7853 #undef CUR 7854 #define CUR (*ctxt->cur) 7855 #undef NEXT 7856 #define NEXT ctxt->cur++; 7857 #undef IS_BLANK 7858 #define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t')) 7859 #define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++; 7860 7861 static int 7862 xmlExpParseNumber(xmlExpCtxtPtr ctxt) { 7863 int ret = 0; 7864 7865 SKIP_BLANKS 7866 if (CUR == '*') { 7867 NEXT 7868 return(-1); 7869 } 7870 if ((CUR < '0') || (CUR > '9')) 7871 return(-1); 7872 while ((CUR >= '0') && (CUR <= '9')) { 7873 ret = ret * 10 + (CUR - '0'); 7874 NEXT 7875 } 7876 return(ret); 7877 } 7878 7879 static xmlExpNodePtr 7880 xmlExpParseOr(xmlExpCtxtPtr ctxt) { 7881 const char *base; 7882 xmlExpNodePtr ret; 7883 const xmlChar *val; 7884 7885 SKIP_BLANKS 7886 base = ctxt->cur; 7887 if (*ctxt->cur == '(') { 7888 NEXT 7889 ret = xmlExpParseExpr(ctxt); 7890 SKIP_BLANKS 7891 if (*ctxt->cur != ')') { 7892 fprintf(stderr, "unbalanced '(' : %s\n", base); 7893 xmlExpFree(ctxt, ret); 7894 return(NULL); 7895 } 7896 NEXT; 7897 SKIP_BLANKS 7898 goto parse_quantifier; 7899 } 7900 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') && 7901 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') && 7902 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}')) 7903 NEXT; 7904 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base); 7905 if (val == NULL) 7906 return(NULL); 7907 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0); 7908 if (ret == NULL) 7909 return(NULL); 7910 SKIP_BLANKS 7911 parse_quantifier: 7912 if (CUR == '{') { 7913 int min, max; 7914 7915 NEXT 7916 min = xmlExpParseNumber(ctxt); 7917 if (min < 0) { 7918 xmlExpFree(ctxt, ret); 7919 return(NULL); 7920 } 7921 SKIP_BLANKS 7922 if (CUR == ',') { 7923 NEXT 7924 max = xmlExpParseNumber(ctxt); 7925 SKIP_BLANKS 7926 } else 7927 max = min; 7928 if (CUR != '}') { 7929 xmlExpFree(ctxt, ret); 7930 return(NULL); 7931 } 7932 NEXT 7933 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7934 min, max); 7935 SKIP_BLANKS 7936 } else if (CUR == '?') { 7937 NEXT 7938 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7939 0, 1); 7940 SKIP_BLANKS 7941 } else if (CUR == '+') { 7942 NEXT 7943 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7944 1, -1); 7945 SKIP_BLANKS 7946 } else if (CUR == '*') { 7947 NEXT 7948 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7949 0, -1); 7950 SKIP_BLANKS 7951 } 7952 return(ret); 7953 } 7954 7955 7956 static xmlExpNodePtr 7957 xmlExpParseSeq(xmlExpCtxtPtr ctxt) { 7958 xmlExpNodePtr ret, right; 7959 7960 ret = xmlExpParseOr(ctxt); 7961 SKIP_BLANKS 7962 while (CUR == '|') { 7963 NEXT 7964 right = xmlExpParseOr(ctxt); 7965 if (right == NULL) { 7966 xmlExpFree(ctxt, ret); 7967 return(NULL); 7968 } 7969 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0); 7970 if (ret == NULL) 7971 return(NULL); 7972 } 7973 return(ret); 7974 } 7975 7976 static xmlExpNodePtr 7977 xmlExpParseExpr(xmlExpCtxtPtr ctxt) { 7978 xmlExpNodePtr ret, right; 7979 7980 ret = xmlExpParseSeq(ctxt); 7981 SKIP_BLANKS 7982 while (CUR == ',') { 7983 NEXT 7984 right = xmlExpParseSeq(ctxt); 7985 if (right == NULL) { 7986 xmlExpFree(ctxt, ret); 7987 return(NULL); 7988 } 7989 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0); 7990 if (ret == NULL) 7991 return(NULL); 7992 } 7993 return(ret); 7994 } 7995 7996 /** 7997 * xmlExpParse: 7998 * @ctxt: the expressions context 7999 * @expr: the 0 terminated string 8000 * 8001 * Minimal parser for regexps, it understand the following constructs 8002 * - string terminals 8003 * - choice operator | 8004 * - sequence operator , 8005 * - subexpressions (...) 8006 * - usual cardinality operators + * and ? 8007 * - finite sequences { min, max } 8008 * - infinite sequences { min, * } 8009 * There is minimal checkings made especially no checking on strings values 8010 * 8011 * Returns a new expression or NULL in case of failure 8012 */ 8013 xmlExpNodePtr 8014 xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) { 8015 xmlExpNodePtr ret; 8016 8017 ctxt->expr = expr; 8018 ctxt->cur = expr; 8019 8020 ret = xmlExpParseExpr(ctxt); 8021 SKIP_BLANKS 8022 if (*ctxt->cur != 0) { 8023 xmlExpFree(ctxt, ret); 8024 return(NULL); 8025 } 8026 return(ret); 8027 } 8028 8029 static void 8030 xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) { 8031 xmlExpNodePtr c; 8032 8033 if (expr == NULL) return; 8034 if (glob) xmlBufferWriteChar(buf, "("); 8035 switch (expr->type) { 8036 case XML_EXP_EMPTY: 8037 xmlBufferWriteChar(buf, "empty"); 8038 break; 8039 case XML_EXP_FORBID: 8040 xmlBufferWriteChar(buf, "forbidden"); 8041 break; 8042 case XML_EXP_ATOM: 8043 xmlBufferWriteCHAR(buf, expr->exp_str); 8044 break; 8045 case XML_EXP_SEQ: 8046 c = expr->exp_left; 8047 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8048 xmlExpDumpInt(buf, c, 1); 8049 else 8050 xmlExpDumpInt(buf, c, 0); 8051 xmlBufferWriteChar(buf, " , "); 8052 c = expr->exp_right; 8053 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8054 xmlExpDumpInt(buf, c, 1); 8055 else 8056 xmlExpDumpInt(buf, c, 0); 8057 break; 8058 case XML_EXP_OR: 8059 c = expr->exp_left; 8060 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8061 xmlExpDumpInt(buf, c, 1); 8062 else 8063 xmlExpDumpInt(buf, c, 0); 8064 xmlBufferWriteChar(buf, " | "); 8065 c = expr->exp_right; 8066 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8067 xmlExpDumpInt(buf, c, 1); 8068 else 8069 xmlExpDumpInt(buf, c, 0); 8070 break; 8071 case XML_EXP_COUNT: { 8072 char rep[40]; 8073 8074 c = expr->exp_left; 8075 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8076 xmlExpDumpInt(buf, c, 1); 8077 else 8078 xmlExpDumpInt(buf, c, 0); 8079 if ((expr->exp_min == 0) && (expr->exp_max == 1)) { 8080 rep[0] = '?'; 8081 rep[1] = 0; 8082 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) { 8083 rep[0] = '*'; 8084 rep[1] = 0; 8085 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) { 8086 rep[0] = '+'; 8087 rep[1] = 0; 8088 } else if (expr->exp_max == expr->exp_min) { 8089 snprintf(rep, 39, "{%d}", expr->exp_min); 8090 } else if (expr->exp_max < 0) { 8091 snprintf(rep, 39, "{%d,inf}", expr->exp_min); 8092 } else { 8093 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max); 8094 } 8095 rep[39] = 0; 8096 xmlBufferWriteChar(buf, rep); 8097 break; 8098 } 8099 default: 8100 fprintf(stderr, "Error in tree\n"); 8101 } 8102 if (glob) 8103 xmlBufferWriteChar(buf, ")"); 8104 } 8105 /** 8106 * xmlExpDump: 8107 * @buf: a buffer to receive the output 8108 * @expr: the compiled expression 8109 * 8110 * Serialize the expression as compiled to the buffer 8111 */ 8112 void 8113 xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) { 8114 if ((buf == NULL) || (expr == NULL)) 8115 return; 8116 xmlExpDumpInt(buf, expr, 0); 8117 } 8118 8119 /** 8120 * xmlExpMaxToken: 8121 * @expr: a compiled expression 8122 * 8123 * Indicate the maximum number of input a expression can accept 8124 * 8125 * Returns the maximum length or -1 in case of error 8126 */ 8127 int 8128 xmlExpMaxToken(xmlExpNodePtr expr) { 8129 if (expr == NULL) 8130 return(-1); 8131 return(expr->c_max); 8132 } 8133 8134 /** 8135 * xmlExpCtxtNbNodes: 8136 * @ctxt: an expression context 8137 * 8138 * Debugging facility provides the number of allocated nodes at a that point 8139 * 8140 * Returns the number of nodes in use or -1 in case of error 8141 */ 8142 int 8143 xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) { 8144 if (ctxt == NULL) 8145 return(-1); 8146 return(ctxt->nb_nodes); 8147 } 8148 8149 /** 8150 * xmlExpCtxtNbCons: 8151 * @ctxt: an expression context 8152 * 8153 * Debugging facility provides the number of allocated nodes over lifetime 8154 * 8155 * Returns the number of nodes ever allocated or -1 in case of error 8156 */ 8157 int 8158 xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) { 8159 if (ctxt == NULL) 8160 return(-1); 8161 return(ctxt->nb_cons); 8162 } 8163 8164 #endif /* LIBXML_EXPR_ENABLED */ 8165 #define bottom_xmlregexp 8166 #include "elfgcchack.h" 8167 #endif /* LIBXML_REGEXP_ENABLED */ 8168