1 /* 2 * regexp.c: generic and extensible Regular Expression engine 3 * 4 * Basically designed with the purpose of compiling regexps for 5 * the variety of validation/shemas mechanisms now available in 6 * XML related specifications these include: 7 * - XML-1.0 DTD validation 8 * - XML Schemas structure part 1 9 * - XML Schemas Datatypes part 2 especially Appendix F 10 * - RELAX-NG/TREX i.e. the counter proposal 11 * 12 * See Copyright for the status of this software. 13 * 14 * Daniel Veillard <veillard (at) redhat.com> 15 */ 16 17 #define IN_LIBXML 18 #include "libxml.h" 19 20 #ifdef LIBXML_REGEXP_ENABLED 21 22 /* #define DEBUG_ERR */ 23 24 #include <stdio.h> 25 #include <string.h> 26 #ifdef HAVE_LIMITS_H 27 #include <limits.h> 28 #endif 29 30 #include <libxml/tree.h> 31 #include <libxml/parserInternals.h> 32 #include <libxml/xmlregexp.h> 33 #include <libxml/xmlautomata.h> 34 #include <libxml/xmlunicode.h> 35 36 #ifndef INT_MAX 37 #define INT_MAX 123456789 /* easy to flag and big enough for our needs */ 38 #endif 39 40 /* #define DEBUG_REGEXP_GRAPH */ 41 /* #define DEBUG_REGEXP_EXEC */ 42 /* #define DEBUG_PUSH */ 43 /* #define DEBUG_COMPACTION */ 44 45 #define MAX_PUSH 10000000 46 47 #define ERROR(str) \ 48 ctxt->error = XML_REGEXP_COMPILE_ERROR; \ 49 xmlRegexpErrCompile(ctxt, str); 50 #define NEXT ctxt->cur++ 51 #define CUR (*(ctxt->cur)) 52 #define NXT(index) (ctxt->cur[index]) 53 54 #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l) 55 #define NEXTL(l) ctxt->cur += l; 56 #define XML_REG_STRING_SEPARATOR '|' 57 /* 58 * Need PREV to check on a '-' within a Character Group. May only be used 59 * when it's guaranteed that cur is not at the beginning of ctxt->string! 60 */ 61 #define PREV (ctxt->cur[-1]) 62 63 /** 64 * TODO: 65 * 66 * macro to flag unimplemented blocks 67 */ 68 #define TODO \ 69 xmlGenericError(xmlGenericErrorContext, \ 70 "Unimplemented block at %s:%d\n", \ 71 __FILE__, __LINE__); 72 73 /************************************************************************ 74 * * 75 * Datatypes and structures * 76 * * 77 ************************************************************************/ 78 79 /* 80 * Note: the order of the enums below is significant, do not shuffle 81 */ 82 typedef enum { 83 XML_REGEXP_EPSILON = 1, 84 XML_REGEXP_CHARVAL, 85 XML_REGEXP_RANGES, 86 XML_REGEXP_SUBREG, /* used for () sub regexps */ 87 XML_REGEXP_STRING, 88 XML_REGEXP_ANYCHAR, /* . */ 89 XML_REGEXP_ANYSPACE, /* \s */ 90 XML_REGEXP_NOTSPACE, /* \S */ 91 XML_REGEXP_INITNAME, /* \l */ 92 XML_REGEXP_NOTINITNAME, /* \L */ 93 XML_REGEXP_NAMECHAR, /* \c */ 94 XML_REGEXP_NOTNAMECHAR, /* \C */ 95 XML_REGEXP_DECIMAL, /* \d */ 96 XML_REGEXP_NOTDECIMAL, /* \D */ 97 XML_REGEXP_REALCHAR, /* \w */ 98 XML_REGEXP_NOTREALCHAR, /* \W */ 99 XML_REGEXP_LETTER = 100, 100 XML_REGEXP_LETTER_UPPERCASE, 101 XML_REGEXP_LETTER_LOWERCASE, 102 XML_REGEXP_LETTER_TITLECASE, 103 XML_REGEXP_LETTER_MODIFIER, 104 XML_REGEXP_LETTER_OTHERS, 105 XML_REGEXP_MARK, 106 XML_REGEXP_MARK_NONSPACING, 107 XML_REGEXP_MARK_SPACECOMBINING, 108 XML_REGEXP_MARK_ENCLOSING, 109 XML_REGEXP_NUMBER, 110 XML_REGEXP_NUMBER_DECIMAL, 111 XML_REGEXP_NUMBER_LETTER, 112 XML_REGEXP_NUMBER_OTHERS, 113 XML_REGEXP_PUNCT, 114 XML_REGEXP_PUNCT_CONNECTOR, 115 XML_REGEXP_PUNCT_DASH, 116 XML_REGEXP_PUNCT_OPEN, 117 XML_REGEXP_PUNCT_CLOSE, 118 XML_REGEXP_PUNCT_INITQUOTE, 119 XML_REGEXP_PUNCT_FINQUOTE, 120 XML_REGEXP_PUNCT_OTHERS, 121 XML_REGEXP_SEPAR, 122 XML_REGEXP_SEPAR_SPACE, 123 XML_REGEXP_SEPAR_LINE, 124 XML_REGEXP_SEPAR_PARA, 125 XML_REGEXP_SYMBOL, 126 XML_REGEXP_SYMBOL_MATH, 127 XML_REGEXP_SYMBOL_CURRENCY, 128 XML_REGEXP_SYMBOL_MODIFIER, 129 XML_REGEXP_SYMBOL_OTHERS, 130 XML_REGEXP_OTHER, 131 XML_REGEXP_OTHER_CONTROL, 132 XML_REGEXP_OTHER_FORMAT, 133 XML_REGEXP_OTHER_PRIVATE, 134 XML_REGEXP_OTHER_NA, 135 XML_REGEXP_BLOCK_NAME 136 } xmlRegAtomType; 137 138 typedef enum { 139 XML_REGEXP_QUANT_EPSILON = 1, 140 XML_REGEXP_QUANT_ONCE, 141 XML_REGEXP_QUANT_OPT, 142 XML_REGEXP_QUANT_MULT, 143 XML_REGEXP_QUANT_PLUS, 144 XML_REGEXP_QUANT_ONCEONLY, 145 XML_REGEXP_QUANT_ALL, 146 XML_REGEXP_QUANT_RANGE 147 } xmlRegQuantType; 148 149 typedef enum { 150 XML_REGEXP_START_STATE = 1, 151 XML_REGEXP_FINAL_STATE, 152 XML_REGEXP_TRANS_STATE, 153 XML_REGEXP_SINK_STATE, 154 XML_REGEXP_UNREACH_STATE 155 } xmlRegStateType; 156 157 typedef enum { 158 XML_REGEXP_MARK_NORMAL = 0, 159 XML_REGEXP_MARK_START, 160 XML_REGEXP_MARK_VISITED 161 } xmlRegMarkedType; 162 163 typedef struct _xmlRegRange xmlRegRange; 164 typedef xmlRegRange *xmlRegRangePtr; 165 166 struct _xmlRegRange { 167 int neg; /* 0 normal, 1 not, 2 exclude */ 168 xmlRegAtomType type; 169 int start; 170 int end; 171 xmlChar *blockName; 172 }; 173 174 typedef struct _xmlRegAtom xmlRegAtom; 175 typedef xmlRegAtom *xmlRegAtomPtr; 176 177 typedef struct _xmlAutomataState xmlRegState; 178 typedef xmlRegState *xmlRegStatePtr; 179 180 struct _xmlRegAtom { 181 int no; 182 xmlRegAtomType type; 183 xmlRegQuantType quant; 184 int min; 185 int max; 186 187 void *valuep; 188 void *valuep2; 189 int neg; 190 int codepoint; 191 xmlRegStatePtr start; 192 xmlRegStatePtr start0; 193 xmlRegStatePtr stop; 194 int maxRanges; 195 int nbRanges; 196 xmlRegRangePtr *ranges; 197 void *data; 198 }; 199 200 typedef struct _xmlRegCounter xmlRegCounter; 201 typedef xmlRegCounter *xmlRegCounterPtr; 202 203 struct _xmlRegCounter { 204 int min; 205 int max; 206 }; 207 208 typedef struct _xmlRegTrans xmlRegTrans; 209 typedef xmlRegTrans *xmlRegTransPtr; 210 211 struct _xmlRegTrans { 212 xmlRegAtomPtr atom; 213 int to; 214 int counter; 215 int count; 216 int nd; 217 }; 218 219 struct _xmlAutomataState { 220 xmlRegStateType type; 221 xmlRegMarkedType mark; 222 xmlRegMarkedType reached; 223 int no; 224 int maxTrans; 225 int nbTrans; 226 xmlRegTrans *trans; 227 /* knowing states ponting to us can speed things up */ 228 int maxTransTo; 229 int nbTransTo; 230 int *transTo; 231 }; 232 233 typedef struct _xmlAutomata xmlRegParserCtxt; 234 typedef xmlRegParserCtxt *xmlRegParserCtxtPtr; 235 236 #define AM_AUTOMATA_RNG 1 237 238 struct _xmlAutomata { 239 xmlChar *string; 240 xmlChar *cur; 241 242 int error; 243 int neg; 244 245 xmlRegStatePtr start; 246 xmlRegStatePtr end; 247 xmlRegStatePtr state; 248 249 xmlRegAtomPtr atom; 250 251 int maxAtoms; 252 int nbAtoms; 253 xmlRegAtomPtr *atoms; 254 255 int maxStates; 256 int nbStates; 257 xmlRegStatePtr *states; 258 259 int maxCounters; 260 int nbCounters; 261 xmlRegCounter *counters; 262 263 int determinist; 264 int negs; 265 int flags; 266 }; 267 268 struct _xmlRegexp { 269 xmlChar *string; 270 int nbStates; 271 xmlRegStatePtr *states; 272 int nbAtoms; 273 xmlRegAtomPtr *atoms; 274 int nbCounters; 275 xmlRegCounter *counters; 276 int determinist; 277 int flags; 278 /* 279 * That's the compact form for determinists automatas 280 */ 281 int nbstates; 282 int *compact; 283 void **transdata; 284 int nbstrings; 285 xmlChar **stringMap; 286 }; 287 288 typedef struct _xmlRegExecRollback xmlRegExecRollback; 289 typedef xmlRegExecRollback *xmlRegExecRollbackPtr; 290 291 struct _xmlRegExecRollback { 292 xmlRegStatePtr state;/* the current state */ 293 int index; /* the index in the input stack */ 294 int nextbranch; /* the next transition to explore in that state */ 295 int *counts; /* save the automata state if it has some */ 296 }; 297 298 typedef struct _xmlRegInputToken xmlRegInputToken; 299 typedef xmlRegInputToken *xmlRegInputTokenPtr; 300 301 struct _xmlRegInputToken { 302 xmlChar *value; 303 void *data; 304 }; 305 306 struct _xmlRegExecCtxt { 307 int status; /* execution status != 0 indicate an error */ 308 int determinist; /* did we find an indeterministic behaviour */ 309 xmlRegexpPtr comp; /* the compiled regexp */ 310 xmlRegExecCallbacks callback; 311 void *data; 312 313 xmlRegStatePtr state;/* the current state */ 314 int transno; /* the current transition on that state */ 315 int transcount; /* the number of chars in char counted transitions */ 316 317 /* 318 * A stack of rollback states 319 */ 320 int maxRollbacks; 321 int nbRollbacks; 322 xmlRegExecRollback *rollbacks; 323 324 /* 325 * The state of the automata if any 326 */ 327 int *counts; 328 329 /* 330 * The input stack 331 */ 332 int inputStackMax; 333 int inputStackNr; 334 int index; 335 int *charStack; 336 const xmlChar *inputString; /* when operating on characters */ 337 xmlRegInputTokenPtr inputStack;/* when operating on strings */ 338 339 /* 340 * error handling 341 */ 342 int errStateNo; /* the error state number */ 343 xmlRegStatePtr errState; /* the error state */ 344 xmlChar *errString; /* the string raising the error */ 345 int *errCounts; /* counters at the error state */ 346 int nbPush; 347 }; 348 349 #define REGEXP_ALL_COUNTER 0x123456 350 #define REGEXP_ALL_LAX_COUNTER 0x123457 351 352 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); 353 static void xmlRegFreeState(xmlRegStatePtr state); 354 static void xmlRegFreeAtom(xmlRegAtomPtr atom); 355 static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr); 356 static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint); 357 static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, 358 int neg, int start, int end, const xmlChar *blockName); 359 360 void xmlAutomataSetFlags(xmlAutomataPtr am, int flags); 361 362 /************************************************************************ 363 * * 364 * Regexp memory error handler * 365 * * 366 ************************************************************************/ 367 /** 368 * xmlRegexpErrMemory: 369 * @extra: extra information 370 * 371 * Handle an out of memory condition 372 */ 373 static void 374 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra) 375 { 376 const char *regexp = NULL; 377 if (ctxt != NULL) { 378 regexp = (const char *) ctxt->string; 379 ctxt->error = XML_ERR_NO_MEMORY; 380 } 381 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 382 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 383 regexp, NULL, 0, 0, 384 "Memory allocation failed : %s\n", extra); 385 } 386 387 /** 388 * xmlRegexpErrCompile: 389 * @extra: extra information 390 * 391 * Handle a compilation failure 392 */ 393 static void 394 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra) 395 { 396 const char *regexp = NULL; 397 int idx = 0; 398 399 if (ctxt != NULL) { 400 regexp = (const char *) ctxt->string; 401 idx = ctxt->cur - ctxt->string; 402 ctxt->error = XML_REGEXP_COMPILE_ERROR; 403 } 404 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 405 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra, 406 regexp, NULL, idx, 0, 407 "failed to compile: %s\n", extra); 408 } 409 410 /************************************************************************ 411 * * 412 * Allocation/Deallocation * 413 * * 414 ************************************************************************/ 415 416 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt); 417 /** 418 * xmlRegEpxFromParse: 419 * @ctxt: the parser context used to build it 420 * 421 * Allocate a new regexp and fill it with the result from the parser 422 * 423 * Returns the new regexp or NULL in case of error 424 */ 425 static xmlRegexpPtr 426 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { 427 xmlRegexpPtr ret; 428 429 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp)); 430 if (ret == NULL) { 431 xmlRegexpErrMemory(ctxt, "compiling regexp"); 432 return(NULL); 433 } 434 memset(ret, 0, sizeof(xmlRegexp)); 435 ret->string = ctxt->string; 436 ret->nbStates = ctxt->nbStates; 437 ret->states = ctxt->states; 438 ret->nbAtoms = ctxt->nbAtoms; 439 ret->atoms = ctxt->atoms; 440 ret->nbCounters = ctxt->nbCounters; 441 ret->counters = ctxt->counters; 442 ret->determinist = ctxt->determinist; 443 ret->flags = ctxt->flags; 444 if (ret->determinist == -1) { 445 xmlRegexpIsDeterminist(ret); 446 } 447 448 if ((ret->determinist != 0) && 449 (ret->nbCounters == 0) && 450 (ctxt->negs == 0) && 451 (ret->atoms != NULL) && 452 (ret->atoms[0] != NULL) && 453 (ret->atoms[0]->type == XML_REGEXP_STRING)) { 454 int i, j, nbstates = 0, nbatoms = 0; 455 int *stateRemap; 456 int *stringRemap; 457 int *transitions; 458 void **transdata; 459 xmlChar **stringMap; 460 xmlChar *value; 461 462 /* 463 * Switch to a compact representation 464 * 1/ counting the effective number of states left 465 * 2/ counting the unique number of atoms, and check that 466 * they are all of the string type 467 * 3/ build a table state x atom for the transitions 468 */ 469 470 stateRemap = xmlMalloc(ret->nbStates * sizeof(int)); 471 if (stateRemap == NULL) { 472 xmlRegexpErrMemory(ctxt, "compiling regexp"); 473 xmlFree(ret); 474 return(NULL); 475 } 476 for (i = 0;i < ret->nbStates;i++) { 477 if (ret->states[i] != NULL) { 478 stateRemap[i] = nbstates; 479 nbstates++; 480 } else { 481 stateRemap[i] = -1; 482 } 483 } 484 #ifdef DEBUG_COMPACTION 485 printf("Final: %d states\n", nbstates); 486 #endif 487 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *)); 488 if (stringMap == NULL) { 489 xmlRegexpErrMemory(ctxt, "compiling regexp"); 490 xmlFree(stateRemap); 491 xmlFree(ret); 492 return(NULL); 493 } 494 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int)); 495 if (stringRemap == NULL) { 496 xmlRegexpErrMemory(ctxt, "compiling regexp"); 497 xmlFree(stringMap); 498 xmlFree(stateRemap); 499 xmlFree(ret); 500 return(NULL); 501 } 502 for (i = 0;i < ret->nbAtoms;i++) { 503 if ((ret->atoms[i]->type == XML_REGEXP_STRING) && 504 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) { 505 value = ret->atoms[i]->valuep; 506 for (j = 0;j < nbatoms;j++) { 507 if (xmlStrEqual(stringMap[j], value)) { 508 stringRemap[i] = j; 509 break; 510 } 511 } 512 if (j >= nbatoms) { 513 stringRemap[i] = nbatoms; 514 stringMap[nbatoms] = xmlStrdup(value); 515 if (stringMap[nbatoms] == NULL) { 516 for (i = 0;i < nbatoms;i++) 517 xmlFree(stringMap[i]); 518 xmlFree(stringRemap); 519 xmlFree(stringMap); 520 xmlFree(stateRemap); 521 xmlFree(ret); 522 return(NULL); 523 } 524 nbatoms++; 525 } 526 } else { 527 xmlFree(stateRemap); 528 xmlFree(stringRemap); 529 for (i = 0;i < nbatoms;i++) 530 xmlFree(stringMap[i]); 531 xmlFree(stringMap); 532 xmlFree(ret); 533 return(NULL); 534 } 535 } 536 #ifdef DEBUG_COMPACTION 537 printf("Final: %d atoms\n", nbatoms); 538 #endif 539 transitions = (int *) xmlMalloc((nbstates + 1) * 540 (nbatoms + 1) * sizeof(int)); 541 if (transitions == NULL) { 542 xmlFree(stateRemap); 543 xmlFree(stringRemap); 544 xmlFree(stringMap); 545 xmlFree(ret); 546 return(NULL); 547 } 548 memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int)); 549 550 /* 551 * Allocate the transition table. The first entry for each 552 * state corresponds to the state type. 553 */ 554 transdata = NULL; 555 556 for (i = 0;i < ret->nbStates;i++) { 557 int stateno, atomno, targetno, prev; 558 xmlRegStatePtr state; 559 xmlRegTransPtr trans; 560 561 stateno = stateRemap[i]; 562 if (stateno == -1) 563 continue; 564 state = ret->states[i]; 565 566 transitions[stateno * (nbatoms + 1)] = state->type; 567 568 for (j = 0;j < state->nbTrans;j++) { 569 trans = &(state->trans[j]); 570 if ((trans->to == -1) || (trans->atom == NULL)) 571 continue; 572 atomno = stringRemap[trans->atom->no]; 573 if ((trans->atom->data != NULL) && (transdata == NULL)) { 574 transdata = (void **) xmlMalloc(nbstates * nbatoms * 575 sizeof(void *)); 576 if (transdata != NULL) 577 memset(transdata, 0, 578 nbstates * nbatoms * sizeof(void *)); 579 else { 580 xmlRegexpErrMemory(ctxt, "compiling regexp"); 581 break; 582 } 583 } 584 targetno = stateRemap[trans->to]; 585 /* 586 * if the same atom can generate transitions to 2 different 587 * states then it means the automata is not determinist and 588 * the compact form can't be used ! 589 */ 590 prev = transitions[stateno * (nbatoms + 1) + atomno + 1]; 591 if (prev != 0) { 592 if (prev != targetno + 1) { 593 ret->determinist = 0; 594 #ifdef DEBUG_COMPACTION 595 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n", 596 i, j, trans->atom->no, trans->to, atomno, targetno); 597 printf(" previous to is %d\n", prev); 598 #endif 599 if (transdata != NULL) 600 xmlFree(transdata); 601 xmlFree(transitions); 602 xmlFree(stateRemap); 603 xmlFree(stringRemap); 604 for (i = 0;i < nbatoms;i++) 605 xmlFree(stringMap[i]); 606 xmlFree(stringMap); 607 goto not_determ; 608 } 609 } else { 610 #if 0 611 printf("State %d trans %d: atom %d to %d : %d to %d\n", 612 i, j, trans->atom->no, trans->to, atomno, targetno); 613 #endif 614 transitions[stateno * (nbatoms + 1) + atomno + 1] = 615 targetno + 1; /* to avoid 0 */ 616 if (transdata != NULL) 617 transdata[stateno * nbatoms + atomno] = 618 trans->atom->data; 619 } 620 } 621 } 622 ret->determinist = 1; 623 #ifdef DEBUG_COMPACTION 624 /* 625 * Debug 626 */ 627 for (i = 0;i < nbstates;i++) { 628 for (j = 0;j < nbatoms + 1;j++) { 629 printf("%02d ", transitions[i * (nbatoms + 1) + j]); 630 } 631 printf("\n"); 632 } 633 printf("\n"); 634 #endif 635 /* 636 * Cleanup of the old data 637 */ 638 if (ret->states != NULL) { 639 for (i = 0;i < ret->nbStates;i++) 640 xmlRegFreeState(ret->states[i]); 641 xmlFree(ret->states); 642 } 643 ret->states = NULL; 644 ret->nbStates = 0; 645 if (ret->atoms != NULL) { 646 for (i = 0;i < ret->nbAtoms;i++) 647 xmlRegFreeAtom(ret->atoms[i]); 648 xmlFree(ret->atoms); 649 } 650 ret->atoms = NULL; 651 ret->nbAtoms = 0; 652 653 ret->compact = transitions; 654 ret->transdata = transdata; 655 ret->stringMap = stringMap; 656 ret->nbstrings = nbatoms; 657 ret->nbstates = nbstates; 658 xmlFree(stateRemap); 659 xmlFree(stringRemap); 660 } 661 not_determ: 662 ctxt->string = NULL; 663 ctxt->nbStates = 0; 664 ctxt->states = NULL; 665 ctxt->nbAtoms = 0; 666 ctxt->atoms = NULL; 667 ctxt->nbCounters = 0; 668 ctxt->counters = NULL; 669 return(ret); 670 } 671 672 /** 673 * xmlRegNewParserCtxt: 674 * @string: the string to parse 675 * 676 * Allocate a new regexp parser context 677 * 678 * Returns the new context or NULL in case of error 679 */ 680 static xmlRegParserCtxtPtr 681 xmlRegNewParserCtxt(const xmlChar *string) { 682 xmlRegParserCtxtPtr ret; 683 684 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt)); 685 if (ret == NULL) 686 return(NULL); 687 memset(ret, 0, sizeof(xmlRegParserCtxt)); 688 if (string != NULL) 689 ret->string = xmlStrdup(string); 690 ret->cur = ret->string; 691 ret->neg = 0; 692 ret->negs = 0; 693 ret->error = 0; 694 ret->determinist = -1; 695 return(ret); 696 } 697 698 /** 699 * xmlRegNewRange: 700 * @ctxt: the regexp parser context 701 * @neg: is that negative 702 * @type: the type of range 703 * @start: the start codepoint 704 * @end: the end codepoint 705 * 706 * Allocate a new regexp range 707 * 708 * Returns the new range or NULL in case of error 709 */ 710 static xmlRegRangePtr 711 xmlRegNewRange(xmlRegParserCtxtPtr ctxt, 712 int neg, xmlRegAtomType type, int start, int end) { 713 xmlRegRangePtr ret; 714 715 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange)); 716 if (ret == NULL) { 717 xmlRegexpErrMemory(ctxt, "allocating range"); 718 return(NULL); 719 } 720 ret->neg = neg; 721 ret->type = type; 722 ret->start = start; 723 ret->end = end; 724 return(ret); 725 } 726 727 /** 728 * xmlRegFreeRange: 729 * @range: the regexp range 730 * 731 * Free a regexp range 732 */ 733 static void 734 xmlRegFreeRange(xmlRegRangePtr range) { 735 if (range == NULL) 736 return; 737 738 if (range->blockName != NULL) 739 xmlFree(range->blockName); 740 xmlFree(range); 741 } 742 743 /** 744 * xmlRegCopyRange: 745 * @range: the regexp range 746 * 747 * Copy a regexp range 748 * 749 * Returns the new copy or NULL in case of error. 750 */ 751 static xmlRegRangePtr 752 xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) { 753 xmlRegRangePtr ret; 754 755 if (range == NULL) 756 return(NULL); 757 758 ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start, 759 range->end); 760 if (ret == NULL) 761 return(NULL); 762 if (range->blockName != NULL) { 763 ret->blockName = xmlStrdup(range->blockName); 764 if (ret->blockName == NULL) { 765 xmlRegexpErrMemory(ctxt, "allocating range"); 766 xmlRegFreeRange(ret); 767 return(NULL); 768 } 769 } 770 return(ret); 771 } 772 773 /** 774 * xmlRegNewAtom: 775 * @ctxt: the regexp parser context 776 * @type: the type of atom 777 * 778 * Allocate a new atom 779 * 780 * Returns the new atom or NULL in case of error 781 */ 782 static xmlRegAtomPtr 783 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) { 784 xmlRegAtomPtr ret; 785 786 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 787 if (ret == NULL) { 788 xmlRegexpErrMemory(ctxt, "allocating atom"); 789 return(NULL); 790 } 791 memset(ret, 0, sizeof(xmlRegAtom)); 792 ret->type = type; 793 ret->quant = XML_REGEXP_QUANT_ONCE; 794 ret->min = 0; 795 ret->max = 0; 796 return(ret); 797 } 798 799 /** 800 * xmlRegFreeAtom: 801 * @atom: the regexp atom 802 * 803 * Free a regexp atom 804 */ 805 static void 806 xmlRegFreeAtom(xmlRegAtomPtr atom) { 807 int i; 808 809 if (atom == NULL) 810 return; 811 812 for (i = 0;i < atom->nbRanges;i++) 813 xmlRegFreeRange(atom->ranges[i]); 814 if (atom->ranges != NULL) 815 xmlFree(atom->ranges); 816 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL)) 817 xmlFree(atom->valuep); 818 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL)) 819 xmlFree(atom->valuep2); 820 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL)) 821 xmlFree(atom->valuep); 822 xmlFree(atom); 823 } 824 825 /** 826 * xmlRegCopyAtom: 827 * @ctxt: the regexp parser context 828 * @atom: the oiginal atom 829 * 830 * Allocate a new regexp range 831 * 832 * Returns the new atom or NULL in case of error 833 */ 834 static xmlRegAtomPtr 835 xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 836 xmlRegAtomPtr ret; 837 838 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 839 if (ret == NULL) { 840 xmlRegexpErrMemory(ctxt, "copying atom"); 841 return(NULL); 842 } 843 memset(ret, 0, sizeof(xmlRegAtom)); 844 ret->type = atom->type; 845 ret->quant = atom->quant; 846 ret->min = atom->min; 847 ret->max = atom->max; 848 if (atom->nbRanges > 0) { 849 int i; 850 851 ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) * 852 atom->nbRanges); 853 if (ret->ranges == NULL) { 854 xmlRegexpErrMemory(ctxt, "copying atom"); 855 goto error; 856 } 857 for (i = 0;i < atom->nbRanges;i++) { 858 ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]); 859 if (ret->ranges[i] == NULL) 860 goto error; 861 ret->nbRanges = i + 1; 862 } 863 } 864 return(ret); 865 866 error: 867 xmlRegFreeAtom(ret); 868 return(NULL); 869 } 870 871 static xmlRegStatePtr 872 xmlRegNewState(xmlRegParserCtxtPtr ctxt) { 873 xmlRegStatePtr ret; 874 875 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState)); 876 if (ret == NULL) { 877 xmlRegexpErrMemory(ctxt, "allocating state"); 878 return(NULL); 879 } 880 memset(ret, 0, sizeof(xmlRegState)); 881 ret->type = XML_REGEXP_TRANS_STATE; 882 ret->mark = XML_REGEXP_MARK_NORMAL; 883 return(ret); 884 } 885 886 /** 887 * xmlRegFreeState: 888 * @state: the regexp state 889 * 890 * Free a regexp state 891 */ 892 static void 893 xmlRegFreeState(xmlRegStatePtr state) { 894 if (state == NULL) 895 return; 896 897 if (state->trans != NULL) 898 xmlFree(state->trans); 899 if (state->transTo != NULL) 900 xmlFree(state->transTo); 901 xmlFree(state); 902 } 903 904 /** 905 * xmlRegFreeParserCtxt: 906 * @ctxt: the regexp parser context 907 * 908 * Free a regexp parser context 909 */ 910 static void 911 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) { 912 int i; 913 if (ctxt == NULL) 914 return; 915 916 if (ctxt->string != NULL) 917 xmlFree(ctxt->string); 918 if (ctxt->states != NULL) { 919 for (i = 0;i < ctxt->nbStates;i++) 920 xmlRegFreeState(ctxt->states[i]); 921 xmlFree(ctxt->states); 922 } 923 if (ctxt->atoms != NULL) { 924 for (i = 0;i < ctxt->nbAtoms;i++) 925 xmlRegFreeAtom(ctxt->atoms[i]); 926 xmlFree(ctxt->atoms); 927 } 928 if (ctxt->counters != NULL) 929 xmlFree(ctxt->counters); 930 xmlFree(ctxt); 931 } 932 933 /************************************************************************ 934 * * 935 * Display of Data structures * 936 * * 937 ************************************************************************/ 938 939 static void 940 xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) { 941 switch (type) { 942 case XML_REGEXP_EPSILON: 943 fprintf(output, "epsilon "); break; 944 case XML_REGEXP_CHARVAL: 945 fprintf(output, "charval "); break; 946 case XML_REGEXP_RANGES: 947 fprintf(output, "ranges "); break; 948 case XML_REGEXP_SUBREG: 949 fprintf(output, "subexpr "); break; 950 case XML_REGEXP_STRING: 951 fprintf(output, "string "); break; 952 case XML_REGEXP_ANYCHAR: 953 fprintf(output, "anychar "); break; 954 case XML_REGEXP_ANYSPACE: 955 fprintf(output, "anyspace "); break; 956 case XML_REGEXP_NOTSPACE: 957 fprintf(output, "notspace "); break; 958 case XML_REGEXP_INITNAME: 959 fprintf(output, "initname "); break; 960 case XML_REGEXP_NOTINITNAME: 961 fprintf(output, "notinitname "); break; 962 case XML_REGEXP_NAMECHAR: 963 fprintf(output, "namechar "); break; 964 case XML_REGEXP_NOTNAMECHAR: 965 fprintf(output, "notnamechar "); break; 966 case XML_REGEXP_DECIMAL: 967 fprintf(output, "decimal "); break; 968 case XML_REGEXP_NOTDECIMAL: 969 fprintf(output, "notdecimal "); break; 970 case XML_REGEXP_REALCHAR: 971 fprintf(output, "realchar "); break; 972 case XML_REGEXP_NOTREALCHAR: 973 fprintf(output, "notrealchar "); break; 974 case XML_REGEXP_LETTER: 975 fprintf(output, "LETTER "); break; 976 case XML_REGEXP_LETTER_UPPERCASE: 977 fprintf(output, "LETTER_UPPERCASE "); break; 978 case XML_REGEXP_LETTER_LOWERCASE: 979 fprintf(output, "LETTER_LOWERCASE "); break; 980 case XML_REGEXP_LETTER_TITLECASE: 981 fprintf(output, "LETTER_TITLECASE "); break; 982 case XML_REGEXP_LETTER_MODIFIER: 983 fprintf(output, "LETTER_MODIFIER "); break; 984 case XML_REGEXP_LETTER_OTHERS: 985 fprintf(output, "LETTER_OTHERS "); break; 986 case XML_REGEXP_MARK: 987 fprintf(output, "MARK "); break; 988 case XML_REGEXP_MARK_NONSPACING: 989 fprintf(output, "MARK_NONSPACING "); break; 990 case XML_REGEXP_MARK_SPACECOMBINING: 991 fprintf(output, "MARK_SPACECOMBINING "); break; 992 case XML_REGEXP_MARK_ENCLOSING: 993 fprintf(output, "MARK_ENCLOSING "); break; 994 case XML_REGEXP_NUMBER: 995 fprintf(output, "NUMBER "); break; 996 case XML_REGEXP_NUMBER_DECIMAL: 997 fprintf(output, "NUMBER_DECIMAL "); break; 998 case XML_REGEXP_NUMBER_LETTER: 999 fprintf(output, "NUMBER_LETTER "); break; 1000 case XML_REGEXP_NUMBER_OTHERS: 1001 fprintf(output, "NUMBER_OTHERS "); break; 1002 case XML_REGEXP_PUNCT: 1003 fprintf(output, "PUNCT "); break; 1004 case XML_REGEXP_PUNCT_CONNECTOR: 1005 fprintf(output, "PUNCT_CONNECTOR "); break; 1006 case XML_REGEXP_PUNCT_DASH: 1007 fprintf(output, "PUNCT_DASH "); break; 1008 case XML_REGEXP_PUNCT_OPEN: 1009 fprintf(output, "PUNCT_OPEN "); break; 1010 case XML_REGEXP_PUNCT_CLOSE: 1011 fprintf(output, "PUNCT_CLOSE "); break; 1012 case XML_REGEXP_PUNCT_INITQUOTE: 1013 fprintf(output, "PUNCT_INITQUOTE "); break; 1014 case XML_REGEXP_PUNCT_FINQUOTE: 1015 fprintf(output, "PUNCT_FINQUOTE "); break; 1016 case XML_REGEXP_PUNCT_OTHERS: 1017 fprintf(output, "PUNCT_OTHERS "); break; 1018 case XML_REGEXP_SEPAR: 1019 fprintf(output, "SEPAR "); break; 1020 case XML_REGEXP_SEPAR_SPACE: 1021 fprintf(output, "SEPAR_SPACE "); break; 1022 case XML_REGEXP_SEPAR_LINE: 1023 fprintf(output, "SEPAR_LINE "); break; 1024 case XML_REGEXP_SEPAR_PARA: 1025 fprintf(output, "SEPAR_PARA "); break; 1026 case XML_REGEXP_SYMBOL: 1027 fprintf(output, "SYMBOL "); break; 1028 case XML_REGEXP_SYMBOL_MATH: 1029 fprintf(output, "SYMBOL_MATH "); break; 1030 case XML_REGEXP_SYMBOL_CURRENCY: 1031 fprintf(output, "SYMBOL_CURRENCY "); break; 1032 case XML_REGEXP_SYMBOL_MODIFIER: 1033 fprintf(output, "SYMBOL_MODIFIER "); break; 1034 case XML_REGEXP_SYMBOL_OTHERS: 1035 fprintf(output, "SYMBOL_OTHERS "); break; 1036 case XML_REGEXP_OTHER: 1037 fprintf(output, "OTHER "); break; 1038 case XML_REGEXP_OTHER_CONTROL: 1039 fprintf(output, "OTHER_CONTROL "); break; 1040 case XML_REGEXP_OTHER_FORMAT: 1041 fprintf(output, "OTHER_FORMAT "); break; 1042 case XML_REGEXP_OTHER_PRIVATE: 1043 fprintf(output, "OTHER_PRIVATE "); break; 1044 case XML_REGEXP_OTHER_NA: 1045 fprintf(output, "OTHER_NA "); break; 1046 case XML_REGEXP_BLOCK_NAME: 1047 fprintf(output, "BLOCK "); break; 1048 } 1049 } 1050 1051 static void 1052 xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) { 1053 switch (type) { 1054 case XML_REGEXP_QUANT_EPSILON: 1055 fprintf(output, "epsilon "); break; 1056 case XML_REGEXP_QUANT_ONCE: 1057 fprintf(output, "once "); break; 1058 case XML_REGEXP_QUANT_OPT: 1059 fprintf(output, "? "); break; 1060 case XML_REGEXP_QUANT_MULT: 1061 fprintf(output, "* "); break; 1062 case XML_REGEXP_QUANT_PLUS: 1063 fprintf(output, "+ "); break; 1064 case XML_REGEXP_QUANT_RANGE: 1065 fprintf(output, "range "); break; 1066 case XML_REGEXP_QUANT_ONCEONLY: 1067 fprintf(output, "onceonly "); break; 1068 case XML_REGEXP_QUANT_ALL: 1069 fprintf(output, "all "); break; 1070 } 1071 } 1072 static void 1073 xmlRegPrintRange(FILE *output, xmlRegRangePtr range) { 1074 fprintf(output, " range: "); 1075 if (range->neg) 1076 fprintf(output, "negative "); 1077 xmlRegPrintAtomType(output, range->type); 1078 fprintf(output, "%c - %c\n", range->start, range->end); 1079 } 1080 1081 static void 1082 xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) { 1083 fprintf(output, " atom: "); 1084 if (atom == NULL) { 1085 fprintf(output, "NULL\n"); 1086 return; 1087 } 1088 if (atom->neg) 1089 fprintf(output, "not "); 1090 xmlRegPrintAtomType(output, atom->type); 1091 xmlRegPrintQuantType(output, atom->quant); 1092 if (atom->quant == XML_REGEXP_QUANT_RANGE) 1093 fprintf(output, "%d-%d ", atom->min, atom->max); 1094 if (atom->type == XML_REGEXP_STRING) 1095 fprintf(output, "'%s' ", (char *) atom->valuep); 1096 if (atom->type == XML_REGEXP_CHARVAL) 1097 fprintf(output, "char %c\n", atom->codepoint); 1098 else if (atom->type == XML_REGEXP_RANGES) { 1099 int i; 1100 fprintf(output, "%d entries\n", atom->nbRanges); 1101 for (i = 0; i < atom->nbRanges;i++) 1102 xmlRegPrintRange(output, atom->ranges[i]); 1103 } else if (atom->type == XML_REGEXP_SUBREG) { 1104 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no); 1105 } else { 1106 fprintf(output, "\n"); 1107 } 1108 } 1109 1110 static void 1111 xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) { 1112 fprintf(output, " trans: "); 1113 if (trans == NULL) { 1114 fprintf(output, "NULL\n"); 1115 return; 1116 } 1117 if (trans->to < 0) { 1118 fprintf(output, "removed\n"); 1119 return; 1120 } 1121 if (trans->nd != 0) { 1122 if (trans->nd == 2) 1123 fprintf(output, "last not determinist, "); 1124 else 1125 fprintf(output, "not determinist, "); 1126 } 1127 if (trans->counter >= 0) { 1128 fprintf(output, "counted %d, ", trans->counter); 1129 } 1130 if (trans->count == REGEXP_ALL_COUNTER) { 1131 fprintf(output, "all transition, "); 1132 } else if (trans->count >= 0) { 1133 fprintf(output, "count based %d, ", trans->count); 1134 } 1135 if (trans->atom == NULL) { 1136 fprintf(output, "epsilon to %d\n", trans->to); 1137 return; 1138 } 1139 if (trans->atom->type == XML_REGEXP_CHARVAL) 1140 fprintf(output, "char %c ", trans->atom->codepoint); 1141 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to); 1142 } 1143 1144 static void 1145 xmlRegPrintState(FILE *output, xmlRegStatePtr state) { 1146 int i; 1147 1148 fprintf(output, " state: "); 1149 if (state == NULL) { 1150 fprintf(output, "NULL\n"); 1151 return; 1152 } 1153 if (state->type == XML_REGEXP_START_STATE) 1154 fprintf(output, "START "); 1155 if (state->type == XML_REGEXP_FINAL_STATE) 1156 fprintf(output, "FINAL "); 1157 1158 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans); 1159 for (i = 0;i < state->nbTrans; i++) { 1160 xmlRegPrintTrans(output, &(state->trans[i])); 1161 } 1162 } 1163 1164 #ifdef DEBUG_REGEXP_GRAPH 1165 static void 1166 xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { 1167 int i; 1168 1169 fprintf(output, " ctxt: "); 1170 if (ctxt == NULL) { 1171 fprintf(output, "NULL\n"); 1172 return; 1173 } 1174 fprintf(output, "'%s' ", ctxt->string); 1175 if (ctxt->error) 1176 fprintf(output, "error "); 1177 if (ctxt->neg) 1178 fprintf(output, "neg "); 1179 fprintf(output, "\n"); 1180 fprintf(output, "%d atoms:\n", ctxt->nbAtoms); 1181 for (i = 0;i < ctxt->nbAtoms; i++) { 1182 fprintf(output, " %02d ", i); 1183 xmlRegPrintAtom(output, ctxt->atoms[i]); 1184 } 1185 if (ctxt->atom != NULL) { 1186 fprintf(output, "current atom:\n"); 1187 xmlRegPrintAtom(output, ctxt->atom); 1188 } 1189 fprintf(output, "%d states:", ctxt->nbStates); 1190 if (ctxt->start != NULL) 1191 fprintf(output, " start: %d", ctxt->start->no); 1192 if (ctxt->end != NULL) 1193 fprintf(output, " end: %d", ctxt->end->no); 1194 fprintf(output, "\n"); 1195 for (i = 0;i < ctxt->nbStates; i++) { 1196 xmlRegPrintState(output, ctxt->states[i]); 1197 } 1198 fprintf(output, "%d counters:\n", ctxt->nbCounters); 1199 for (i = 0;i < ctxt->nbCounters; i++) { 1200 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min, 1201 ctxt->counters[i].max); 1202 } 1203 } 1204 #endif 1205 1206 /************************************************************************ 1207 * * 1208 * Finite Automata structures manipulations * 1209 * * 1210 ************************************************************************/ 1211 1212 static void 1213 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom, 1214 int neg, xmlRegAtomType type, int start, int end, 1215 xmlChar *blockName) { 1216 xmlRegRangePtr range; 1217 1218 if (atom == NULL) { 1219 ERROR("add range: atom is NULL"); 1220 return; 1221 } 1222 if (atom->type != XML_REGEXP_RANGES) { 1223 ERROR("add range: atom is not ranges"); 1224 return; 1225 } 1226 if (atom->maxRanges == 0) { 1227 atom->maxRanges = 4; 1228 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges * 1229 sizeof(xmlRegRangePtr)); 1230 if (atom->ranges == NULL) { 1231 xmlRegexpErrMemory(ctxt, "adding ranges"); 1232 atom->maxRanges = 0; 1233 return; 1234 } 1235 } else if (atom->nbRanges >= atom->maxRanges) { 1236 xmlRegRangePtr *tmp; 1237 atom->maxRanges *= 2; 1238 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges * 1239 sizeof(xmlRegRangePtr)); 1240 if (tmp == NULL) { 1241 xmlRegexpErrMemory(ctxt, "adding ranges"); 1242 atom->maxRanges /= 2; 1243 return; 1244 } 1245 atom->ranges = tmp; 1246 } 1247 range = xmlRegNewRange(ctxt, neg, type, start, end); 1248 if (range == NULL) 1249 return; 1250 range->blockName = blockName; 1251 atom->ranges[atom->nbRanges++] = range; 1252 1253 } 1254 1255 static int 1256 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) { 1257 if (ctxt->maxCounters == 0) { 1258 ctxt->maxCounters = 4; 1259 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters * 1260 sizeof(xmlRegCounter)); 1261 if (ctxt->counters == NULL) { 1262 xmlRegexpErrMemory(ctxt, "allocating counter"); 1263 ctxt->maxCounters = 0; 1264 return(-1); 1265 } 1266 } else if (ctxt->nbCounters >= ctxt->maxCounters) { 1267 xmlRegCounter *tmp; 1268 ctxt->maxCounters *= 2; 1269 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters * 1270 sizeof(xmlRegCounter)); 1271 if (tmp == NULL) { 1272 xmlRegexpErrMemory(ctxt, "allocating counter"); 1273 ctxt->maxCounters /= 2; 1274 return(-1); 1275 } 1276 ctxt->counters = tmp; 1277 } 1278 ctxt->counters[ctxt->nbCounters].min = -1; 1279 ctxt->counters[ctxt->nbCounters].max = -1; 1280 return(ctxt->nbCounters++); 1281 } 1282 1283 static int 1284 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 1285 if (atom == NULL) { 1286 ERROR("atom push: atom is NULL"); 1287 return(-1); 1288 } 1289 if (ctxt->maxAtoms == 0) { 1290 ctxt->maxAtoms = 4; 1291 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms * 1292 sizeof(xmlRegAtomPtr)); 1293 if (ctxt->atoms == NULL) { 1294 xmlRegexpErrMemory(ctxt, "pushing atom"); 1295 ctxt->maxAtoms = 0; 1296 return(-1); 1297 } 1298 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) { 1299 xmlRegAtomPtr *tmp; 1300 ctxt->maxAtoms *= 2; 1301 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms * 1302 sizeof(xmlRegAtomPtr)); 1303 if (tmp == NULL) { 1304 xmlRegexpErrMemory(ctxt, "allocating counter"); 1305 ctxt->maxAtoms /= 2; 1306 return(-1); 1307 } 1308 ctxt->atoms = tmp; 1309 } 1310 atom->no = ctxt->nbAtoms; 1311 ctxt->atoms[ctxt->nbAtoms++] = atom; 1312 return(0); 1313 } 1314 1315 static void 1316 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target, 1317 int from) { 1318 if (target->maxTransTo == 0) { 1319 target->maxTransTo = 8; 1320 target->transTo = (int *) xmlMalloc(target->maxTransTo * 1321 sizeof(int)); 1322 if (target->transTo == NULL) { 1323 xmlRegexpErrMemory(ctxt, "adding transition"); 1324 target->maxTransTo = 0; 1325 return; 1326 } 1327 } else if (target->nbTransTo >= target->maxTransTo) { 1328 int *tmp; 1329 target->maxTransTo *= 2; 1330 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo * 1331 sizeof(int)); 1332 if (tmp == NULL) { 1333 xmlRegexpErrMemory(ctxt, "adding transition"); 1334 target->maxTransTo /= 2; 1335 return; 1336 } 1337 target->transTo = tmp; 1338 } 1339 target->transTo[target->nbTransTo] = from; 1340 target->nbTransTo++; 1341 } 1342 1343 static void 1344 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 1345 xmlRegAtomPtr atom, xmlRegStatePtr target, 1346 int counter, int count) { 1347 1348 int nrtrans; 1349 1350 if (state == NULL) { 1351 ERROR("add state: state is NULL"); 1352 return; 1353 } 1354 if (target == NULL) { 1355 ERROR("add state: target is NULL"); 1356 return; 1357 } 1358 /* 1359 * Other routines follow the philosophy 'When in doubt, add a transition' 1360 * so we check here whether such a transition is already present and, if 1361 * so, silently ignore this request. 1362 */ 1363 1364 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) { 1365 xmlRegTransPtr trans = &(state->trans[nrtrans]); 1366 if ((trans->atom == atom) && 1367 (trans->to == target->no) && 1368 (trans->counter == counter) && 1369 (trans->count == count)) { 1370 #ifdef DEBUG_REGEXP_GRAPH 1371 printf("Ignoring duplicate transition from %d to %d\n", 1372 state->no, target->no); 1373 #endif 1374 return; 1375 } 1376 } 1377 1378 if (state->maxTrans == 0) { 1379 state->maxTrans = 8; 1380 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans * 1381 sizeof(xmlRegTrans)); 1382 if (state->trans == NULL) { 1383 xmlRegexpErrMemory(ctxt, "adding transition"); 1384 state->maxTrans = 0; 1385 return; 1386 } 1387 } else if (state->nbTrans >= state->maxTrans) { 1388 xmlRegTrans *tmp; 1389 state->maxTrans *= 2; 1390 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans * 1391 sizeof(xmlRegTrans)); 1392 if (tmp == NULL) { 1393 xmlRegexpErrMemory(ctxt, "adding transition"); 1394 state->maxTrans /= 2; 1395 return; 1396 } 1397 state->trans = tmp; 1398 } 1399 #ifdef DEBUG_REGEXP_GRAPH 1400 printf("Add trans from %d to %d ", state->no, target->no); 1401 if (count == REGEXP_ALL_COUNTER) 1402 printf("all transition\n"); 1403 else if (count >= 0) 1404 printf("count based %d\n", count); 1405 else if (counter >= 0) 1406 printf("counted %d\n", counter); 1407 else if (atom == NULL) 1408 printf("epsilon transition\n"); 1409 else if (atom != NULL) 1410 xmlRegPrintAtom(stdout, atom); 1411 #endif 1412 1413 state->trans[state->nbTrans].atom = atom; 1414 state->trans[state->nbTrans].to = target->no; 1415 state->trans[state->nbTrans].counter = counter; 1416 state->trans[state->nbTrans].count = count; 1417 state->trans[state->nbTrans].nd = 0; 1418 state->nbTrans++; 1419 xmlRegStateAddTransTo(ctxt, target, state->no); 1420 } 1421 1422 static int 1423 xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { 1424 if (state == NULL) return(-1); 1425 if (ctxt->maxStates == 0) { 1426 ctxt->maxStates = 4; 1427 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates * 1428 sizeof(xmlRegStatePtr)); 1429 if (ctxt->states == NULL) { 1430 xmlRegexpErrMemory(ctxt, "adding state"); 1431 ctxt->maxStates = 0; 1432 return(-1); 1433 } 1434 } else if (ctxt->nbStates >= ctxt->maxStates) { 1435 xmlRegStatePtr *tmp; 1436 ctxt->maxStates *= 2; 1437 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates * 1438 sizeof(xmlRegStatePtr)); 1439 if (tmp == NULL) { 1440 xmlRegexpErrMemory(ctxt, "adding state"); 1441 ctxt->maxStates /= 2; 1442 return(-1); 1443 } 1444 ctxt->states = tmp; 1445 } 1446 state->no = ctxt->nbStates; 1447 ctxt->states[ctxt->nbStates++] = state; 1448 return(0); 1449 } 1450 1451 /** 1452 * xmlFAGenerateAllTransition: 1453 * @ctxt: a regexp parser context 1454 * @from: the from state 1455 * @to: the target state or NULL for building a new one 1456 * @lax: 1457 * 1458 */ 1459 static void 1460 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, 1461 xmlRegStatePtr from, xmlRegStatePtr to, 1462 int lax) { 1463 if (to == NULL) { 1464 to = xmlRegNewState(ctxt); 1465 xmlRegStatePush(ctxt, to); 1466 ctxt->state = to; 1467 } 1468 if (lax) 1469 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER); 1470 else 1471 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER); 1472 } 1473 1474 /** 1475 * xmlFAGenerateEpsilonTransition: 1476 * @ctxt: a regexp parser context 1477 * @from: the from state 1478 * @to: the target state or NULL for building a new one 1479 * 1480 */ 1481 static void 1482 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1483 xmlRegStatePtr from, xmlRegStatePtr to) { 1484 if (to == NULL) { 1485 to = xmlRegNewState(ctxt); 1486 xmlRegStatePush(ctxt, to); 1487 ctxt->state = to; 1488 } 1489 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1); 1490 } 1491 1492 /** 1493 * xmlFAGenerateCountedEpsilonTransition: 1494 * @ctxt: a regexp parser context 1495 * @from: the from state 1496 * @to: the target state or NULL for building a new one 1497 * counter: the counter for that transition 1498 * 1499 */ 1500 static void 1501 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1502 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1503 if (to == NULL) { 1504 to = xmlRegNewState(ctxt); 1505 xmlRegStatePush(ctxt, to); 1506 ctxt->state = to; 1507 } 1508 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1); 1509 } 1510 1511 /** 1512 * xmlFAGenerateCountedTransition: 1513 * @ctxt: a regexp parser context 1514 * @from: the from state 1515 * @to: the target state or NULL for building a new one 1516 * counter: the counter for that transition 1517 * 1518 */ 1519 static void 1520 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, 1521 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1522 if (to == NULL) { 1523 to = xmlRegNewState(ctxt); 1524 xmlRegStatePush(ctxt, to); 1525 ctxt->state = to; 1526 } 1527 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter); 1528 } 1529 1530 /** 1531 * xmlFAGenerateTransitions: 1532 * @ctxt: a regexp parser context 1533 * @from: the from state 1534 * @to: the target state or NULL for building a new one 1535 * @atom: the atom generating the transition 1536 * 1537 * Returns 0 if success and -1 in case of error. 1538 */ 1539 static int 1540 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, 1541 xmlRegStatePtr to, xmlRegAtomPtr atom) { 1542 xmlRegStatePtr end; 1543 1544 if (atom == NULL) { 1545 ERROR("genrate transition: atom == NULL"); 1546 return(-1); 1547 } 1548 if (atom->type == XML_REGEXP_SUBREG) { 1549 /* 1550 * this is a subexpression handling one should not need to 1551 * create a new node except for XML_REGEXP_QUANT_RANGE. 1552 */ 1553 if (xmlRegAtomPush(ctxt, atom) < 0) { 1554 return(-1); 1555 } 1556 if ((to != NULL) && (atom->stop != to) && 1557 (atom->quant != XML_REGEXP_QUANT_RANGE)) { 1558 /* 1559 * Generate an epsilon transition to link to the target 1560 */ 1561 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1562 #ifdef DV 1563 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) && 1564 (atom->quant != XML_REGEXP_QUANT_ONCE)) { 1565 to = xmlRegNewState(ctxt); 1566 xmlRegStatePush(ctxt, to); 1567 ctxt->state = to; 1568 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1569 #endif 1570 } 1571 switch (atom->quant) { 1572 case XML_REGEXP_QUANT_OPT: 1573 atom->quant = XML_REGEXP_QUANT_ONCE; 1574 /* 1575 * transition done to the state after end of atom. 1576 * 1. set transition from atom start to new state 1577 * 2. set transition from atom end to this state. 1578 */ 1579 if (to == NULL) { 1580 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0); 1581 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, 1582 ctxt->state); 1583 } else { 1584 xmlFAGenerateEpsilonTransition(ctxt, atom->start, to); 1585 } 1586 break; 1587 case XML_REGEXP_QUANT_MULT: 1588 atom->quant = XML_REGEXP_QUANT_ONCE; 1589 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1590 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1591 break; 1592 case XML_REGEXP_QUANT_PLUS: 1593 atom->quant = XML_REGEXP_QUANT_ONCE; 1594 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1595 break; 1596 case XML_REGEXP_QUANT_RANGE: { 1597 int counter; 1598 xmlRegStatePtr inter, newstate; 1599 1600 /* 1601 * create the final state now if needed 1602 */ 1603 if (to != NULL) { 1604 newstate = to; 1605 } else { 1606 newstate = xmlRegNewState(ctxt); 1607 xmlRegStatePush(ctxt, newstate); 1608 } 1609 1610 /* 1611 * The principle here is to use counted transition 1612 * to avoid explosion in the number of states in the 1613 * graph. This is clearly more complex but should not 1614 * be exploitable at runtime. 1615 */ 1616 if ((atom->min == 0) && (atom->start0 == NULL)) { 1617 xmlRegAtomPtr copy; 1618 /* 1619 * duplicate a transition based on atom to count next 1620 * occurences after 1. We cannot loop to atom->start 1621 * directly because we need an epsilon transition to 1622 * newstate. 1623 */ 1624 /* ???? For some reason it seems we never reach that 1625 case, I suppose this got optimized out before when 1626 building the automata */ 1627 copy = xmlRegCopyAtom(ctxt, atom); 1628 if (copy == NULL) 1629 return(-1); 1630 copy->quant = XML_REGEXP_QUANT_ONCE; 1631 copy->min = 0; 1632 copy->max = 0; 1633 1634 if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy) 1635 < 0) 1636 return(-1); 1637 inter = ctxt->state; 1638 counter = xmlRegGetCounter(ctxt); 1639 ctxt->counters[counter].min = atom->min - 1; 1640 ctxt->counters[counter].max = atom->max - 1; 1641 /* count the number of times we see it again */ 1642 xmlFAGenerateCountedEpsilonTransition(ctxt, inter, 1643 atom->stop, counter); 1644 /* allow a way out based on the count */ 1645 xmlFAGenerateCountedTransition(ctxt, inter, 1646 newstate, counter); 1647 /* and also allow a direct exit for 0 */ 1648 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 1649 newstate); 1650 } else { 1651 /* 1652 * either we need the atom at least once or there 1653 * is an atom->start0 allowing to easilly plug the 1654 * epsilon transition. 1655 */ 1656 counter = xmlRegGetCounter(ctxt); 1657 ctxt->counters[counter].min = atom->min - 1; 1658 ctxt->counters[counter].max = atom->max - 1; 1659 /* count the number of times we see it again */ 1660 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, 1661 atom->start, counter); 1662 /* allow a way out based on the count */ 1663 xmlFAGenerateCountedTransition(ctxt, atom->stop, 1664 newstate, counter); 1665 /* and if needed allow a direct exit for 0 */ 1666 if (atom->min == 0) 1667 xmlFAGenerateEpsilonTransition(ctxt, atom->start0, 1668 newstate); 1669 1670 } 1671 atom->min = 0; 1672 atom->max = 0; 1673 atom->quant = XML_REGEXP_QUANT_ONCE; 1674 ctxt->state = newstate; 1675 } 1676 default: 1677 break; 1678 } 1679 return(0); 1680 } 1681 if ((atom->min == 0) && (atom->max == 0) && 1682 (atom->quant == XML_REGEXP_QUANT_RANGE)) { 1683 /* 1684 * we can discard the atom and generate an epsilon transition instead 1685 */ 1686 if (to == NULL) { 1687 to = xmlRegNewState(ctxt); 1688 if (to != NULL) 1689 xmlRegStatePush(ctxt, to); 1690 else { 1691 return(-1); 1692 } 1693 } 1694 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1695 ctxt->state = to; 1696 xmlRegFreeAtom(atom); 1697 return(0); 1698 } 1699 if (to == NULL) { 1700 to = xmlRegNewState(ctxt); 1701 if (to != NULL) 1702 xmlRegStatePush(ctxt, to); 1703 else { 1704 return(-1); 1705 } 1706 } 1707 end = to; 1708 if ((atom->quant == XML_REGEXP_QUANT_MULT) || 1709 (atom->quant == XML_REGEXP_QUANT_PLUS)) { 1710 /* 1711 * Do not pollute the target state by adding transitions from 1712 * it as it is likely to be the shared target of multiple branches. 1713 * So isolate with an epsilon transition. 1714 */ 1715 xmlRegStatePtr tmp; 1716 1717 tmp = xmlRegNewState(ctxt); 1718 if (tmp != NULL) 1719 xmlRegStatePush(ctxt, tmp); 1720 else { 1721 return(-1); 1722 } 1723 xmlFAGenerateEpsilonTransition(ctxt, tmp, to); 1724 to = tmp; 1725 } 1726 if (xmlRegAtomPush(ctxt, atom) < 0) { 1727 return(-1); 1728 } 1729 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1); 1730 ctxt->state = end; 1731 switch (atom->quant) { 1732 case XML_REGEXP_QUANT_OPT: 1733 atom->quant = XML_REGEXP_QUANT_ONCE; 1734 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1735 break; 1736 case XML_REGEXP_QUANT_MULT: 1737 atom->quant = XML_REGEXP_QUANT_ONCE; 1738 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1739 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1740 break; 1741 case XML_REGEXP_QUANT_PLUS: 1742 atom->quant = XML_REGEXP_QUANT_ONCE; 1743 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1744 break; 1745 case XML_REGEXP_QUANT_RANGE: 1746 #if DV_test 1747 if (atom->min == 0) { 1748 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1749 } 1750 #endif 1751 break; 1752 default: 1753 break; 1754 } 1755 return(0); 1756 } 1757 1758 /** 1759 * xmlFAReduceEpsilonTransitions: 1760 * @ctxt: a regexp parser context 1761 * @fromnr: the from state 1762 * @tonr: the to state 1763 * @counter: should that transition be associated to a counted 1764 * 1765 */ 1766 static void 1767 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, 1768 int tonr, int counter) { 1769 int transnr; 1770 xmlRegStatePtr from; 1771 xmlRegStatePtr to; 1772 1773 #ifdef DEBUG_REGEXP_GRAPH 1774 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr); 1775 #endif 1776 from = ctxt->states[fromnr]; 1777 if (from == NULL) 1778 return; 1779 to = ctxt->states[tonr]; 1780 if (to == NULL) 1781 return; 1782 if ((to->mark == XML_REGEXP_MARK_START) || 1783 (to->mark == XML_REGEXP_MARK_VISITED)) 1784 return; 1785 1786 to->mark = XML_REGEXP_MARK_VISITED; 1787 if (to->type == XML_REGEXP_FINAL_STATE) { 1788 #ifdef DEBUG_REGEXP_GRAPH 1789 printf("State %d is final, so %d becomes final\n", tonr, fromnr); 1790 #endif 1791 from->type = XML_REGEXP_FINAL_STATE; 1792 } 1793 for (transnr = 0;transnr < to->nbTrans;transnr++) { 1794 if (to->trans[transnr].to < 0) 1795 continue; 1796 if (to->trans[transnr].atom == NULL) { 1797 /* 1798 * Don't remove counted transitions 1799 * Don't loop either 1800 */ 1801 if (to->trans[transnr].to != fromnr) { 1802 if (to->trans[transnr].count >= 0) { 1803 int newto = to->trans[transnr].to; 1804 1805 xmlRegStateAddTrans(ctxt, from, NULL, 1806 ctxt->states[newto], 1807 -1, to->trans[transnr].count); 1808 } else { 1809 #ifdef DEBUG_REGEXP_GRAPH 1810 printf("Found epsilon trans %d from %d to %d\n", 1811 transnr, tonr, to->trans[transnr].to); 1812 #endif 1813 if (to->trans[transnr].counter >= 0) { 1814 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1815 to->trans[transnr].to, 1816 to->trans[transnr].counter); 1817 } else { 1818 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1819 to->trans[transnr].to, 1820 counter); 1821 } 1822 } 1823 } 1824 } else { 1825 int newto = to->trans[transnr].to; 1826 1827 if (to->trans[transnr].counter >= 0) { 1828 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1829 ctxt->states[newto], 1830 to->trans[transnr].counter, -1); 1831 } else { 1832 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1833 ctxt->states[newto], counter, -1); 1834 } 1835 } 1836 } 1837 to->mark = XML_REGEXP_MARK_NORMAL; 1838 } 1839 1840 /** 1841 * xmlFAEliminateSimpleEpsilonTransitions: 1842 * @ctxt: a regexp parser context 1843 * 1844 * Eliminating general epsilon transitions can get costly in the general 1845 * algorithm due to the large amount of generated new transitions and 1846 * associated comparisons. However for simple epsilon transition used just 1847 * to separate building blocks when generating the automata this can be 1848 * reduced to state elimination: 1849 * - if there exists an epsilon from X to Y 1850 * - if there is no other transition from X 1851 * then X and Y are semantically equivalent and X can be eliminated 1852 * If X is the start state then make Y the start state, else replace the 1853 * target of all transitions to X by transitions to Y. 1854 */ 1855 static void 1856 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1857 int statenr, i, j, newto; 1858 xmlRegStatePtr state, tmp; 1859 1860 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1861 state = ctxt->states[statenr]; 1862 if (state == NULL) 1863 continue; 1864 if (state->nbTrans != 1) 1865 continue; 1866 if (state->type == XML_REGEXP_UNREACH_STATE) 1867 continue; 1868 /* is the only transition out a basic transition */ 1869 if ((state->trans[0].atom == NULL) && 1870 (state->trans[0].to >= 0) && 1871 (state->trans[0].to != statenr) && 1872 (state->trans[0].counter < 0) && 1873 (state->trans[0].count < 0)) { 1874 newto = state->trans[0].to; 1875 1876 if (state->type == XML_REGEXP_START_STATE) { 1877 #ifdef DEBUG_REGEXP_GRAPH 1878 printf("Found simple epsilon trans from start %d to %d\n", 1879 statenr, newto); 1880 #endif 1881 } else { 1882 #ifdef DEBUG_REGEXP_GRAPH 1883 printf("Found simple epsilon trans from %d to %d\n", 1884 statenr, newto); 1885 #endif 1886 for (i = 0;i < state->nbTransTo;i++) { 1887 tmp = ctxt->states[state->transTo[i]]; 1888 for (j = 0;j < tmp->nbTrans;j++) { 1889 if (tmp->trans[j].to == statenr) { 1890 #ifdef DEBUG_REGEXP_GRAPH 1891 printf("Changed transition %d on %d to go to %d\n", 1892 j, tmp->no, newto); 1893 #endif 1894 tmp->trans[j].to = -1; 1895 xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom, 1896 ctxt->states[newto], 1897 tmp->trans[j].counter, 1898 tmp->trans[j].count); 1899 } 1900 } 1901 } 1902 if (state->type == XML_REGEXP_FINAL_STATE) 1903 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE; 1904 /* eliminate the transition completely */ 1905 state->nbTrans = 0; 1906 1907 state->type = XML_REGEXP_UNREACH_STATE; 1908 1909 } 1910 1911 } 1912 } 1913 } 1914 /** 1915 * xmlFAEliminateEpsilonTransitions: 1916 * @ctxt: a regexp parser context 1917 * 1918 */ 1919 static void 1920 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1921 int statenr, transnr; 1922 xmlRegStatePtr state; 1923 int has_epsilon; 1924 1925 if (ctxt->states == NULL) return; 1926 1927 /* 1928 * Eliminate simple epsilon transition and the associated unreachable 1929 * states. 1930 */ 1931 xmlFAEliminateSimpleEpsilonTransitions(ctxt); 1932 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1933 state = ctxt->states[statenr]; 1934 if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) { 1935 #ifdef DEBUG_REGEXP_GRAPH 1936 printf("Removed unreachable state %d\n", statenr); 1937 #endif 1938 xmlRegFreeState(state); 1939 ctxt->states[statenr] = NULL; 1940 } 1941 } 1942 1943 has_epsilon = 0; 1944 1945 /* 1946 * Build the completed transitions bypassing the epsilons 1947 * Use a marking algorithm to avoid loops 1948 * Mark sink states too. 1949 * Process from the latests states backward to the start when 1950 * there is long cascading epsilon chains this minimize the 1951 * recursions and transition compares when adding the new ones 1952 */ 1953 for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) { 1954 state = ctxt->states[statenr]; 1955 if (state == NULL) 1956 continue; 1957 if ((state->nbTrans == 0) && 1958 (state->type != XML_REGEXP_FINAL_STATE)) { 1959 state->type = XML_REGEXP_SINK_STATE; 1960 } 1961 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1962 if ((state->trans[transnr].atom == NULL) && 1963 (state->trans[transnr].to >= 0)) { 1964 if (state->trans[transnr].to == statenr) { 1965 state->trans[transnr].to = -1; 1966 #ifdef DEBUG_REGEXP_GRAPH 1967 printf("Removed loopback epsilon trans %d on %d\n", 1968 transnr, statenr); 1969 #endif 1970 } else if (state->trans[transnr].count < 0) { 1971 int newto = state->trans[transnr].to; 1972 1973 #ifdef DEBUG_REGEXP_GRAPH 1974 printf("Found epsilon trans %d from %d to %d\n", 1975 transnr, statenr, newto); 1976 #endif 1977 has_epsilon = 1; 1978 state->trans[transnr].to = -2; 1979 state->mark = XML_REGEXP_MARK_START; 1980 xmlFAReduceEpsilonTransitions(ctxt, statenr, 1981 newto, state->trans[transnr].counter); 1982 state->mark = XML_REGEXP_MARK_NORMAL; 1983 #ifdef DEBUG_REGEXP_GRAPH 1984 } else { 1985 printf("Found counted transition %d on %d\n", 1986 transnr, statenr); 1987 #endif 1988 } 1989 } 1990 } 1991 } 1992 /* 1993 * Eliminate the epsilon transitions 1994 */ 1995 if (has_epsilon) { 1996 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1997 state = ctxt->states[statenr]; 1998 if (state == NULL) 1999 continue; 2000 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2001 xmlRegTransPtr trans = &(state->trans[transnr]); 2002 if ((trans->atom == NULL) && 2003 (trans->count < 0) && 2004 (trans->to >= 0)) { 2005 trans->to = -1; 2006 } 2007 } 2008 } 2009 } 2010 2011 /* 2012 * Use this pass to detect unreachable states too 2013 */ 2014 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2015 state = ctxt->states[statenr]; 2016 if (state != NULL) 2017 state->reached = XML_REGEXP_MARK_NORMAL; 2018 } 2019 state = ctxt->states[0]; 2020 if (state != NULL) 2021 state->reached = XML_REGEXP_MARK_START; 2022 while (state != NULL) { 2023 xmlRegStatePtr target = NULL; 2024 state->reached = XML_REGEXP_MARK_VISITED; 2025 /* 2026 * Mark all states reachable from the current reachable state 2027 */ 2028 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2029 if ((state->trans[transnr].to >= 0) && 2030 ((state->trans[transnr].atom != NULL) || 2031 (state->trans[transnr].count >= 0))) { 2032 int newto = state->trans[transnr].to; 2033 2034 if (ctxt->states[newto] == NULL) 2035 continue; 2036 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) { 2037 ctxt->states[newto]->reached = XML_REGEXP_MARK_START; 2038 target = ctxt->states[newto]; 2039 } 2040 } 2041 } 2042 2043 /* 2044 * find the next accessible state not explored 2045 */ 2046 if (target == NULL) { 2047 for (statenr = 1;statenr < ctxt->nbStates;statenr++) { 2048 state = ctxt->states[statenr]; 2049 if ((state != NULL) && (state->reached == 2050 XML_REGEXP_MARK_START)) { 2051 target = state; 2052 break; 2053 } 2054 } 2055 } 2056 state = target; 2057 } 2058 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2059 state = ctxt->states[statenr]; 2060 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) { 2061 #ifdef DEBUG_REGEXP_GRAPH 2062 printf("Removed unreachable state %d\n", statenr); 2063 #endif 2064 xmlRegFreeState(state); 2065 ctxt->states[statenr] = NULL; 2066 } 2067 } 2068 2069 } 2070 2071 static int 2072 xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) { 2073 int ret = 0; 2074 2075 if ((range1->type == XML_REGEXP_RANGES) || 2076 (range2->type == XML_REGEXP_RANGES) || 2077 (range2->type == XML_REGEXP_SUBREG) || 2078 (range1->type == XML_REGEXP_SUBREG) || 2079 (range1->type == XML_REGEXP_STRING) || 2080 (range2->type == XML_REGEXP_STRING)) 2081 return(-1); 2082 2083 /* put them in order */ 2084 if (range1->type > range2->type) { 2085 xmlRegRangePtr tmp; 2086 2087 tmp = range1; 2088 range1 = range2; 2089 range2 = tmp; 2090 } 2091 if ((range1->type == XML_REGEXP_ANYCHAR) || 2092 (range2->type == XML_REGEXP_ANYCHAR)) { 2093 ret = 1; 2094 } else if ((range1->type == XML_REGEXP_EPSILON) || 2095 (range2->type == XML_REGEXP_EPSILON)) { 2096 return(0); 2097 } else if (range1->type == range2->type) { 2098 if (range1->type != XML_REGEXP_CHARVAL) 2099 ret = 1; 2100 else if ((range1->end < range2->start) || 2101 (range2->end < range1->start)) 2102 ret = 0; 2103 else 2104 ret = 1; 2105 } else if (range1->type == XML_REGEXP_CHARVAL) { 2106 int codepoint; 2107 int neg = 0; 2108 2109 /* 2110 * just check all codepoints in the range for acceptance, 2111 * this is usually way cheaper since done only once at 2112 * compilation than testing over and over at runtime or 2113 * pushing too many states when evaluating. 2114 */ 2115 if (((range1->neg == 0) && (range2->neg != 0)) || 2116 ((range1->neg != 0) && (range2->neg == 0))) 2117 neg = 1; 2118 2119 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) { 2120 ret = xmlRegCheckCharacterRange(range2->type, codepoint, 2121 0, range2->start, range2->end, 2122 range2->blockName); 2123 if (ret < 0) 2124 return(-1); 2125 if (((neg == 1) && (ret == 0)) || 2126 ((neg == 0) && (ret == 1))) 2127 return(1); 2128 } 2129 return(0); 2130 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) || 2131 (range2->type == XML_REGEXP_BLOCK_NAME)) { 2132 if (range1->type == range2->type) { 2133 ret = xmlStrEqual(range1->blockName, range2->blockName); 2134 } else { 2135 /* 2136 * comparing a block range with anything else is way 2137 * too costly, and maintining the table is like too much 2138 * memory too, so let's force the automata to save state 2139 * here. 2140 */ 2141 return(1); 2142 } 2143 } else if ((range1->type < XML_REGEXP_LETTER) || 2144 (range2->type < XML_REGEXP_LETTER)) { 2145 if ((range1->type == XML_REGEXP_ANYSPACE) && 2146 (range2->type == XML_REGEXP_NOTSPACE)) 2147 ret = 0; 2148 else if ((range1->type == XML_REGEXP_INITNAME) && 2149 (range2->type == XML_REGEXP_NOTINITNAME)) 2150 ret = 0; 2151 else if ((range1->type == XML_REGEXP_NAMECHAR) && 2152 (range2->type == XML_REGEXP_NOTNAMECHAR)) 2153 ret = 0; 2154 else if ((range1->type == XML_REGEXP_DECIMAL) && 2155 (range2->type == XML_REGEXP_NOTDECIMAL)) 2156 ret = 0; 2157 else if ((range1->type == XML_REGEXP_REALCHAR) && 2158 (range2->type == XML_REGEXP_NOTREALCHAR)) 2159 ret = 0; 2160 else { 2161 /* same thing to limit complexity */ 2162 return(1); 2163 } 2164 } else { 2165 ret = 0; 2166 /* range1->type < range2->type here */ 2167 switch (range1->type) { 2168 case XML_REGEXP_LETTER: 2169 /* all disjoint except in the subgroups */ 2170 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) || 2171 (range2->type == XML_REGEXP_LETTER_LOWERCASE) || 2172 (range2->type == XML_REGEXP_LETTER_TITLECASE) || 2173 (range2->type == XML_REGEXP_LETTER_MODIFIER) || 2174 (range2->type == XML_REGEXP_LETTER_OTHERS)) 2175 ret = 1; 2176 break; 2177 case XML_REGEXP_MARK: 2178 if ((range2->type == XML_REGEXP_MARK_NONSPACING) || 2179 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) || 2180 (range2->type == XML_REGEXP_MARK_ENCLOSING)) 2181 ret = 1; 2182 break; 2183 case XML_REGEXP_NUMBER: 2184 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) || 2185 (range2->type == XML_REGEXP_NUMBER_LETTER) || 2186 (range2->type == XML_REGEXP_NUMBER_OTHERS)) 2187 ret = 1; 2188 break; 2189 case XML_REGEXP_PUNCT: 2190 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) || 2191 (range2->type == XML_REGEXP_PUNCT_DASH) || 2192 (range2->type == XML_REGEXP_PUNCT_OPEN) || 2193 (range2->type == XML_REGEXP_PUNCT_CLOSE) || 2194 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) || 2195 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) || 2196 (range2->type == XML_REGEXP_PUNCT_OTHERS)) 2197 ret = 1; 2198 break; 2199 case XML_REGEXP_SEPAR: 2200 if ((range2->type == XML_REGEXP_SEPAR_SPACE) || 2201 (range2->type == XML_REGEXP_SEPAR_LINE) || 2202 (range2->type == XML_REGEXP_SEPAR_PARA)) 2203 ret = 1; 2204 break; 2205 case XML_REGEXP_SYMBOL: 2206 if ((range2->type == XML_REGEXP_SYMBOL_MATH) || 2207 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) || 2208 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) || 2209 (range2->type == XML_REGEXP_SYMBOL_OTHERS)) 2210 ret = 1; 2211 break; 2212 case XML_REGEXP_OTHER: 2213 if ((range2->type == XML_REGEXP_OTHER_CONTROL) || 2214 (range2->type == XML_REGEXP_OTHER_FORMAT) || 2215 (range2->type == XML_REGEXP_OTHER_PRIVATE)) 2216 ret = 1; 2217 break; 2218 default: 2219 if ((range2->type >= XML_REGEXP_LETTER) && 2220 (range2->type < XML_REGEXP_BLOCK_NAME)) 2221 ret = 0; 2222 else { 2223 /* safety net ! */ 2224 return(1); 2225 } 2226 } 2227 } 2228 if (((range1->neg == 0) && (range2->neg != 0)) || 2229 ((range1->neg != 0) && (range2->neg == 0))) 2230 ret = !ret; 2231 return(ret); 2232 } 2233 2234 /** 2235 * xmlFACompareAtomTypes: 2236 * @type1: an atom type 2237 * @type2: an atom type 2238 * 2239 * Compares two atoms type to check whether they intersect in some ways, 2240 * this is used by xmlFACompareAtoms only 2241 * 2242 * Returns 1 if they may intersect and 0 otherwise 2243 */ 2244 static int 2245 xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) { 2246 if ((type1 == XML_REGEXP_EPSILON) || 2247 (type1 == XML_REGEXP_CHARVAL) || 2248 (type1 == XML_REGEXP_RANGES) || 2249 (type1 == XML_REGEXP_SUBREG) || 2250 (type1 == XML_REGEXP_STRING) || 2251 (type1 == XML_REGEXP_ANYCHAR)) 2252 return(1); 2253 if ((type2 == XML_REGEXP_EPSILON) || 2254 (type2 == XML_REGEXP_CHARVAL) || 2255 (type2 == XML_REGEXP_RANGES) || 2256 (type2 == XML_REGEXP_SUBREG) || 2257 (type2 == XML_REGEXP_STRING) || 2258 (type2 == XML_REGEXP_ANYCHAR)) 2259 return(1); 2260 2261 if (type1 == type2) return(1); 2262 2263 /* simplify subsequent compares by making sure type1 < type2 */ 2264 if (type1 > type2) { 2265 xmlRegAtomType tmp = type1; 2266 type1 = type2; 2267 type2 = tmp; 2268 } 2269 switch (type1) { 2270 case XML_REGEXP_ANYSPACE: /* \s */ 2271 /* can't be a letter, number, mark, pontuation, symbol */ 2272 if ((type2 == XML_REGEXP_NOTSPACE) || 2273 ((type2 >= XML_REGEXP_LETTER) && 2274 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2275 ((type2 >= XML_REGEXP_NUMBER) && 2276 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2277 ((type2 >= XML_REGEXP_MARK) && 2278 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2279 ((type2 >= XML_REGEXP_PUNCT) && 2280 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2281 ((type2 >= XML_REGEXP_SYMBOL) && 2282 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) 2283 ) return(0); 2284 break; 2285 case XML_REGEXP_NOTSPACE: /* \S */ 2286 break; 2287 case XML_REGEXP_INITNAME: /* \l */ 2288 /* can't be a number, mark, separator, pontuation, symbol or other */ 2289 if ((type2 == XML_REGEXP_NOTINITNAME) || 2290 ((type2 >= XML_REGEXP_NUMBER) && 2291 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2292 ((type2 >= XML_REGEXP_MARK) && 2293 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2294 ((type2 >= XML_REGEXP_SEPAR) && 2295 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2296 ((type2 >= XML_REGEXP_PUNCT) && 2297 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2298 ((type2 >= XML_REGEXP_SYMBOL) && 2299 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2300 ((type2 >= XML_REGEXP_OTHER) && 2301 (type2 <= XML_REGEXP_OTHER_NA)) 2302 ) return(0); 2303 break; 2304 case XML_REGEXP_NOTINITNAME: /* \L */ 2305 break; 2306 case XML_REGEXP_NAMECHAR: /* \c */ 2307 /* can't be a mark, separator, pontuation, symbol or other */ 2308 if ((type2 == XML_REGEXP_NOTNAMECHAR) || 2309 ((type2 >= XML_REGEXP_MARK) && 2310 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2311 ((type2 >= XML_REGEXP_PUNCT) && 2312 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2313 ((type2 >= XML_REGEXP_SEPAR) && 2314 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2315 ((type2 >= XML_REGEXP_SYMBOL) && 2316 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2317 ((type2 >= XML_REGEXP_OTHER) && 2318 (type2 <= XML_REGEXP_OTHER_NA)) 2319 ) return(0); 2320 break; 2321 case XML_REGEXP_NOTNAMECHAR: /* \C */ 2322 break; 2323 case XML_REGEXP_DECIMAL: /* \d */ 2324 /* can't be a letter, mark, separator, pontuation, symbol or other */ 2325 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2326 (type2 == XML_REGEXP_REALCHAR) || 2327 ((type2 >= XML_REGEXP_LETTER) && 2328 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2329 ((type2 >= XML_REGEXP_MARK) && 2330 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2331 ((type2 >= XML_REGEXP_PUNCT) && 2332 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2333 ((type2 >= XML_REGEXP_SEPAR) && 2334 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2335 ((type2 >= XML_REGEXP_SYMBOL) && 2336 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2337 ((type2 >= XML_REGEXP_OTHER) && 2338 (type2 <= XML_REGEXP_OTHER_NA)) 2339 )return(0); 2340 break; 2341 case XML_REGEXP_NOTDECIMAL: /* \D */ 2342 break; 2343 case XML_REGEXP_REALCHAR: /* \w */ 2344 /* can't be a mark, separator, pontuation, symbol or other */ 2345 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2346 ((type2 >= XML_REGEXP_MARK) && 2347 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2348 ((type2 >= XML_REGEXP_PUNCT) && 2349 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2350 ((type2 >= XML_REGEXP_SEPAR) && 2351 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2352 ((type2 >= XML_REGEXP_SYMBOL) && 2353 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2354 ((type2 >= XML_REGEXP_OTHER) && 2355 (type2 <= XML_REGEXP_OTHER_NA)) 2356 )return(0); 2357 break; 2358 case XML_REGEXP_NOTREALCHAR: /* \W */ 2359 break; 2360 /* 2361 * at that point we know both type 1 and type2 are from 2362 * character categories are ordered and are different, 2363 * it becomes simple because this is a partition 2364 */ 2365 case XML_REGEXP_LETTER: 2366 if (type2 <= XML_REGEXP_LETTER_OTHERS) 2367 return(1); 2368 return(0); 2369 case XML_REGEXP_LETTER_UPPERCASE: 2370 case XML_REGEXP_LETTER_LOWERCASE: 2371 case XML_REGEXP_LETTER_TITLECASE: 2372 case XML_REGEXP_LETTER_MODIFIER: 2373 case XML_REGEXP_LETTER_OTHERS: 2374 return(0); 2375 case XML_REGEXP_MARK: 2376 if (type2 <= XML_REGEXP_MARK_ENCLOSING) 2377 return(1); 2378 return(0); 2379 case XML_REGEXP_MARK_NONSPACING: 2380 case XML_REGEXP_MARK_SPACECOMBINING: 2381 case XML_REGEXP_MARK_ENCLOSING: 2382 return(0); 2383 case XML_REGEXP_NUMBER: 2384 if (type2 <= XML_REGEXP_NUMBER_OTHERS) 2385 return(1); 2386 return(0); 2387 case XML_REGEXP_NUMBER_DECIMAL: 2388 case XML_REGEXP_NUMBER_LETTER: 2389 case XML_REGEXP_NUMBER_OTHERS: 2390 return(0); 2391 case XML_REGEXP_PUNCT: 2392 if (type2 <= XML_REGEXP_PUNCT_OTHERS) 2393 return(1); 2394 return(0); 2395 case XML_REGEXP_PUNCT_CONNECTOR: 2396 case XML_REGEXP_PUNCT_DASH: 2397 case XML_REGEXP_PUNCT_OPEN: 2398 case XML_REGEXP_PUNCT_CLOSE: 2399 case XML_REGEXP_PUNCT_INITQUOTE: 2400 case XML_REGEXP_PUNCT_FINQUOTE: 2401 case XML_REGEXP_PUNCT_OTHERS: 2402 return(0); 2403 case XML_REGEXP_SEPAR: 2404 if (type2 <= XML_REGEXP_SEPAR_PARA) 2405 return(1); 2406 return(0); 2407 case XML_REGEXP_SEPAR_SPACE: 2408 case XML_REGEXP_SEPAR_LINE: 2409 case XML_REGEXP_SEPAR_PARA: 2410 return(0); 2411 case XML_REGEXP_SYMBOL: 2412 if (type2 <= XML_REGEXP_SYMBOL_OTHERS) 2413 return(1); 2414 return(0); 2415 case XML_REGEXP_SYMBOL_MATH: 2416 case XML_REGEXP_SYMBOL_CURRENCY: 2417 case XML_REGEXP_SYMBOL_MODIFIER: 2418 case XML_REGEXP_SYMBOL_OTHERS: 2419 return(0); 2420 case XML_REGEXP_OTHER: 2421 if (type2 <= XML_REGEXP_OTHER_NA) 2422 return(1); 2423 return(0); 2424 case XML_REGEXP_OTHER_CONTROL: 2425 case XML_REGEXP_OTHER_FORMAT: 2426 case XML_REGEXP_OTHER_PRIVATE: 2427 case XML_REGEXP_OTHER_NA: 2428 return(0); 2429 default: 2430 break; 2431 } 2432 return(1); 2433 } 2434 2435 /** 2436 * xmlFAEqualAtoms: 2437 * @atom1: an atom 2438 * @atom2: an atom 2439 * @deep: if not set only compare string pointers 2440 * 2441 * Compares two atoms to check whether they are the same exactly 2442 * this is used to remove equivalent transitions 2443 * 2444 * Returns 1 if same and 0 otherwise 2445 */ 2446 static int 2447 xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) { 2448 int ret = 0; 2449 2450 if (atom1 == atom2) 2451 return(1); 2452 if ((atom1 == NULL) || (atom2 == NULL)) 2453 return(0); 2454 2455 if (atom1->type != atom2->type) 2456 return(0); 2457 switch (atom1->type) { 2458 case XML_REGEXP_EPSILON: 2459 ret = 0; 2460 break; 2461 case XML_REGEXP_STRING: 2462 if (!deep) 2463 ret = (atom1->valuep == atom2->valuep); 2464 else 2465 ret = xmlStrEqual((xmlChar *)atom1->valuep, 2466 (xmlChar *)atom2->valuep); 2467 break; 2468 case XML_REGEXP_CHARVAL: 2469 ret = (atom1->codepoint == atom2->codepoint); 2470 break; 2471 case XML_REGEXP_RANGES: 2472 /* too hard to do in the general case */ 2473 ret = 0; 2474 default: 2475 break; 2476 } 2477 return(ret); 2478 } 2479 2480 /** 2481 * xmlFACompareAtoms: 2482 * @atom1: an atom 2483 * @atom2: an atom 2484 * @deep: if not set only compare string pointers 2485 * 2486 * Compares two atoms to check whether they intersect in some ways, 2487 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only 2488 * 2489 * Returns 1 if yes and 0 otherwise 2490 */ 2491 static int 2492 xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) { 2493 int ret = 1; 2494 2495 if (atom1 == atom2) 2496 return(1); 2497 if ((atom1 == NULL) || (atom2 == NULL)) 2498 return(0); 2499 2500 if ((atom1->type == XML_REGEXP_ANYCHAR) || 2501 (atom2->type == XML_REGEXP_ANYCHAR)) 2502 return(1); 2503 2504 if (atom1->type > atom2->type) { 2505 xmlRegAtomPtr tmp; 2506 tmp = atom1; 2507 atom1 = atom2; 2508 atom2 = tmp; 2509 } 2510 if (atom1->type != atom2->type) { 2511 ret = xmlFACompareAtomTypes(atom1->type, atom2->type); 2512 /* if they can't intersect at the type level break now */ 2513 if (ret == 0) 2514 return(0); 2515 } 2516 switch (atom1->type) { 2517 case XML_REGEXP_STRING: 2518 if (!deep) 2519 ret = (atom1->valuep != atom2->valuep); 2520 else 2521 ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep, 2522 (xmlChar *)atom2->valuep); 2523 break; 2524 case XML_REGEXP_EPSILON: 2525 goto not_determinist; 2526 case XML_REGEXP_CHARVAL: 2527 if (atom2->type == XML_REGEXP_CHARVAL) { 2528 ret = (atom1->codepoint == atom2->codepoint); 2529 } else { 2530 ret = xmlRegCheckCharacter(atom2, atom1->codepoint); 2531 if (ret < 0) 2532 ret = 1; 2533 } 2534 break; 2535 case XML_REGEXP_RANGES: 2536 if (atom2->type == XML_REGEXP_RANGES) { 2537 int i, j, res; 2538 xmlRegRangePtr r1, r2; 2539 2540 /* 2541 * need to check that none of the ranges eventually matches 2542 */ 2543 for (i = 0;i < atom1->nbRanges;i++) { 2544 for (j = 0;j < atom2->nbRanges;j++) { 2545 r1 = atom1->ranges[i]; 2546 r2 = atom2->ranges[j]; 2547 res = xmlFACompareRanges(r1, r2); 2548 if (res == 1) { 2549 ret = 1; 2550 goto done; 2551 } 2552 } 2553 } 2554 ret = 0; 2555 } 2556 break; 2557 default: 2558 goto not_determinist; 2559 } 2560 done: 2561 if (atom1->neg != atom2->neg) { 2562 ret = !ret; 2563 } 2564 if (ret == 0) 2565 return(0); 2566 not_determinist: 2567 return(1); 2568 } 2569 2570 /** 2571 * xmlFARecurseDeterminism: 2572 * @ctxt: a regexp parser context 2573 * 2574 * Check whether the associated regexp is determinist, 2575 * should be called after xmlFAEliminateEpsilonTransitions() 2576 * 2577 */ 2578 static int 2579 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 2580 int to, xmlRegAtomPtr atom) { 2581 int ret = 1; 2582 int res; 2583 int transnr, nbTrans; 2584 xmlRegTransPtr t1; 2585 int deep = 1; 2586 2587 if (state == NULL) 2588 return(ret); 2589 2590 if (ctxt->flags & AM_AUTOMATA_RNG) 2591 deep = 0; 2592 2593 /* 2594 * don't recurse on transitions potentially added in the course of 2595 * the elimination. 2596 */ 2597 nbTrans = state->nbTrans; 2598 for (transnr = 0;transnr < nbTrans;transnr++) { 2599 t1 = &(state->trans[transnr]); 2600 /* 2601 * check transitions conflicting with the one looked at 2602 */ 2603 if (t1->atom == NULL) { 2604 if (t1->to < 0) 2605 continue; 2606 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2607 to, atom); 2608 if (res == 0) { 2609 ret = 0; 2610 /* t1->nd = 1; */ 2611 } 2612 continue; 2613 } 2614 if (t1->to != to) 2615 continue; 2616 if (xmlFACompareAtoms(t1->atom, atom, deep)) { 2617 ret = 0; 2618 /* mark the transition as non-deterministic */ 2619 t1->nd = 1; 2620 } 2621 } 2622 return(ret); 2623 } 2624 2625 /** 2626 * xmlFAComputesDeterminism: 2627 * @ctxt: a regexp parser context 2628 * 2629 * Check whether the associated regexp is determinist, 2630 * should be called after xmlFAEliminateEpsilonTransitions() 2631 * 2632 */ 2633 static int 2634 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { 2635 int statenr, transnr; 2636 xmlRegStatePtr state; 2637 xmlRegTransPtr t1, t2, last; 2638 int i; 2639 int ret = 1; 2640 int deep = 1; 2641 2642 #ifdef DEBUG_REGEXP_GRAPH 2643 printf("xmlFAComputesDeterminism\n"); 2644 xmlRegPrintCtxt(stdout, ctxt); 2645 #endif 2646 if (ctxt->determinist != -1) 2647 return(ctxt->determinist); 2648 2649 if (ctxt->flags & AM_AUTOMATA_RNG) 2650 deep = 0; 2651 2652 /* 2653 * First cleanup the automata removing cancelled transitions 2654 */ 2655 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2656 state = ctxt->states[statenr]; 2657 if (state == NULL) 2658 continue; 2659 if (state->nbTrans < 2) 2660 continue; 2661 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2662 t1 = &(state->trans[transnr]); 2663 /* 2664 * Determinism checks in case of counted or all transitions 2665 * will have to be handled separately 2666 */ 2667 if (t1->atom == NULL) { 2668 /* t1->nd = 1; */ 2669 continue; 2670 } 2671 if (t1->to == -1) /* eliminated */ 2672 continue; 2673 for (i = 0;i < transnr;i++) { 2674 t2 = &(state->trans[i]); 2675 if (t2->to == -1) /* eliminated */ 2676 continue; 2677 if (t2->atom != NULL) { 2678 if (t1->to == t2->to) { 2679 /* 2680 * Here we use deep because we want to keep the 2681 * transitions which indicate a conflict 2682 */ 2683 if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) && 2684 (t1->counter == t2->counter) && 2685 (t1->count == t2->count)) 2686 t2->to = -1; /* eliminated */ 2687 } 2688 } 2689 } 2690 } 2691 } 2692 2693 /* 2694 * Check for all states that there aren't 2 transitions 2695 * with the same atom and a different target. 2696 */ 2697 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2698 state = ctxt->states[statenr]; 2699 if (state == NULL) 2700 continue; 2701 if (state->nbTrans < 2) 2702 continue; 2703 last = NULL; 2704 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2705 t1 = &(state->trans[transnr]); 2706 /* 2707 * Determinism checks in case of counted or all transitions 2708 * will have to be handled separately 2709 */ 2710 if (t1->atom == NULL) { 2711 continue; 2712 } 2713 if (t1->to == -1) /* eliminated */ 2714 continue; 2715 for (i = 0;i < transnr;i++) { 2716 t2 = &(state->trans[i]); 2717 if (t2->to == -1) /* eliminated */ 2718 continue; 2719 if (t2->atom != NULL) { 2720 /* 2721 * But here we don't use deep because we want to 2722 * find transitions which indicate a conflict 2723 */ 2724 if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) { 2725 ret = 0; 2726 /* mark the transitions as non-deterministic ones */ 2727 t1->nd = 1; 2728 t2->nd = 1; 2729 last = t1; 2730 } 2731 } else if (t1->to != -1) { 2732 /* 2733 * do the closure in case of remaining specific 2734 * epsilon transitions like choices or all 2735 */ 2736 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2737 t2->to, t2->atom); 2738 /* don't shortcut the computation so all non deterministic 2739 transition get marked down 2740 if (ret == 0) 2741 return(0); 2742 */ 2743 if (ret == 0) { 2744 t1->nd = 1; 2745 /* t2->nd = 1; */ 2746 last = t1; 2747 } 2748 } 2749 } 2750 /* don't shortcut the computation so all non deterministic 2751 transition get marked down 2752 if (ret == 0) 2753 break; */ 2754 } 2755 2756 /* 2757 * mark specifically the last non-deterministic transition 2758 * from a state since there is no need to set-up rollback 2759 * from it 2760 */ 2761 if (last != NULL) { 2762 last->nd = 2; 2763 } 2764 2765 /* don't shortcut the computation so all non deterministic 2766 transition get marked down 2767 if (ret == 0) 2768 break; */ 2769 } 2770 2771 ctxt->determinist = ret; 2772 return(ret); 2773 } 2774 2775 /************************************************************************ 2776 * * 2777 * Routines to check input against transition atoms * 2778 * * 2779 ************************************************************************/ 2780 2781 static int 2782 xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg, 2783 int start, int end, const xmlChar *blockName) { 2784 int ret = 0; 2785 2786 switch (type) { 2787 case XML_REGEXP_STRING: 2788 case XML_REGEXP_SUBREG: 2789 case XML_REGEXP_RANGES: 2790 case XML_REGEXP_EPSILON: 2791 return(-1); 2792 case XML_REGEXP_ANYCHAR: 2793 ret = ((codepoint != '\n') && (codepoint != '\r')); 2794 break; 2795 case XML_REGEXP_CHARVAL: 2796 ret = ((codepoint >= start) && (codepoint <= end)); 2797 break; 2798 case XML_REGEXP_NOTSPACE: 2799 neg = !neg; 2800 case XML_REGEXP_ANYSPACE: 2801 ret = ((codepoint == '\n') || (codepoint == '\r') || 2802 (codepoint == '\t') || (codepoint == ' ')); 2803 break; 2804 case XML_REGEXP_NOTINITNAME: 2805 neg = !neg; 2806 case XML_REGEXP_INITNAME: 2807 ret = (IS_LETTER(codepoint) || 2808 (codepoint == '_') || (codepoint == ':')); 2809 break; 2810 case XML_REGEXP_NOTNAMECHAR: 2811 neg = !neg; 2812 case XML_REGEXP_NAMECHAR: 2813 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) || 2814 (codepoint == '.') || (codepoint == '-') || 2815 (codepoint == '_') || (codepoint == ':') || 2816 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint)); 2817 break; 2818 case XML_REGEXP_NOTDECIMAL: 2819 neg = !neg; 2820 case XML_REGEXP_DECIMAL: 2821 ret = xmlUCSIsCatNd(codepoint); 2822 break; 2823 case XML_REGEXP_REALCHAR: 2824 neg = !neg; 2825 case XML_REGEXP_NOTREALCHAR: 2826 ret = xmlUCSIsCatP(codepoint); 2827 if (ret == 0) 2828 ret = xmlUCSIsCatZ(codepoint); 2829 if (ret == 0) 2830 ret = xmlUCSIsCatC(codepoint); 2831 break; 2832 case XML_REGEXP_LETTER: 2833 ret = xmlUCSIsCatL(codepoint); 2834 break; 2835 case XML_REGEXP_LETTER_UPPERCASE: 2836 ret = xmlUCSIsCatLu(codepoint); 2837 break; 2838 case XML_REGEXP_LETTER_LOWERCASE: 2839 ret = xmlUCSIsCatLl(codepoint); 2840 break; 2841 case XML_REGEXP_LETTER_TITLECASE: 2842 ret = xmlUCSIsCatLt(codepoint); 2843 break; 2844 case XML_REGEXP_LETTER_MODIFIER: 2845 ret = xmlUCSIsCatLm(codepoint); 2846 break; 2847 case XML_REGEXP_LETTER_OTHERS: 2848 ret = xmlUCSIsCatLo(codepoint); 2849 break; 2850 case XML_REGEXP_MARK: 2851 ret = xmlUCSIsCatM(codepoint); 2852 break; 2853 case XML_REGEXP_MARK_NONSPACING: 2854 ret = xmlUCSIsCatMn(codepoint); 2855 break; 2856 case XML_REGEXP_MARK_SPACECOMBINING: 2857 ret = xmlUCSIsCatMc(codepoint); 2858 break; 2859 case XML_REGEXP_MARK_ENCLOSING: 2860 ret = xmlUCSIsCatMe(codepoint); 2861 break; 2862 case XML_REGEXP_NUMBER: 2863 ret = xmlUCSIsCatN(codepoint); 2864 break; 2865 case XML_REGEXP_NUMBER_DECIMAL: 2866 ret = xmlUCSIsCatNd(codepoint); 2867 break; 2868 case XML_REGEXP_NUMBER_LETTER: 2869 ret = xmlUCSIsCatNl(codepoint); 2870 break; 2871 case XML_REGEXP_NUMBER_OTHERS: 2872 ret = xmlUCSIsCatNo(codepoint); 2873 break; 2874 case XML_REGEXP_PUNCT: 2875 ret = xmlUCSIsCatP(codepoint); 2876 break; 2877 case XML_REGEXP_PUNCT_CONNECTOR: 2878 ret = xmlUCSIsCatPc(codepoint); 2879 break; 2880 case XML_REGEXP_PUNCT_DASH: 2881 ret = xmlUCSIsCatPd(codepoint); 2882 break; 2883 case XML_REGEXP_PUNCT_OPEN: 2884 ret = xmlUCSIsCatPs(codepoint); 2885 break; 2886 case XML_REGEXP_PUNCT_CLOSE: 2887 ret = xmlUCSIsCatPe(codepoint); 2888 break; 2889 case XML_REGEXP_PUNCT_INITQUOTE: 2890 ret = xmlUCSIsCatPi(codepoint); 2891 break; 2892 case XML_REGEXP_PUNCT_FINQUOTE: 2893 ret = xmlUCSIsCatPf(codepoint); 2894 break; 2895 case XML_REGEXP_PUNCT_OTHERS: 2896 ret = xmlUCSIsCatPo(codepoint); 2897 break; 2898 case XML_REGEXP_SEPAR: 2899 ret = xmlUCSIsCatZ(codepoint); 2900 break; 2901 case XML_REGEXP_SEPAR_SPACE: 2902 ret = xmlUCSIsCatZs(codepoint); 2903 break; 2904 case XML_REGEXP_SEPAR_LINE: 2905 ret = xmlUCSIsCatZl(codepoint); 2906 break; 2907 case XML_REGEXP_SEPAR_PARA: 2908 ret = xmlUCSIsCatZp(codepoint); 2909 break; 2910 case XML_REGEXP_SYMBOL: 2911 ret = xmlUCSIsCatS(codepoint); 2912 break; 2913 case XML_REGEXP_SYMBOL_MATH: 2914 ret = xmlUCSIsCatSm(codepoint); 2915 break; 2916 case XML_REGEXP_SYMBOL_CURRENCY: 2917 ret = xmlUCSIsCatSc(codepoint); 2918 break; 2919 case XML_REGEXP_SYMBOL_MODIFIER: 2920 ret = xmlUCSIsCatSk(codepoint); 2921 break; 2922 case XML_REGEXP_SYMBOL_OTHERS: 2923 ret = xmlUCSIsCatSo(codepoint); 2924 break; 2925 case XML_REGEXP_OTHER: 2926 ret = xmlUCSIsCatC(codepoint); 2927 break; 2928 case XML_REGEXP_OTHER_CONTROL: 2929 ret = xmlUCSIsCatCc(codepoint); 2930 break; 2931 case XML_REGEXP_OTHER_FORMAT: 2932 ret = xmlUCSIsCatCf(codepoint); 2933 break; 2934 case XML_REGEXP_OTHER_PRIVATE: 2935 ret = xmlUCSIsCatCo(codepoint); 2936 break; 2937 case XML_REGEXP_OTHER_NA: 2938 /* ret = xmlUCSIsCatCn(codepoint); */ 2939 /* Seems it doesn't exist anymore in recent Unicode releases */ 2940 ret = 0; 2941 break; 2942 case XML_REGEXP_BLOCK_NAME: 2943 ret = xmlUCSIsBlock(codepoint, (const char *) blockName); 2944 break; 2945 } 2946 if (neg) 2947 return(!ret); 2948 return(ret); 2949 } 2950 2951 static int 2952 xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) { 2953 int i, ret = 0; 2954 xmlRegRangePtr range; 2955 2956 if ((atom == NULL) || (!IS_CHAR(codepoint))) 2957 return(-1); 2958 2959 switch (atom->type) { 2960 case XML_REGEXP_SUBREG: 2961 case XML_REGEXP_EPSILON: 2962 return(-1); 2963 case XML_REGEXP_CHARVAL: 2964 return(codepoint == atom->codepoint); 2965 case XML_REGEXP_RANGES: { 2966 int accept = 0; 2967 2968 for (i = 0;i < atom->nbRanges;i++) { 2969 range = atom->ranges[i]; 2970 if (range->neg == 2) { 2971 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2972 0, range->start, range->end, 2973 range->blockName); 2974 if (ret != 0) 2975 return(0); /* excluded char */ 2976 } else if (range->neg) { 2977 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2978 0, range->start, range->end, 2979 range->blockName); 2980 if (ret == 0) 2981 accept = 1; 2982 else 2983 return(0); 2984 } else { 2985 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2986 0, range->start, range->end, 2987 range->blockName); 2988 if (ret != 0) 2989 accept = 1; /* might still be excluded */ 2990 } 2991 } 2992 return(accept); 2993 } 2994 case XML_REGEXP_STRING: 2995 printf("TODO: XML_REGEXP_STRING\n"); 2996 return(-1); 2997 case XML_REGEXP_ANYCHAR: 2998 case XML_REGEXP_ANYSPACE: 2999 case XML_REGEXP_NOTSPACE: 3000 case XML_REGEXP_INITNAME: 3001 case XML_REGEXP_NOTINITNAME: 3002 case XML_REGEXP_NAMECHAR: 3003 case XML_REGEXP_NOTNAMECHAR: 3004 case XML_REGEXP_DECIMAL: 3005 case XML_REGEXP_NOTDECIMAL: 3006 case XML_REGEXP_REALCHAR: 3007 case XML_REGEXP_NOTREALCHAR: 3008 case XML_REGEXP_LETTER: 3009 case XML_REGEXP_LETTER_UPPERCASE: 3010 case XML_REGEXP_LETTER_LOWERCASE: 3011 case XML_REGEXP_LETTER_TITLECASE: 3012 case XML_REGEXP_LETTER_MODIFIER: 3013 case XML_REGEXP_LETTER_OTHERS: 3014 case XML_REGEXP_MARK: 3015 case XML_REGEXP_MARK_NONSPACING: 3016 case XML_REGEXP_MARK_SPACECOMBINING: 3017 case XML_REGEXP_MARK_ENCLOSING: 3018 case XML_REGEXP_NUMBER: 3019 case XML_REGEXP_NUMBER_DECIMAL: 3020 case XML_REGEXP_NUMBER_LETTER: 3021 case XML_REGEXP_NUMBER_OTHERS: 3022 case XML_REGEXP_PUNCT: 3023 case XML_REGEXP_PUNCT_CONNECTOR: 3024 case XML_REGEXP_PUNCT_DASH: 3025 case XML_REGEXP_PUNCT_OPEN: 3026 case XML_REGEXP_PUNCT_CLOSE: 3027 case XML_REGEXP_PUNCT_INITQUOTE: 3028 case XML_REGEXP_PUNCT_FINQUOTE: 3029 case XML_REGEXP_PUNCT_OTHERS: 3030 case XML_REGEXP_SEPAR: 3031 case XML_REGEXP_SEPAR_SPACE: 3032 case XML_REGEXP_SEPAR_LINE: 3033 case XML_REGEXP_SEPAR_PARA: 3034 case XML_REGEXP_SYMBOL: 3035 case XML_REGEXP_SYMBOL_MATH: 3036 case XML_REGEXP_SYMBOL_CURRENCY: 3037 case XML_REGEXP_SYMBOL_MODIFIER: 3038 case XML_REGEXP_SYMBOL_OTHERS: 3039 case XML_REGEXP_OTHER: 3040 case XML_REGEXP_OTHER_CONTROL: 3041 case XML_REGEXP_OTHER_FORMAT: 3042 case XML_REGEXP_OTHER_PRIVATE: 3043 case XML_REGEXP_OTHER_NA: 3044 case XML_REGEXP_BLOCK_NAME: 3045 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0, 3046 (const xmlChar *)atom->valuep); 3047 if (atom->neg) 3048 ret = !ret; 3049 break; 3050 } 3051 return(ret); 3052 } 3053 3054 /************************************************************************ 3055 * * 3056 * Saving and restoring state of an execution context * 3057 * * 3058 ************************************************************************/ 3059 3060 #ifdef DEBUG_REGEXP_EXEC 3061 static void 3062 xmlFARegDebugExec(xmlRegExecCtxtPtr exec) { 3063 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index); 3064 if (exec->inputStack != NULL) { 3065 int i; 3066 printf(": "); 3067 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) 3068 printf("%s ", (const char *) 3069 exec->inputStack[exec->inputStackNr - (i + 1)].value); 3070 } else { 3071 printf(": %s", &(exec->inputString[exec->index])); 3072 } 3073 printf("\n"); 3074 } 3075 #endif 3076 3077 static void 3078 xmlFARegExecSave(xmlRegExecCtxtPtr exec) { 3079 #ifdef DEBUG_REGEXP_EXEC 3080 printf("saving "); 3081 exec->transno++; 3082 xmlFARegDebugExec(exec); 3083 exec->transno--; 3084 #endif 3085 #ifdef MAX_PUSH 3086 if (exec->nbPush > MAX_PUSH) { 3087 return; 3088 } 3089 exec->nbPush++; 3090 #endif 3091 3092 if (exec->maxRollbacks == 0) { 3093 exec->maxRollbacks = 4; 3094 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks * 3095 sizeof(xmlRegExecRollback)); 3096 if (exec->rollbacks == NULL) { 3097 xmlRegexpErrMemory(NULL, "saving regexp"); 3098 exec->maxRollbacks = 0; 3099 return; 3100 } 3101 memset(exec->rollbacks, 0, 3102 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 3103 } else if (exec->nbRollbacks >= exec->maxRollbacks) { 3104 xmlRegExecRollback *tmp; 3105 int len = exec->maxRollbacks; 3106 3107 exec->maxRollbacks *= 2; 3108 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks, 3109 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 3110 if (tmp == NULL) { 3111 xmlRegexpErrMemory(NULL, "saving regexp"); 3112 exec->maxRollbacks /= 2; 3113 return; 3114 } 3115 exec->rollbacks = tmp; 3116 tmp = &exec->rollbacks[len]; 3117 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback)); 3118 } 3119 exec->rollbacks[exec->nbRollbacks].state = exec->state; 3120 exec->rollbacks[exec->nbRollbacks].index = exec->index; 3121 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1; 3122 if (exec->comp->nbCounters > 0) { 3123 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3124 exec->rollbacks[exec->nbRollbacks].counts = (int *) 3125 xmlMalloc(exec->comp->nbCounters * sizeof(int)); 3126 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3127 xmlRegexpErrMemory(NULL, "saving regexp"); 3128 exec->status = -5; 3129 return; 3130 } 3131 } 3132 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts, 3133 exec->comp->nbCounters * sizeof(int)); 3134 } 3135 exec->nbRollbacks++; 3136 } 3137 3138 static void 3139 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) { 3140 if (exec->nbRollbacks <= 0) { 3141 exec->status = -1; 3142 #ifdef DEBUG_REGEXP_EXEC 3143 printf("rollback failed on empty stack\n"); 3144 #endif 3145 return; 3146 } 3147 exec->nbRollbacks--; 3148 exec->state = exec->rollbacks[exec->nbRollbacks].state; 3149 exec->index = exec->rollbacks[exec->nbRollbacks].index; 3150 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch; 3151 if (exec->comp->nbCounters > 0) { 3152 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3153 fprintf(stderr, "exec save: allocation failed"); 3154 exec->status = -6; 3155 return; 3156 } 3157 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts, 3158 exec->comp->nbCounters * sizeof(int)); 3159 } 3160 3161 #ifdef DEBUG_REGEXP_EXEC 3162 printf("restored "); 3163 xmlFARegDebugExec(exec); 3164 #endif 3165 } 3166 3167 /************************************************************************ 3168 * * 3169 * Verifier, running an input against a compiled regexp * 3170 * * 3171 ************************************************************************/ 3172 3173 static int 3174 xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { 3175 xmlRegExecCtxt execval; 3176 xmlRegExecCtxtPtr exec = &execval; 3177 int ret, codepoint = 0, len, deter; 3178 3179 exec->inputString = content; 3180 exec->index = 0; 3181 exec->nbPush = 0; 3182 exec->determinist = 1; 3183 exec->maxRollbacks = 0; 3184 exec->nbRollbacks = 0; 3185 exec->rollbacks = NULL; 3186 exec->status = 0; 3187 exec->comp = comp; 3188 exec->state = comp->states[0]; 3189 exec->transno = 0; 3190 exec->transcount = 0; 3191 exec->inputStack = NULL; 3192 exec->inputStackMax = 0; 3193 if (comp->nbCounters > 0) { 3194 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 3195 if (exec->counts == NULL) { 3196 xmlRegexpErrMemory(NULL, "running regexp"); 3197 return(-1); 3198 } 3199 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 3200 } else 3201 exec->counts = NULL; 3202 while ((exec->status == 0) && 3203 ((exec->inputString[exec->index] != 0) || 3204 ((exec->state != NULL) && 3205 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3206 xmlRegTransPtr trans; 3207 xmlRegAtomPtr atom; 3208 3209 /* 3210 * If end of input on non-terminal state, rollback, however we may 3211 * still have epsilon like transition for counted transitions 3212 * on counters, in that case don't break too early. Additionally, 3213 * if we are working on a range like "AB{0,2}", where B is not present, 3214 * we don't want to break. 3215 */ 3216 len = 1; 3217 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) { 3218 /* 3219 * if there is a transition, we must check if 3220 * atom allows minOccurs of 0 3221 */ 3222 if (exec->transno < exec->state->nbTrans) { 3223 trans = &exec->state->trans[exec->transno]; 3224 if (trans->to >=0) { 3225 atom = trans->atom; 3226 if (!((atom->min == 0) && (atom->max > 0))) 3227 goto rollback; 3228 } 3229 } else 3230 goto rollback; 3231 } 3232 3233 exec->transcount = 0; 3234 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3235 trans = &exec->state->trans[exec->transno]; 3236 if (trans->to < 0) 3237 continue; 3238 atom = trans->atom; 3239 ret = 0; 3240 deter = 1; 3241 if (trans->count >= 0) { 3242 int count; 3243 xmlRegCounterPtr counter; 3244 3245 if (exec->counts == NULL) { 3246 exec->status = -1; 3247 goto error; 3248 } 3249 /* 3250 * A counted transition. 3251 */ 3252 3253 count = exec->counts[trans->count]; 3254 counter = &exec->comp->counters[trans->count]; 3255 #ifdef DEBUG_REGEXP_EXEC 3256 printf("testing count %d: val %d, min %d, max %d\n", 3257 trans->count, count, counter->min, counter->max); 3258 #endif 3259 ret = ((count >= counter->min) && (count <= counter->max)); 3260 if ((ret) && (counter->min != counter->max)) 3261 deter = 0; 3262 } else if (atom == NULL) { 3263 fprintf(stderr, "epsilon transition left at runtime\n"); 3264 exec->status = -2; 3265 break; 3266 } else if (exec->inputString[exec->index] != 0) { 3267 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 3268 ret = xmlRegCheckCharacter(atom, codepoint); 3269 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) { 3270 xmlRegStatePtr to = comp->states[trans->to]; 3271 3272 /* 3273 * this is a multiple input sequence 3274 * If there is a counter associated increment it now. 3275 * before potentially saving and rollback 3276 * do not increment if the counter is already over the 3277 * maximum limit in which case get to next transition 3278 */ 3279 if (trans->counter >= 0) { 3280 xmlRegCounterPtr counter; 3281 3282 if ((exec->counts == NULL) || 3283 (exec->comp == NULL) || 3284 (exec->comp->counters == NULL)) { 3285 exec->status = -1; 3286 goto error; 3287 } 3288 counter = &exec->comp->counters[trans->counter]; 3289 if (exec->counts[trans->counter] >= counter->max) 3290 continue; /* for loop on transitions */ 3291 3292 #ifdef DEBUG_REGEXP_EXEC 3293 printf("Increasing count %d\n", trans->counter); 3294 #endif 3295 exec->counts[trans->counter]++; 3296 } 3297 if (exec->state->nbTrans > exec->transno + 1) { 3298 xmlFARegExecSave(exec); 3299 } 3300 exec->transcount = 1; 3301 do { 3302 /* 3303 * Try to progress as much as possible on the input 3304 */ 3305 if (exec->transcount == atom->max) { 3306 break; 3307 } 3308 exec->index += len; 3309 /* 3310 * End of input: stop here 3311 */ 3312 if (exec->inputString[exec->index] == 0) { 3313 exec->index -= len; 3314 break; 3315 } 3316 if (exec->transcount >= atom->min) { 3317 int transno = exec->transno; 3318 xmlRegStatePtr state = exec->state; 3319 3320 /* 3321 * The transition is acceptable save it 3322 */ 3323 exec->transno = -1; /* trick */ 3324 exec->state = to; 3325 xmlFARegExecSave(exec); 3326 exec->transno = transno; 3327 exec->state = state; 3328 } 3329 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 3330 len); 3331 ret = xmlRegCheckCharacter(atom, codepoint); 3332 exec->transcount++; 3333 } while (ret == 1); 3334 if (exec->transcount < atom->min) 3335 ret = 0; 3336 3337 /* 3338 * If the last check failed but one transition was found 3339 * possible, rollback 3340 */ 3341 if (ret < 0) 3342 ret = 0; 3343 if (ret == 0) { 3344 goto rollback; 3345 } 3346 if (trans->counter >= 0) { 3347 if (exec->counts == NULL) { 3348 exec->status = -1; 3349 goto error; 3350 } 3351 #ifdef DEBUG_REGEXP_EXEC 3352 printf("Decreasing count %d\n", trans->counter); 3353 #endif 3354 exec->counts[trans->counter]--; 3355 } 3356 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) { 3357 /* 3358 * we don't match on the codepoint, but minOccurs of 0 3359 * says that's ok. Setting len to 0 inhibits stepping 3360 * over the codepoint. 3361 */ 3362 exec->transcount = 1; 3363 len = 0; 3364 ret = 1; 3365 } 3366 } else if ((atom->min == 0) && (atom->max > 0)) { 3367 /* another spot to match when minOccurs is 0 */ 3368 exec->transcount = 1; 3369 len = 0; 3370 ret = 1; 3371 } 3372 if (ret == 1) { 3373 if ((trans->nd == 1) || 3374 ((trans->count >= 0) && (deter == 0) && 3375 (exec->state->nbTrans > exec->transno + 1))) { 3376 #ifdef DEBUG_REGEXP_EXEC 3377 if (trans->nd == 1) 3378 printf("Saving on nd transition atom %d for %c at %d\n", 3379 trans->atom->no, codepoint, exec->index); 3380 else 3381 printf("Saving on counted transition count %d for %c at %d\n", 3382 trans->count, codepoint, exec->index); 3383 #endif 3384 xmlFARegExecSave(exec); 3385 } 3386 if (trans->counter >= 0) { 3387 xmlRegCounterPtr counter; 3388 3389 /* make sure we don't go over the counter maximum value */ 3390 if ((exec->counts == NULL) || 3391 (exec->comp == NULL) || 3392 (exec->comp->counters == NULL)) { 3393 exec->status = -1; 3394 goto error; 3395 } 3396 counter = &exec->comp->counters[trans->counter]; 3397 if (exec->counts[trans->counter] >= counter->max) 3398 continue; /* for loop on transitions */ 3399 #ifdef DEBUG_REGEXP_EXEC 3400 printf("Increasing count %d\n", trans->counter); 3401 #endif 3402 exec->counts[trans->counter]++; 3403 } 3404 if ((trans->count >= 0) && 3405 (trans->count < REGEXP_ALL_COUNTER)) { 3406 if (exec->counts == NULL) { 3407 exec->status = -1; 3408 goto error; 3409 } 3410 #ifdef DEBUG_REGEXP_EXEC 3411 printf("resetting count %d on transition\n", 3412 trans->count); 3413 #endif 3414 exec->counts[trans->count] = 0; 3415 } 3416 #ifdef DEBUG_REGEXP_EXEC 3417 printf("entering state %d\n", trans->to); 3418 #endif 3419 exec->state = comp->states[trans->to]; 3420 exec->transno = 0; 3421 if (trans->atom != NULL) { 3422 exec->index += len; 3423 } 3424 goto progress; 3425 } else if (ret < 0) { 3426 exec->status = -4; 3427 break; 3428 } 3429 } 3430 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 3431 rollback: 3432 /* 3433 * Failed to find a way out 3434 */ 3435 exec->determinist = 0; 3436 #ifdef DEBUG_REGEXP_EXEC 3437 printf("rollback from state %d on %d:%c\n", exec->state->no, 3438 codepoint,codepoint); 3439 #endif 3440 xmlFARegExecRollBack(exec); 3441 } 3442 progress: 3443 continue; 3444 } 3445 error: 3446 if (exec->rollbacks != NULL) { 3447 if (exec->counts != NULL) { 3448 int i; 3449 3450 for (i = 0;i < exec->maxRollbacks;i++) 3451 if (exec->rollbacks[i].counts != NULL) 3452 xmlFree(exec->rollbacks[i].counts); 3453 } 3454 xmlFree(exec->rollbacks); 3455 } 3456 if (exec->counts != NULL) 3457 xmlFree(exec->counts); 3458 if (exec->status == 0) 3459 return(1); 3460 if (exec->status == -1) { 3461 if (exec->nbPush > MAX_PUSH) 3462 return(-1); 3463 return(0); 3464 } 3465 return(exec->status); 3466 } 3467 3468 /************************************************************************ 3469 * * 3470 * Progressive interface to the verifier one atom at a time * 3471 * * 3472 ************************************************************************/ 3473 #ifdef DEBUG_ERR 3474 static void testerr(xmlRegExecCtxtPtr exec); 3475 #endif 3476 3477 /** 3478 * xmlRegNewExecCtxt: 3479 * @comp: a precompiled regular expression 3480 * @callback: a callback function used for handling progresses in the 3481 * automata matching phase 3482 * @data: the context data associated to the callback in this context 3483 * 3484 * Build a context used for progressive evaluation of a regexp. 3485 * 3486 * Returns the new context 3487 */ 3488 xmlRegExecCtxtPtr 3489 xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) { 3490 xmlRegExecCtxtPtr exec; 3491 3492 if (comp == NULL) 3493 return(NULL); 3494 if ((comp->compact == NULL) && (comp->states == NULL)) 3495 return(NULL); 3496 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt)); 3497 if (exec == NULL) { 3498 xmlRegexpErrMemory(NULL, "creating execution context"); 3499 return(NULL); 3500 } 3501 memset(exec, 0, sizeof(xmlRegExecCtxt)); 3502 exec->inputString = NULL; 3503 exec->index = 0; 3504 exec->determinist = 1; 3505 exec->maxRollbacks = 0; 3506 exec->nbRollbacks = 0; 3507 exec->rollbacks = NULL; 3508 exec->status = 0; 3509 exec->comp = comp; 3510 if (comp->compact == NULL) 3511 exec->state = comp->states[0]; 3512 exec->transno = 0; 3513 exec->transcount = 0; 3514 exec->callback = callback; 3515 exec->data = data; 3516 if (comp->nbCounters > 0) { 3517 /* 3518 * For error handling, exec->counts is allocated twice the size 3519 * the second half is used to store the data in case of rollback 3520 */ 3521 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int) 3522 * 2); 3523 if (exec->counts == NULL) { 3524 xmlRegexpErrMemory(NULL, "creating execution context"); 3525 xmlFree(exec); 3526 return(NULL); 3527 } 3528 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2); 3529 exec->errCounts = &exec->counts[comp->nbCounters]; 3530 } else { 3531 exec->counts = NULL; 3532 exec->errCounts = NULL; 3533 } 3534 exec->inputStackMax = 0; 3535 exec->inputStackNr = 0; 3536 exec->inputStack = NULL; 3537 exec->errStateNo = -1; 3538 exec->errString = NULL; 3539 exec->nbPush = 0; 3540 return(exec); 3541 } 3542 3543 /** 3544 * xmlRegFreeExecCtxt: 3545 * @exec: a regular expression evaulation context 3546 * 3547 * Free the structures associated to a regular expression evaulation context. 3548 */ 3549 void 3550 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) { 3551 if (exec == NULL) 3552 return; 3553 3554 if (exec->rollbacks != NULL) { 3555 if (exec->counts != NULL) { 3556 int i; 3557 3558 for (i = 0;i < exec->maxRollbacks;i++) 3559 if (exec->rollbacks[i].counts != NULL) 3560 xmlFree(exec->rollbacks[i].counts); 3561 } 3562 xmlFree(exec->rollbacks); 3563 } 3564 if (exec->counts != NULL) 3565 xmlFree(exec->counts); 3566 if (exec->inputStack != NULL) { 3567 int i; 3568 3569 for (i = 0;i < exec->inputStackNr;i++) { 3570 if (exec->inputStack[i].value != NULL) 3571 xmlFree(exec->inputStack[i].value); 3572 } 3573 xmlFree(exec->inputStack); 3574 } 3575 if (exec->errString != NULL) 3576 xmlFree(exec->errString); 3577 xmlFree(exec); 3578 } 3579 3580 static void 3581 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, 3582 void *data) { 3583 #ifdef DEBUG_PUSH 3584 printf("saving value: %d:%s\n", exec->inputStackNr, value); 3585 #endif 3586 if (exec->inputStackMax == 0) { 3587 exec->inputStackMax = 4; 3588 exec->inputStack = (xmlRegInputTokenPtr) 3589 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken)); 3590 if (exec->inputStack == NULL) { 3591 xmlRegexpErrMemory(NULL, "pushing input string"); 3592 exec->inputStackMax = 0; 3593 return; 3594 } 3595 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) { 3596 xmlRegInputTokenPtr tmp; 3597 3598 exec->inputStackMax *= 2; 3599 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack, 3600 exec->inputStackMax * sizeof(xmlRegInputToken)); 3601 if (tmp == NULL) { 3602 xmlRegexpErrMemory(NULL, "pushing input string"); 3603 exec->inputStackMax /= 2; 3604 return; 3605 } 3606 exec->inputStack = tmp; 3607 } 3608 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value); 3609 exec->inputStack[exec->inputStackNr].data = data; 3610 exec->inputStackNr++; 3611 exec->inputStack[exec->inputStackNr].value = NULL; 3612 exec->inputStack[exec->inputStackNr].data = NULL; 3613 } 3614 3615 /** 3616 * xmlRegStrEqualWildcard: 3617 * @expStr: the string to be evaluated 3618 * @valStr: the validation string 3619 * 3620 * Checks if both strings are equal or have the same content. "*" 3621 * can be used as a wildcard in @valStr; "|" is used as a seperator of 3622 * substrings in both @expStr and @valStr. 3623 * 3624 * Returns 1 if the comparison is satisfied and the number of substrings 3625 * is equal, 0 otherwise. 3626 */ 3627 3628 static int 3629 xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) { 3630 if (expStr == valStr) return(1); 3631 if (expStr == NULL) return(0); 3632 if (valStr == NULL) return(0); 3633 do { 3634 /* 3635 * Eval if we have a wildcard for the current item. 3636 */ 3637 if (*expStr != *valStr) { 3638 /* if one of them starts with a wildcard make valStr be it */ 3639 if (*valStr == '*') { 3640 const xmlChar *tmp; 3641 3642 tmp = valStr; 3643 valStr = expStr; 3644 expStr = tmp; 3645 } 3646 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) { 3647 do { 3648 if (*valStr == XML_REG_STRING_SEPARATOR) 3649 break; 3650 valStr++; 3651 } while (*valStr != 0); 3652 continue; 3653 } else 3654 return(0); 3655 } 3656 expStr++; 3657 valStr++; 3658 } while (*valStr != 0); 3659 if (*expStr != 0) 3660 return (0); 3661 else 3662 return (1); 3663 } 3664 3665 /** 3666 * xmlRegCompactPushString: 3667 * @exec: a regexp execution context 3668 * @comp: the precompiled exec with a compact table 3669 * @value: a string token input 3670 * @data: data associated to the token to reuse in callbacks 3671 * 3672 * Push one input token in the execution context 3673 * 3674 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3675 * a negative value in case of error. 3676 */ 3677 static int 3678 xmlRegCompactPushString(xmlRegExecCtxtPtr exec, 3679 xmlRegexpPtr comp, 3680 const xmlChar *value, 3681 void *data) { 3682 int state = exec->index; 3683 int i, target; 3684 3685 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL)) 3686 return(-1); 3687 3688 if (value == NULL) { 3689 /* 3690 * are we at a final state ? 3691 */ 3692 if (comp->compact[state * (comp->nbstrings + 1)] == 3693 XML_REGEXP_FINAL_STATE) 3694 return(1); 3695 return(0); 3696 } 3697 3698 #ifdef DEBUG_PUSH 3699 printf("value pushed: %s\n", value); 3700 #endif 3701 3702 /* 3703 * Examine all outside transitions from current state 3704 */ 3705 for (i = 0;i < comp->nbstrings;i++) { 3706 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 3707 if ((target > 0) && (target <= comp->nbstates)) { 3708 target--; /* to avoid 0 */ 3709 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) { 3710 exec->index = target; 3711 if ((exec->callback != NULL) && (comp->transdata != NULL)) { 3712 exec->callback(exec->data, value, 3713 comp->transdata[state * comp->nbstrings + i], data); 3714 } 3715 #ifdef DEBUG_PUSH 3716 printf("entering state %d\n", target); 3717 #endif 3718 if (comp->compact[target * (comp->nbstrings + 1)] == 3719 XML_REGEXP_SINK_STATE) 3720 goto error; 3721 3722 if (comp->compact[target * (comp->nbstrings + 1)] == 3723 XML_REGEXP_FINAL_STATE) 3724 return(1); 3725 return(0); 3726 } 3727 } 3728 } 3729 /* 3730 * Failed to find an exit transition out from current state for the 3731 * current token 3732 */ 3733 #ifdef DEBUG_PUSH 3734 printf("failed to find a transition for %s on state %d\n", value, state); 3735 #endif 3736 error: 3737 if (exec->errString != NULL) 3738 xmlFree(exec->errString); 3739 exec->errString = xmlStrdup(value); 3740 exec->errStateNo = state; 3741 exec->status = -1; 3742 #ifdef DEBUG_ERR 3743 testerr(exec); 3744 #endif 3745 return(-1); 3746 } 3747 3748 /** 3749 * xmlRegExecPushStringInternal: 3750 * @exec: a regexp execution context or NULL to indicate the end 3751 * @value: a string token input 3752 * @data: data associated to the token to reuse in callbacks 3753 * @compound: value was assembled from 2 strings 3754 * 3755 * Push one input token in the execution context 3756 * 3757 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3758 * a negative value in case of error. 3759 */ 3760 static int 3761 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value, 3762 void *data, int compound) { 3763 xmlRegTransPtr trans; 3764 xmlRegAtomPtr atom; 3765 int ret; 3766 int final = 0; 3767 int progress = 1; 3768 3769 if (exec == NULL) 3770 return(-1); 3771 if (exec->comp == NULL) 3772 return(-1); 3773 if (exec->status != 0) 3774 return(exec->status); 3775 3776 if (exec->comp->compact != NULL) 3777 return(xmlRegCompactPushString(exec, exec->comp, value, data)); 3778 3779 if (value == NULL) { 3780 if (exec->state->type == XML_REGEXP_FINAL_STATE) 3781 return(1); 3782 final = 1; 3783 } 3784 3785 #ifdef DEBUG_PUSH 3786 printf("value pushed: %s\n", value); 3787 #endif 3788 /* 3789 * If we have an active rollback stack push the new value there 3790 * and get back to where we were left 3791 */ 3792 if ((value != NULL) && (exec->inputStackNr > 0)) { 3793 xmlFARegExecSaveInputString(exec, value, data); 3794 value = exec->inputStack[exec->index].value; 3795 data = exec->inputStack[exec->index].data; 3796 #ifdef DEBUG_PUSH 3797 printf("value loaded: %s\n", value); 3798 #endif 3799 } 3800 3801 while ((exec->status == 0) && 3802 ((value != NULL) || 3803 ((final == 1) && 3804 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3805 3806 /* 3807 * End of input on non-terminal state, rollback, however we may 3808 * still have epsilon like transition for counted transitions 3809 * on counters, in that case don't break too early. 3810 */ 3811 if ((value == NULL) && (exec->counts == NULL)) 3812 goto rollback; 3813 3814 exec->transcount = 0; 3815 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3816 trans = &exec->state->trans[exec->transno]; 3817 if (trans->to < 0) 3818 continue; 3819 atom = trans->atom; 3820 ret = 0; 3821 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 3822 int i; 3823 int count; 3824 xmlRegTransPtr t; 3825 xmlRegCounterPtr counter; 3826 3827 ret = 0; 3828 3829 #ifdef DEBUG_PUSH 3830 printf("testing all lax %d\n", trans->count); 3831 #endif 3832 /* 3833 * Check all counted transitions from the current state 3834 */ 3835 if ((value == NULL) && (final)) { 3836 ret = 1; 3837 } else if (value != NULL) { 3838 for (i = 0;i < exec->state->nbTrans;i++) { 3839 t = &exec->state->trans[i]; 3840 if ((t->counter < 0) || (t == trans)) 3841 continue; 3842 counter = &exec->comp->counters[t->counter]; 3843 count = exec->counts[t->counter]; 3844 if ((count < counter->max) && 3845 (t->atom != NULL) && 3846 (xmlStrEqual(value, t->atom->valuep))) { 3847 ret = 0; 3848 break; 3849 } 3850 if ((count >= counter->min) && 3851 (count < counter->max) && 3852 (t->atom != NULL) && 3853 (xmlStrEqual(value, t->atom->valuep))) { 3854 ret = 1; 3855 break; 3856 } 3857 } 3858 } 3859 } else if (trans->count == REGEXP_ALL_COUNTER) { 3860 int i; 3861 int count; 3862 xmlRegTransPtr t; 3863 xmlRegCounterPtr counter; 3864 3865 ret = 1; 3866 3867 #ifdef DEBUG_PUSH 3868 printf("testing all %d\n", trans->count); 3869 #endif 3870 /* 3871 * Check all counted transitions from the current state 3872 */ 3873 for (i = 0;i < exec->state->nbTrans;i++) { 3874 t = &exec->state->trans[i]; 3875 if ((t->counter < 0) || (t == trans)) 3876 continue; 3877 counter = &exec->comp->counters[t->counter]; 3878 count = exec->counts[t->counter]; 3879 if ((count < counter->min) || (count > counter->max)) { 3880 ret = 0; 3881 break; 3882 } 3883 } 3884 } else if (trans->count >= 0) { 3885 int count; 3886 xmlRegCounterPtr counter; 3887 3888 /* 3889 * A counted transition. 3890 */ 3891 3892 count = exec->counts[trans->count]; 3893 counter = &exec->comp->counters[trans->count]; 3894 #ifdef DEBUG_PUSH 3895 printf("testing count %d: val %d, min %d, max %d\n", 3896 trans->count, count, counter->min, counter->max); 3897 #endif 3898 ret = ((count >= counter->min) && (count <= counter->max)); 3899 } else if (atom == NULL) { 3900 fprintf(stderr, "epsilon transition left at runtime\n"); 3901 exec->status = -2; 3902 break; 3903 } else if (value != NULL) { 3904 ret = xmlRegStrEqualWildcard(atom->valuep, value); 3905 if (atom->neg) { 3906 ret = !ret; 3907 if (!compound) 3908 ret = 0; 3909 } 3910 if ((ret == 1) && (trans->counter >= 0)) { 3911 xmlRegCounterPtr counter; 3912 int count; 3913 3914 count = exec->counts[trans->counter]; 3915 counter = &exec->comp->counters[trans->counter]; 3916 if (count >= counter->max) 3917 ret = 0; 3918 } 3919 3920 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 3921 xmlRegStatePtr to = exec->comp->states[trans->to]; 3922 3923 /* 3924 * this is a multiple input sequence 3925 */ 3926 if (exec->state->nbTrans > exec->transno + 1) { 3927 if (exec->inputStackNr <= 0) { 3928 xmlFARegExecSaveInputString(exec, value, data); 3929 } 3930 xmlFARegExecSave(exec); 3931 } 3932 exec->transcount = 1; 3933 do { 3934 /* 3935 * Try to progress as much as possible on the input 3936 */ 3937 if (exec->transcount == atom->max) { 3938 break; 3939 } 3940 exec->index++; 3941 value = exec->inputStack[exec->index].value; 3942 data = exec->inputStack[exec->index].data; 3943 #ifdef DEBUG_PUSH 3944 printf("value loaded: %s\n", value); 3945 #endif 3946 3947 /* 3948 * End of input: stop here 3949 */ 3950 if (value == NULL) { 3951 exec->index --; 3952 break; 3953 } 3954 if (exec->transcount >= atom->min) { 3955 int transno = exec->transno; 3956 xmlRegStatePtr state = exec->state; 3957 3958 /* 3959 * The transition is acceptable save it 3960 */ 3961 exec->transno = -1; /* trick */ 3962 exec->state = to; 3963 if (exec->inputStackNr <= 0) { 3964 xmlFARegExecSaveInputString(exec, value, data); 3965 } 3966 xmlFARegExecSave(exec); 3967 exec->transno = transno; 3968 exec->state = state; 3969 } 3970 ret = xmlStrEqual(value, atom->valuep); 3971 exec->transcount++; 3972 } while (ret == 1); 3973 if (exec->transcount < atom->min) 3974 ret = 0; 3975 3976 /* 3977 * If the last check failed but one transition was found 3978 * possible, rollback 3979 */ 3980 if (ret < 0) 3981 ret = 0; 3982 if (ret == 0) { 3983 goto rollback; 3984 } 3985 } 3986 } 3987 if (ret == 1) { 3988 if ((exec->callback != NULL) && (atom != NULL) && 3989 (data != NULL)) { 3990 exec->callback(exec->data, atom->valuep, 3991 atom->data, data); 3992 } 3993 if (exec->state->nbTrans > exec->transno + 1) { 3994 if (exec->inputStackNr <= 0) { 3995 xmlFARegExecSaveInputString(exec, value, data); 3996 } 3997 xmlFARegExecSave(exec); 3998 } 3999 if (trans->counter >= 0) { 4000 #ifdef DEBUG_PUSH 4001 printf("Increasing count %d\n", trans->counter); 4002 #endif 4003 exec->counts[trans->counter]++; 4004 } 4005 if ((trans->count >= 0) && 4006 (trans->count < REGEXP_ALL_COUNTER)) { 4007 #ifdef DEBUG_REGEXP_EXEC 4008 printf("resetting count %d on transition\n", 4009 trans->count); 4010 #endif 4011 exec->counts[trans->count] = 0; 4012 } 4013 #ifdef DEBUG_PUSH 4014 printf("entering state %d\n", trans->to); 4015 #endif 4016 if ((exec->comp->states[trans->to] != NULL) && 4017 (exec->comp->states[trans->to]->type == 4018 XML_REGEXP_SINK_STATE)) { 4019 /* 4020 * entering a sink state, save the current state as error 4021 * state. 4022 */ 4023 if (exec->errString != NULL) 4024 xmlFree(exec->errString); 4025 exec->errString = xmlStrdup(value); 4026 exec->errState = exec->state; 4027 memcpy(exec->errCounts, exec->counts, 4028 exec->comp->nbCounters * sizeof(int)); 4029 } 4030 exec->state = exec->comp->states[trans->to]; 4031 exec->transno = 0; 4032 if (trans->atom != NULL) { 4033 if (exec->inputStack != NULL) { 4034 exec->index++; 4035 if (exec->index < exec->inputStackNr) { 4036 value = exec->inputStack[exec->index].value; 4037 data = exec->inputStack[exec->index].data; 4038 #ifdef DEBUG_PUSH 4039 printf("value loaded: %s\n", value); 4040 #endif 4041 } else { 4042 value = NULL; 4043 data = NULL; 4044 #ifdef DEBUG_PUSH 4045 printf("end of input\n"); 4046 #endif 4047 } 4048 } else { 4049 value = NULL; 4050 data = NULL; 4051 #ifdef DEBUG_PUSH 4052 printf("end of input\n"); 4053 #endif 4054 } 4055 } 4056 goto progress; 4057 } else if (ret < 0) { 4058 exec->status = -4; 4059 break; 4060 } 4061 } 4062 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4063 rollback: 4064 /* 4065 * if we didn't yet rollback on the current input 4066 * store the current state as the error state. 4067 */ 4068 if ((progress) && (exec->state != NULL) && 4069 (exec->state->type != XML_REGEXP_SINK_STATE)) { 4070 progress = 0; 4071 if (exec->errString != NULL) 4072 xmlFree(exec->errString); 4073 exec->errString = xmlStrdup(value); 4074 exec->errState = exec->state; 4075 memcpy(exec->errCounts, exec->counts, 4076 exec->comp->nbCounters * sizeof(int)); 4077 } 4078 4079 /* 4080 * Failed to find a way out 4081 */ 4082 exec->determinist = 0; 4083 xmlFARegExecRollBack(exec); 4084 if (exec->status == 0) { 4085 value = exec->inputStack[exec->index].value; 4086 data = exec->inputStack[exec->index].data; 4087 #ifdef DEBUG_PUSH 4088 printf("value loaded: %s\n", value); 4089 #endif 4090 } 4091 } 4092 continue; 4093 progress: 4094 progress = 1; 4095 continue; 4096 } 4097 if (exec->status == 0) { 4098 return(exec->state->type == XML_REGEXP_FINAL_STATE); 4099 } 4100 #ifdef DEBUG_ERR 4101 if (exec->status < 0) { 4102 testerr(exec); 4103 } 4104 #endif 4105 return(exec->status); 4106 } 4107 4108 /** 4109 * xmlRegExecPushString: 4110 * @exec: a regexp execution context or NULL to indicate the end 4111 * @value: a string token input 4112 * @data: data associated to the token to reuse in callbacks 4113 * 4114 * Push one input token in the execution context 4115 * 4116 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 4117 * a negative value in case of error. 4118 */ 4119 int 4120 xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, 4121 void *data) { 4122 return(xmlRegExecPushStringInternal(exec, value, data, 0)); 4123 } 4124 4125 /** 4126 * xmlRegExecPushString2: 4127 * @exec: a regexp execution context or NULL to indicate the end 4128 * @value: the first string token input 4129 * @value2: the second string token input 4130 * @data: data associated to the token to reuse in callbacks 4131 * 4132 * Push one input token in the execution context 4133 * 4134 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 4135 * a negative value in case of error. 4136 */ 4137 int 4138 xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value, 4139 const xmlChar *value2, void *data) { 4140 xmlChar buf[150]; 4141 int lenn, lenp, ret; 4142 xmlChar *str; 4143 4144 if (exec == NULL) 4145 return(-1); 4146 if (exec->comp == NULL) 4147 return(-1); 4148 if (exec->status != 0) 4149 return(exec->status); 4150 4151 if (value2 == NULL) 4152 return(xmlRegExecPushString(exec, value, data)); 4153 4154 lenn = strlen((char *) value2); 4155 lenp = strlen((char *) value); 4156 4157 if (150 < lenn + lenp + 2) { 4158 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 4159 if (str == NULL) { 4160 exec->status = -1; 4161 return(-1); 4162 } 4163 } else { 4164 str = buf; 4165 } 4166 memcpy(&str[0], value, lenp); 4167 str[lenp] = XML_REG_STRING_SEPARATOR; 4168 memcpy(&str[lenp + 1], value2, lenn); 4169 str[lenn + lenp + 1] = 0; 4170 4171 if (exec->comp->compact != NULL) 4172 ret = xmlRegCompactPushString(exec, exec->comp, str, data); 4173 else 4174 ret = xmlRegExecPushStringInternal(exec, str, data, 1); 4175 4176 if (str != buf) 4177 xmlFree(str); 4178 return(ret); 4179 } 4180 4181 /** 4182 * xmlRegExecGetValues: 4183 * @exec: a regexp execution context 4184 * @err: error extraction or normal one 4185 * @nbval: pointer to the number of accepted values IN/OUT 4186 * @nbneg: return number of negative transitions 4187 * @values: pointer to the array of acceptable values 4188 * @terminal: return value if this was a terminal state 4189 * 4190 * Extract informations from the regexp execution, internal routine to 4191 * implement xmlRegExecNextValues() and xmlRegExecErrInfo() 4192 * 4193 * Returns: 0 in case of success or -1 in case of error. 4194 */ 4195 static int 4196 xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err, 4197 int *nbval, int *nbneg, 4198 xmlChar **values, int *terminal) { 4199 int maxval; 4200 int nb = 0; 4201 4202 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) || 4203 (values == NULL) || (*nbval <= 0)) 4204 return(-1); 4205 4206 maxval = *nbval; 4207 *nbval = 0; 4208 *nbneg = 0; 4209 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) { 4210 xmlRegexpPtr comp; 4211 int target, i, state; 4212 4213 comp = exec->comp; 4214 4215 if (err) { 4216 if (exec->errStateNo == -1) return(-1); 4217 state = exec->errStateNo; 4218 } else { 4219 state = exec->index; 4220 } 4221 if (terminal != NULL) { 4222 if (comp->compact[state * (comp->nbstrings + 1)] == 4223 XML_REGEXP_FINAL_STATE) 4224 *terminal = 1; 4225 else 4226 *terminal = 0; 4227 } 4228 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4229 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4230 if ((target > 0) && (target <= comp->nbstates) && 4231 (comp->compact[(target - 1) * (comp->nbstrings + 1)] != 4232 XML_REGEXP_SINK_STATE)) { 4233 values[nb++] = comp->stringMap[i]; 4234 (*nbval)++; 4235 } 4236 } 4237 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4238 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4239 if ((target > 0) && (target <= comp->nbstates) && 4240 (comp->compact[(target - 1) * (comp->nbstrings + 1)] == 4241 XML_REGEXP_SINK_STATE)) { 4242 values[nb++] = comp->stringMap[i]; 4243 (*nbneg)++; 4244 } 4245 } 4246 } else { 4247 int transno; 4248 xmlRegTransPtr trans; 4249 xmlRegAtomPtr atom; 4250 xmlRegStatePtr state; 4251 4252 if (terminal != NULL) { 4253 if (exec->state->type == XML_REGEXP_FINAL_STATE) 4254 *terminal = 1; 4255 else 4256 *terminal = 0; 4257 } 4258 4259 if (err) { 4260 if (exec->errState == NULL) return(-1); 4261 state = exec->errState; 4262 } else { 4263 if (exec->state == NULL) return(-1); 4264 state = exec->state; 4265 } 4266 for (transno = 0; 4267 (transno < state->nbTrans) && (nb < maxval); 4268 transno++) { 4269 trans = &state->trans[transno]; 4270 if (trans->to < 0) 4271 continue; 4272 atom = trans->atom; 4273 if ((atom == NULL) || (atom->valuep == NULL)) 4274 continue; 4275 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4276 /* this should not be reached but ... */ 4277 TODO; 4278 } else if (trans->count == REGEXP_ALL_COUNTER) { 4279 /* this should not be reached but ... */ 4280 TODO; 4281 } else if (trans->counter >= 0) { 4282 xmlRegCounterPtr counter = NULL; 4283 int count; 4284 4285 if (err) 4286 count = exec->errCounts[trans->counter]; 4287 else 4288 count = exec->counts[trans->counter]; 4289 if (exec->comp != NULL) 4290 counter = &exec->comp->counters[trans->counter]; 4291 if ((counter == NULL) || (count < counter->max)) { 4292 if (atom->neg) 4293 values[nb++] = (xmlChar *) atom->valuep2; 4294 else 4295 values[nb++] = (xmlChar *) atom->valuep; 4296 (*nbval)++; 4297 } 4298 } else { 4299 if ((exec->comp->states[trans->to] != NULL) && 4300 (exec->comp->states[trans->to]->type != 4301 XML_REGEXP_SINK_STATE)) { 4302 if (atom->neg) 4303 values[nb++] = (xmlChar *) atom->valuep2; 4304 else 4305 values[nb++] = (xmlChar *) atom->valuep; 4306 (*nbval)++; 4307 } 4308 } 4309 } 4310 for (transno = 0; 4311 (transno < state->nbTrans) && (nb < maxval); 4312 transno++) { 4313 trans = &state->trans[transno]; 4314 if (trans->to < 0) 4315 continue; 4316 atom = trans->atom; 4317 if ((atom == NULL) || (atom->valuep == NULL)) 4318 continue; 4319 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4320 continue; 4321 } else if (trans->count == REGEXP_ALL_COUNTER) { 4322 continue; 4323 } else if (trans->counter >= 0) { 4324 continue; 4325 } else { 4326 if ((exec->comp->states[trans->to] != NULL) && 4327 (exec->comp->states[trans->to]->type == 4328 XML_REGEXP_SINK_STATE)) { 4329 if (atom->neg) 4330 values[nb++] = (xmlChar *) atom->valuep2; 4331 else 4332 values[nb++] = (xmlChar *) atom->valuep; 4333 (*nbneg)++; 4334 } 4335 } 4336 } 4337 } 4338 return(0); 4339 } 4340 4341 /** 4342 * xmlRegExecNextValues: 4343 * @exec: a regexp execution context 4344 * @nbval: pointer to the number of accepted values IN/OUT 4345 * @nbneg: return number of negative transitions 4346 * @values: pointer to the array of acceptable values 4347 * @terminal: return value if this was a terminal state 4348 * 4349 * Extract informations from the regexp execution, 4350 * the parameter @values must point to an array of @nbval string pointers 4351 * on return nbval will contain the number of possible strings in that 4352 * state and the @values array will be updated with them. The string values 4353 * returned will be freed with the @exec context and don't need to be 4354 * deallocated. 4355 * 4356 * Returns: 0 in case of success or -1 in case of error. 4357 */ 4358 int 4359 xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg, 4360 xmlChar **values, int *terminal) { 4361 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal)); 4362 } 4363 4364 /** 4365 * xmlRegExecErrInfo: 4366 * @exec: a regexp execution context generating an error 4367 * @string: return value for the error string 4368 * @nbval: pointer to the number of accepted values IN/OUT 4369 * @nbneg: return number of negative transitions 4370 * @values: pointer to the array of acceptable values 4371 * @terminal: return value if this was a terminal state 4372 * 4373 * Extract error informations from the regexp execution, the parameter 4374 * @string will be updated with the value pushed and not accepted, 4375 * the parameter @values must point to an array of @nbval string pointers 4376 * on return nbval will contain the number of possible strings in that 4377 * state and the @values array will be updated with them. The string values 4378 * returned will be freed with the @exec context and don't need to be 4379 * deallocated. 4380 * 4381 * Returns: 0 in case of success or -1 in case of error. 4382 */ 4383 int 4384 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string, 4385 int *nbval, int *nbneg, xmlChar **values, int *terminal) { 4386 if (exec == NULL) 4387 return(-1); 4388 if (string != NULL) { 4389 if (exec->status != 0) 4390 *string = exec->errString; 4391 else 4392 *string = NULL; 4393 } 4394 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal)); 4395 } 4396 4397 #ifdef DEBUG_ERR 4398 static void testerr(xmlRegExecCtxtPtr exec) { 4399 const xmlChar *string; 4400 xmlChar *values[5]; 4401 int nb = 5; 4402 int nbneg; 4403 int terminal; 4404 xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal); 4405 } 4406 #endif 4407 4408 #if 0 4409 static int 4410 xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) { 4411 xmlRegTransPtr trans; 4412 xmlRegAtomPtr atom; 4413 int ret; 4414 int codepoint, len; 4415 4416 if (exec == NULL) 4417 return(-1); 4418 if (exec->status != 0) 4419 return(exec->status); 4420 4421 while ((exec->status == 0) && 4422 ((exec->inputString[exec->index] != 0) || 4423 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 4424 4425 /* 4426 * End of input on non-terminal state, rollback, however we may 4427 * still have epsilon like transition for counted transitions 4428 * on counters, in that case don't break too early. 4429 */ 4430 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 4431 goto rollback; 4432 4433 exec->transcount = 0; 4434 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 4435 trans = &exec->state->trans[exec->transno]; 4436 if (trans->to < 0) 4437 continue; 4438 atom = trans->atom; 4439 ret = 0; 4440 if (trans->count >= 0) { 4441 int count; 4442 xmlRegCounterPtr counter; 4443 4444 /* 4445 * A counted transition. 4446 */ 4447 4448 count = exec->counts[trans->count]; 4449 counter = &exec->comp->counters[trans->count]; 4450 #ifdef DEBUG_REGEXP_EXEC 4451 printf("testing count %d: val %d, min %d, max %d\n", 4452 trans->count, count, counter->min, counter->max); 4453 #endif 4454 ret = ((count >= counter->min) && (count <= counter->max)); 4455 } else if (atom == NULL) { 4456 fprintf(stderr, "epsilon transition left at runtime\n"); 4457 exec->status = -2; 4458 break; 4459 } else if (exec->inputString[exec->index] != 0) { 4460 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 4461 ret = xmlRegCheckCharacter(atom, codepoint); 4462 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 4463 xmlRegStatePtr to = exec->comp->states[trans->to]; 4464 4465 /* 4466 * this is a multiple input sequence 4467 */ 4468 if (exec->state->nbTrans > exec->transno + 1) { 4469 xmlFARegExecSave(exec); 4470 } 4471 exec->transcount = 1; 4472 do { 4473 /* 4474 * Try to progress as much as possible on the input 4475 */ 4476 if (exec->transcount == atom->max) { 4477 break; 4478 } 4479 exec->index += len; 4480 /* 4481 * End of input: stop here 4482 */ 4483 if (exec->inputString[exec->index] == 0) { 4484 exec->index -= len; 4485 break; 4486 } 4487 if (exec->transcount >= atom->min) { 4488 int transno = exec->transno; 4489 xmlRegStatePtr state = exec->state; 4490 4491 /* 4492 * The transition is acceptable save it 4493 */ 4494 exec->transno = -1; /* trick */ 4495 exec->state = to; 4496 xmlFARegExecSave(exec); 4497 exec->transno = transno; 4498 exec->state = state; 4499 } 4500 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 4501 len); 4502 ret = xmlRegCheckCharacter(atom, codepoint); 4503 exec->transcount++; 4504 } while (ret == 1); 4505 if (exec->transcount < atom->min) 4506 ret = 0; 4507 4508 /* 4509 * If the last check failed but one transition was found 4510 * possible, rollback 4511 */ 4512 if (ret < 0) 4513 ret = 0; 4514 if (ret == 0) { 4515 goto rollback; 4516 } 4517 } 4518 } 4519 if (ret == 1) { 4520 if (exec->state->nbTrans > exec->transno + 1) { 4521 xmlFARegExecSave(exec); 4522 } 4523 /* 4524 * restart count for expressions like this ((abc){2})* 4525 */ 4526 if (trans->count >= 0) { 4527 #ifdef DEBUG_REGEXP_EXEC 4528 printf("Reset count %d\n", trans->count); 4529 #endif 4530 exec->counts[trans->count] = 0; 4531 } 4532 if (trans->counter >= 0) { 4533 #ifdef DEBUG_REGEXP_EXEC 4534 printf("Increasing count %d\n", trans->counter); 4535 #endif 4536 exec->counts[trans->counter]++; 4537 } 4538 #ifdef DEBUG_REGEXP_EXEC 4539 printf("entering state %d\n", trans->to); 4540 #endif 4541 exec->state = exec->comp->states[trans->to]; 4542 exec->transno = 0; 4543 if (trans->atom != NULL) { 4544 exec->index += len; 4545 } 4546 goto progress; 4547 } else if (ret < 0) { 4548 exec->status = -4; 4549 break; 4550 } 4551 } 4552 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4553 rollback: 4554 /* 4555 * Failed to find a way out 4556 */ 4557 exec->determinist = 0; 4558 xmlFARegExecRollBack(exec); 4559 } 4560 progress: 4561 continue; 4562 } 4563 } 4564 #endif 4565 /************************************************************************ 4566 * * 4567 * Parser for the Schemas Datatype Regular Expressions * 4568 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs * 4569 * * 4570 ************************************************************************/ 4571 4572 /** 4573 * xmlFAIsChar: 4574 * @ctxt: a regexp parser context 4575 * 4576 * [10] Char ::= [^.\?*+()|#x5B#x5D] 4577 */ 4578 static int 4579 xmlFAIsChar(xmlRegParserCtxtPtr ctxt) { 4580 int cur; 4581 int len; 4582 4583 cur = CUR_SCHAR(ctxt->cur, len); 4584 if ((cur == '.') || (cur == '\\') || (cur == '?') || 4585 (cur == '*') || (cur == '+') || (cur == '(') || 4586 (cur == ')') || (cur == '|') || (cur == 0x5B) || 4587 (cur == 0x5D) || (cur == 0)) 4588 return(-1); 4589 return(cur); 4590 } 4591 4592 /** 4593 * xmlFAParseCharProp: 4594 * @ctxt: a regexp parser context 4595 * 4596 * [27] charProp ::= IsCategory | IsBlock 4597 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation | 4598 * Separators | Symbols | Others 4599 * [29] Letters ::= 'L' [ultmo]? 4600 * [30] Marks ::= 'M' [nce]? 4601 * [31] Numbers ::= 'N' [dlo]? 4602 * [32] Punctuation ::= 'P' [cdseifo]? 4603 * [33] Separators ::= 'Z' [slp]? 4604 * [34] Symbols ::= 'S' [mcko]? 4605 * [35] Others ::= 'C' [cfon]? 4606 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+ 4607 */ 4608 static void 4609 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { 4610 int cur; 4611 xmlRegAtomType type = (xmlRegAtomType) 0; 4612 xmlChar *blockName = NULL; 4613 4614 cur = CUR; 4615 if (cur == 'L') { 4616 NEXT; 4617 cur = CUR; 4618 if (cur == 'u') { 4619 NEXT; 4620 type = XML_REGEXP_LETTER_UPPERCASE; 4621 } else if (cur == 'l') { 4622 NEXT; 4623 type = XML_REGEXP_LETTER_LOWERCASE; 4624 } else if (cur == 't') { 4625 NEXT; 4626 type = XML_REGEXP_LETTER_TITLECASE; 4627 } else if (cur == 'm') { 4628 NEXT; 4629 type = XML_REGEXP_LETTER_MODIFIER; 4630 } else if (cur == 'o') { 4631 NEXT; 4632 type = XML_REGEXP_LETTER_OTHERS; 4633 } else { 4634 type = XML_REGEXP_LETTER; 4635 } 4636 } else if (cur == 'M') { 4637 NEXT; 4638 cur = CUR; 4639 if (cur == 'n') { 4640 NEXT; 4641 /* nonspacing */ 4642 type = XML_REGEXP_MARK_NONSPACING; 4643 } else if (cur == 'c') { 4644 NEXT; 4645 /* spacing combining */ 4646 type = XML_REGEXP_MARK_SPACECOMBINING; 4647 } else if (cur == 'e') { 4648 NEXT; 4649 /* enclosing */ 4650 type = XML_REGEXP_MARK_ENCLOSING; 4651 } else { 4652 /* all marks */ 4653 type = XML_REGEXP_MARK; 4654 } 4655 } else if (cur == 'N') { 4656 NEXT; 4657 cur = CUR; 4658 if (cur == 'd') { 4659 NEXT; 4660 /* digital */ 4661 type = XML_REGEXP_NUMBER_DECIMAL; 4662 } else if (cur == 'l') { 4663 NEXT; 4664 /* letter */ 4665 type = XML_REGEXP_NUMBER_LETTER; 4666 } else if (cur == 'o') { 4667 NEXT; 4668 /* other */ 4669 type = XML_REGEXP_NUMBER_OTHERS; 4670 } else { 4671 /* all numbers */ 4672 type = XML_REGEXP_NUMBER; 4673 } 4674 } else if (cur == 'P') { 4675 NEXT; 4676 cur = CUR; 4677 if (cur == 'c') { 4678 NEXT; 4679 /* connector */ 4680 type = XML_REGEXP_PUNCT_CONNECTOR; 4681 } else if (cur == 'd') { 4682 NEXT; 4683 /* dash */ 4684 type = XML_REGEXP_PUNCT_DASH; 4685 } else if (cur == 's') { 4686 NEXT; 4687 /* open */ 4688 type = XML_REGEXP_PUNCT_OPEN; 4689 } else if (cur == 'e') { 4690 NEXT; 4691 /* close */ 4692 type = XML_REGEXP_PUNCT_CLOSE; 4693 } else if (cur == 'i') { 4694 NEXT; 4695 /* initial quote */ 4696 type = XML_REGEXP_PUNCT_INITQUOTE; 4697 } else if (cur == 'f') { 4698 NEXT; 4699 /* final quote */ 4700 type = XML_REGEXP_PUNCT_FINQUOTE; 4701 } else if (cur == 'o') { 4702 NEXT; 4703 /* other */ 4704 type = XML_REGEXP_PUNCT_OTHERS; 4705 } else { 4706 /* all punctuation */ 4707 type = XML_REGEXP_PUNCT; 4708 } 4709 } else if (cur == 'Z') { 4710 NEXT; 4711 cur = CUR; 4712 if (cur == 's') { 4713 NEXT; 4714 /* space */ 4715 type = XML_REGEXP_SEPAR_SPACE; 4716 } else if (cur == 'l') { 4717 NEXT; 4718 /* line */ 4719 type = XML_REGEXP_SEPAR_LINE; 4720 } else if (cur == 'p') { 4721 NEXT; 4722 /* paragraph */ 4723 type = XML_REGEXP_SEPAR_PARA; 4724 } else { 4725 /* all separators */ 4726 type = XML_REGEXP_SEPAR; 4727 } 4728 } else if (cur == 'S') { 4729 NEXT; 4730 cur = CUR; 4731 if (cur == 'm') { 4732 NEXT; 4733 type = XML_REGEXP_SYMBOL_MATH; 4734 /* math */ 4735 } else if (cur == 'c') { 4736 NEXT; 4737 type = XML_REGEXP_SYMBOL_CURRENCY; 4738 /* currency */ 4739 } else if (cur == 'k') { 4740 NEXT; 4741 type = XML_REGEXP_SYMBOL_MODIFIER; 4742 /* modifiers */ 4743 } else if (cur == 'o') { 4744 NEXT; 4745 type = XML_REGEXP_SYMBOL_OTHERS; 4746 /* other */ 4747 } else { 4748 /* all symbols */ 4749 type = XML_REGEXP_SYMBOL; 4750 } 4751 } else if (cur == 'C') { 4752 NEXT; 4753 cur = CUR; 4754 if (cur == 'c') { 4755 NEXT; 4756 /* control */ 4757 type = XML_REGEXP_OTHER_CONTROL; 4758 } else if (cur == 'f') { 4759 NEXT; 4760 /* format */ 4761 type = XML_REGEXP_OTHER_FORMAT; 4762 } else if (cur == 'o') { 4763 NEXT; 4764 /* private use */ 4765 type = XML_REGEXP_OTHER_PRIVATE; 4766 } else if (cur == 'n') { 4767 NEXT; 4768 /* not assigned */ 4769 type = XML_REGEXP_OTHER_NA; 4770 } else { 4771 /* all others */ 4772 type = XML_REGEXP_OTHER; 4773 } 4774 } else if (cur == 'I') { 4775 const xmlChar *start; 4776 NEXT; 4777 cur = CUR; 4778 if (cur != 's') { 4779 ERROR("IsXXXX expected"); 4780 return; 4781 } 4782 NEXT; 4783 start = ctxt->cur; 4784 cur = CUR; 4785 if (((cur >= 'a') && (cur <= 'z')) || 4786 ((cur >= 'A') && (cur <= 'Z')) || 4787 ((cur >= '0') && (cur <= '9')) || 4788 (cur == 0x2D)) { 4789 NEXT; 4790 cur = CUR; 4791 while (((cur >= 'a') && (cur <= 'z')) || 4792 ((cur >= 'A') && (cur <= 'Z')) || 4793 ((cur >= '0') && (cur <= '9')) || 4794 (cur == 0x2D)) { 4795 NEXT; 4796 cur = CUR; 4797 } 4798 } 4799 type = XML_REGEXP_BLOCK_NAME; 4800 blockName = xmlStrndup(start, ctxt->cur - start); 4801 } else { 4802 ERROR("Unknown char property"); 4803 return; 4804 } 4805 if (ctxt->atom == NULL) { 4806 ctxt->atom = xmlRegNewAtom(ctxt, type); 4807 if (ctxt->atom != NULL) 4808 ctxt->atom->valuep = blockName; 4809 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4810 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4811 type, 0, 0, blockName); 4812 } 4813 } 4814 4815 /** 4816 * xmlFAParseCharClassEsc: 4817 * @ctxt: a regexp parser context 4818 * 4819 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc ) 4820 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] 4821 * [25] catEsc ::= '\p{' charProp '}' 4822 * [26] complEsc ::= '\P{' charProp '}' 4823 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW]) 4824 */ 4825 static void 4826 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { 4827 int cur; 4828 4829 if (CUR == '.') { 4830 if (ctxt->atom == NULL) { 4831 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR); 4832 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4833 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4834 XML_REGEXP_ANYCHAR, 0, 0, NULL); 4835 } 4836 NEXT; 4837 return; 4838 } 4839 if (CUR != '\\') { 4840 ERROR("Escaped sequence: expecting \\"); 4841 return; 4842 } 4843 NEXT; 4844 cur = CUR; 4845 if (cur == 'p') { 4846 NEXT; 4847 if (CUR != '{') { 4848 ERROR("Expecting '{'"); 4849 return; 4850 } 4851 NEXT; 4852 xmlFAParseCharProp(ctxt); 4853 if (CUR != '}') { 4854 ERROR("Expecting '}'"); 4855 return; 4856 } 4857 NEXT; 4858 } else if (cur == 'P') { 4859 NEXT; 4860 if (CUR != '{') { 4861 ERROR("Expecting '{'"); 4862 return; 4863 } 4864 NEXT; 4865 xmlFAParseCharProp(ctxt); 4866 ctxt->atom->neg = 1; 4867 if (CUR != '}') { 4868 ERROR("Expecting '}'"); 4869 return; 4870 } 4871 NEXT; 4872 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') || 4873 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') || 4874 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') || 4875 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) || 4876 (cur == 0x5E)) { 4877 if (ctxt->atom == NULL) { 4878 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 4879 if (ctxt->atom != NULL) { 4880 switch (cur) { 4881 case 'n': 4882 ctxt->atom->codepoint = '\n'; 4883 break; 4884 case 'r': 4885 ctxt->atom->codepoint = '\r'; 4886 break; 4887 case 't': 4888 ctxt->atom->codepoint = '\t'; 4889 break; 4890 default: 4891 ctxt->atom->codepoint = cur; 4892 } 4893 } 4894 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4895 switch (cur) { 4896 case 'n': 4897 cur = '\n'; 4898 break; 4899 case 'r': 4900 cur = '\r'; 4901 break; 4902 case 't': 4903 cur = '\t'; 4904 break; 4905 } 4906 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4907 XML_REGEXP_CHARVAL, cur, cur, NULL); 4908 } 4909 NEXT; 4910 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') || 4911 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') || 4912 (cur == 'w') || (cur == 'W')) { 4913 xmlRegAtomType type = XML_REGEXP_ANYSPACE; 4914 4915 switch (cur) { 4916 case 's': 4917 type = XML_REGEXP_ANYSPACE; 4918 break; 4919 case 'S': 4920 type = XML_REGEXP_NOTSPACE; 4921 break; 4922 case 'i': 4923 type = XML_REGEXP_INITNAME; 4924 break; 4925 case 'I': 4926 type = XML_REGEXP_NOTINITNAME; 4927 break; 4928 case 'c': 4929 type = XML_REGEXP_NAMECHAR; 4930 break; 4931 case 'C': 4932 type = XML_REGEXP_NOTNAMECHAR; 4933 break; 4934 case 'd': 4935 type = XML_REGEXP_DECIMAL; 4936 break; 4937 case 'D': 4938 type = XML_REGEXP_NOTDECIMAL; 4939 break; 4940 case 'w': 4941 type = XML_REGEXP_REALCHAR; 4942 break; 4943 case 'W': 4944 type = XML_REGEXP_NOTREALCHAR; 4945 break; 4946 } 4947 NEXT; 4948 if (ctxt->atom == NULL) { 4949 ctxt->atom = xmlRegNewAtom(ctxt, type); 4950 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4951 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4952 type, 0, 0, NULL); 4953 } 4954 } else { 4955 ERROR("Wrong escape sequence, misuse of character '\\'"); 4956 } 4957 } 4958 4959 /** 4960 * xmlFAParseCharRange: 4961 * @ctxt: a regexp parser context 4962 * 4963 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash 4964 * [18] seRange ::= charOrEsc '-' charOrEsc 4965 * [20] charOrEsc ::= XmlChar | SingleCharEsc 4966 * [21] XmlChar ::= [^\#x2D#x5B#x5D] 4967 * [22] XmlCharIncDash ::= [^\#x5B#x5D] 4968 */ 4969 static void 4970 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { 4971 int cur, len; 4972 int start = -1; 4973 int end = -1; 4974 4975 if (CUR == '\0') { 4976 ERROR("Expecting ']'"); 4977 return; 4978 } 4979 4980 cur = CUR; 4981 if (cur == '\\') { 4982 NEXT; 4983 cur = CUR; 4984 switch (cur) { 4985 case 'n': start = 0xA; break; 4986 case 'r': start = 0xD; break; 4987 case 't': start = 0x9; break; 4988 case '\\': case '|': case '.': case '-': case '^': case '?': 4989 case '*': case '+': case '{': case '}': case '(': case ')': 4990 case '[': case ']': 4991 start = cur; break; 4992 default: 4993 ERROR("Invalid escape value"); 4994 return; 4995 } 4996 end = start; 4997 len = 1; 4998 } else if ((cur != 0x5B) && (cur != 0x5D)) { 4999 end = start = CUR_SCHAR(ctxt->cur, len); 5000 } else { 5001 ERROR("Expecting a char range"); 5002 return; 5003 } 5004 /* 5005 * Since we are "inside" a range, we can assume ctxt->cur is past 5006 * the start of ctxt->string, and PREV should be safe 5007 */ 5008 if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) { 5009 NEXTL(len); 5010 return; 5011 } 5012 NEXTL(len); 5013 cur = CUR; 5014 if ((cur != '-') || (NXT(1) == ']')) { 5015 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 5016 XML_REGEXP_CHARVAL, start, end, NULL); 5017 return; 5018 } 5019 NEXT; 5020 cur = CUR; 5021 if (cur == '\\') { 5022 NEXT; 5023 cur = CUR; 5024 switch (cur) { 5025 case 'n': end = 0xA; break; 5026 case 'r': end = 0xD; break; 5027 case 't': end = 0x9; break; 5028 case '\\': case '|': case '.': case '-': case '^': case '?': 5029 case '*': case '+': case '{': case '}': case '(': case ')': 5030 case '[': case ']': 5031 end = cur; break; 5032 default: 5033 ERROR("Invalid escape value"); 5034 return; 5035 } 5036 len = 1; 5037 } else if ((cur != 0x5B) && (cur != 0x5D)) { 5038 end = CUR_SCHAR(ctxt->cur, len); 5039 } else { 5040 ERROR("Expecting the end of a char range"); 5041 return; 5042 } 5043 NEXTL(len); 5044 /* TODO check that the values are acceptable character ranges for XML */ 5045 if (end < start) { 5046 ERROR("End of range is before start of range"); 5047 } else { 5048 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 5049 XML_REGEXP_CHARVAL, start, end, NULL); 5050 } 5051 return; 5052 } 5053 5054 /** 5055 * xmlFAParsePosCharGroup: 5056 * @ctxt: a regexp parser context 5057 * 5058 * [14] posCharGroup ::= ( charRange | charClassEsc )+ 5059 */ 5060 static void 5061 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) { 5062 do { 5063 if (CUR == '\\') { 5064 xmlFAParseCharClassEsc(ctxt); 5065 } else { 5066 xmlFAParseCharRange(ctxt); 5067 } 5068 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') && 5069 (CUR != 0) && (ctxt->error == 0)); 5070 } 5071 5072 /** 5073 * xmlFAParseCharGroup: 5074 * @ctxt: a regexp parser context 5075 * 5076 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub 5077 * [15] negCharGroup ::= '^' posCharGroup 5078 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr 5079 * [12] charClassExpr ::= '[' charGroup ']' 5080 */ 5081 static void 5082 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) { 5083 int n = ctxt->neg; 5084 while ((CUR != ']') && (ctxt->error == 0)) { 5085 if (CUR == '^') { 5086 int neg = ctxt->neg; 5087 5088 NEXT; 5089 ctxt->neg = !ctxt->neg; 5090 xmlFAParsePosCharGroup(ctxt); 5091 ctxt->neg = neg; 5092 } else if ((CUR == '-') && (NXT(1) == '[')) { 5093 int neg = ctxt->neg; 5094 ctxt->neg = 2; 5095 NEXT; /* eat the '-' */ 5096 NEXT; /* eat the '[' */ 5097 xmlFAParseCharGroup(ctxt); 5098 if (CUR == ']') { 5099 NEXT; 5100 } else { 5101 ERROR("charClassExpr: ']' expected"); 5102 break; 5103 } 5104 ctxt->neg = neg; 5105 break; 5106 } else if (CUR != ']') { 5107 xmlFAParsePosCharGroup(ctxt); 5108 } 5109 } 5110 ctxt->neg = n; 5111 } 5112 5113 /** 5114 * xmlFAParseCharClass: 5115 * @ctxt: a regexp parser context 5116 * 5117 * [11] charClass ::= charClassEsc | charClassExpr 5118 * [12] charClassExpr ::= '[' charGroup ']' 5119 */ 5120 static void 5121 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) { 5122 if (CUR == '[') { 5123 NEXT; 5124 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES); 5125 if (ctxt->atom == NULL) 5126 return; 5127 xmlFAParseCharGroup(ctxt); 5128 if (CUR == ']') { 5129 NEXT; 5130 } else { 5131 ERROR("xmlFAParseCharClass: ']' expected"); 5132 } 5133 } else { 5134 xmlFAParseCharClassEsc(ctxt); 5135 } 5136 } 5137 5138 /** 5139 * xmlFAParseQuantExact: 5140 * @ctxt: a regexp parser context 5141 * 5142 * [8] QuantExact ::= [0-9]+ 5143 * 5144 * Returns 0 if success or -1 in case of error 5145 */ 5146 static int 5147 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) { 5148 int ret = 0; 5149 int ok = 0; 5150 5151 while ((CUR >= '0') && (CUR <= '9')) { 5152 ret = ret * 10 + (CUR - '0'); 5153 ok = 1; 5154 NEXT; 5155 } 5156 if (ok != 1) { 5157 return(-1); 5158 } 5159 return(ret); 5160 } 5161 5162 /** 5163 * xmlFAParseQuantifier: 5164 * @ctxt: a regexp parser context 5165 * 5166 * [4] quantifier ::= [?*+] | ( '{' quantity '}' ) 5167 * [5] quantity ::= quantRange | quantMin | QuantExact 5168 * [6] quantRange ::= QuantExact ',' QuantExact 5169 * [7] quantMin ::= QuantExact ',' 5170 * [8] QuantExact ::= [0-9]+ 5171 */ 5172 static int 5173 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) { 5174 int cur; 5175 5176 cur = CUR; 5177 if ((cur == '?') || (cur == '*') || (cur == '+')) { 5178 if (ctxt->atom != NULL) { 5179 if (cur == '?') 5180 ctxt->atom->quant = XML_REGEXP_QUANT_OPT; 5181 else if (cur == '*') 5182 ctxt->atom->quant = XML_REGEXP_QUANT_MULT; 5183 else if (cur == '+') 5184 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS; 5185 } 5186 NEXT; 5187 return(1); 5188 } 5189 if (cur == '{') { 5190 int min = 0, max = 0; 5191 5192 NEXT; 5193 cur = xmlFAParseQuantExact(ctxt); 5194 if (cur >= 0) 5195 min = cur; 5196 if (CUR == ',') { 5197 NEXT; 5198 if (CUR == '}') 5199 max = INT_MAX; 5200 else { 5201 cur = xmlFAParseQuantExact(ctxt); 5202 if (cur >= 0) 5203 max = cur; 5204 else { 5205 ERROR("Improper quantifier"); 5206 } 5207 } 5208 } 5209 if (CUR == '}') { 5210 NEXT; 5211 } else { 5212 ERROR("Unterminated quantifier"); 5213 } 5214 if (max == 0) 5215 max = min; 5216 if (ctxt->atom != NULL) { 5217 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE; 5218 ctxt->atom->min = min; 5219 ctxt->atom->max = max; 5220 } 5221 return(1); 5222 } 5223 return(0); 5224 } 5225 5226 /** 5227 * xmlFAParseAtom: 5228 * @ctxt: a regexp parser context 5229 * 5230 * [9] atom ::= Char | charClass | ( '(' regExp ')' ) 5231 */ 5232 static int 5233 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) { 5234 int codepoint, len; 5235 5236 codepoint = xmlFAIsChar(ctxt); 5237 if (codepoint > 0) { 5238 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 5239 if (ctxt->atom == NULL) 5240 return(-1); 5241 codepoint = CUR_SCHAR(ctxt->cur, len); 5242 ctxt->atom->codepoint = codepoint; 5243 NEXTL(len); 5244 return(1); 5245 } else if (CUR == '|') { 5246 return(0); 5247 } else if (CUR == 0) { 5248 return(0); 5249 } else if (CUR == ')') { 5250 return(0); 5251 } else if (CUR == '(') { 5252 xmlRegStatePtr start, oldend, start0; 5253 5254 NEXT; 5255 /* 5256 * this extra Epsilon transition is needed if we count with 0 allowed 5257 * unfortunately this can't be known at that point 5258 */ 5259 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5260 start0 = ctxt->state; 5261 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5262 start = ctxt->state; 5263 oldend = ctxt->end; 5264 ctxt->end = NULL; 5265 ctxt->atom = NULL; 5266 xmlFAParseRegExp(ctxt, 0); 5267 if (CUR == ')') { 5268 NEXT; 5269 } else { 5270 ERROR("xmlFAParseAtom: expecting ')'"); 5271 } 5272 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG); 5273 if (ctxt->atom == NULL) 5274 return(-1); 5275 ctxt->atom->start = start; 5276 ctxt->atom->start0 = start0; 5277 ctxt->atom->stop = ctxt->state; 5278 ctxt->end = oldend; 5279 return(1); 5280 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) { 5281 xmlFAParseCharClass(ctxt); 5282 return(1); 5283 } 5284 return(0); 5285 } 5286 5287 /** 5288 * xmlFAParsePiece: 5289 * @ctxt: a regexp parser context 5290 * 5291 * [3] piece ::= atom quantifier? 5292 */ 5293 static int 5294 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) { 5295 int ret; 5296 5297 ctxt->atom = NULL; 5298 ret = xmlFAParseAtom(ctxt); 5299 if (ret == 0) 5300 return(0); 5301 if (ctxt->atom == NULL) { 5302 ERROR("internal: no atom generated"); 5303 } 5304 xmlFAParseQuantifier(ctxt); 5305 return(1); 5306 } 5307 5308 /** 5309 * xmlFAParseBranch: 5310 * @ctxt: a regexp parser context 5311 * @to: optional target to the end of the branch 5312 * 5313 * @to is used to optimize by removing duplicate path in automata 5314 * in expressions like (a|b)(c|d) 5315 * 5316 * [2] branch ::= piece* 5317 */ 5318 static int 5319 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) { 5320 xmlRegStatePtr previous; 5321 int ret; 5322 5323 previous = ctxt->state; 5324 ret = xmlFAParsePiece(ctxt); 5325 if (ret != 0) { 5326 if (xmlFAGenerateTransitions(ctxt, previous, 5327 (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) 5328 return(-1); 5329 previous = ctxt->state; 5330 ctxt->atom = NULL; 5331 } 5332 while ((ret != 0) && (ctxt->error == 0)) { 5333 ret = xmlFAParsePiece(ctxt); 5334 if (ret != 0) { 5335 if (xmlFAGenerateTransitions(ctxt, previous, 5336 (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) 5337 return(-1); 5338 previous = ctxt->state; 5339 ctxt->atom = NULL; 5340 } 5341 } 5342 return(0); 5343 } 5344 5345 /** 5346 * xmlFAParseRegExp: 5347 * @ctxt: a regexp parser context 5348 * @top: is this the top-level expression ? 5349 * 5350 * [1] regExp ::= branch ( '|' branch )* 5351 */ 5352 static void 5353 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { 5354 xmlRegStatePtr start, end; 5355 5356 /* if not top start should have been generated by an epsilon trans */ 5357 start = ctxt->state; 5358 ctxt->end = NULL; 5359 xmlFAParseBranch(ctxt, NULL); 5360 if (top) { 5361 #ifdef DEBUG_REGEXP_GRAPH 5362 printf("State %d is final\n", ctxt->state->no); 5363 #endif 5364 ctxt->state->type = XML_REGEXP_FINAL_STATE; 5365 } 5366 if (CUR != '|') { 5367 ctxt->end = ctxt->state; 5368 return; 5369 } 5370 end = ctxt->state; 5371 while ((CUR == '|') && (ctxt->error == 0)) { 5372 NEXT; 5373 ctxt->state = start; 5374 ctxt->end = NULL; 5375 xmlFAParseBranch(ctxt, end); 5376 } 5377 if (!top) { 5378 ctxt->state = end; 5379 ctxt->end = end; 5380 } 5381 } 5382 5383 /************************************************************************ 5384 * * 5385 * The basic API * 5386 * * 5387 ************************************************************************/ 5388 5389 /** 5390 * xmlRegexpPrint: 5391 * @output: the file for the output debug 5392 * @regexp: the compiled regexp 5393 * 5394 * Print the content of the compiled regular expression 5395 */ 5396 void 5397 xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) { 5398 int i; 5399 5400 if (output == NULL) 5401 return; 5402 fprintf(output, " regexp: "); 5403 if (regexp == NULL) { 5404 fprintf(output, "NULL\n"); 5405 return; 5406 } 5407 fprintf(output, "'%s' ", regexp->string); 5408 fprintf(output, "\n"); 5409 fprintf(output, "%d atoms:\n", regexp->nbAtoms); 5410 for (i = 0;i < regexp->nbAtoms; i++) { 5411 fprintf(output, " %02d ", i); 5412 xmlRegPrintAtom(output, regexp->atoms[i]); 5413 } 5414 fprintf(output, "%d states:", regexp->nbStates); 5415 fprintf(output, "\n"); 5416 for (i = 0;i < regexp->nbStates; i++) { 5417 xmlRegPrintState(output, regexp->states[i]); 5418 } 5419 fprintf(output, "%d counters:\n", regexp->nbCounters); 5420 for (i = 0;i < regexp->nbCounters; i++) { 5421 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min, 5422 regexp->counters[i].max); 5423 } 5424 } 5425 5426 /** 5427 * xmlRegexpCompile: 5428 * @regexp: a regular expression string 5429 * 5430 * Parses a regular expression conforming to XML Schemas Part 2 Datatype 5431 * Appendix F and builds an automata suitable for testing strings against 5432 * that regular expression 5433 * 5434 * Returns the compiled expression or NULL in case of error 5435 */ 5436 xmlRegexpPtr 5437 xmlRegexpCompile(const xmlChar *regexp) { 5438 xmlRegexpPtr ret; 5439 xmlRegParserCtxtPtr ctxt; 5440 5441 ctxt = xmlRegNewParserCtxt(regexp); 5442 if (ctxt == NULL) 5443 return(NULL); 5444 5445 /* initialize the parser */ 5446 ctxt->end = NULL; 5447 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5448 xmlRegStatePush(ctxt, ctxt->start); 5449 5450 /* parse the expression building an automata */ 5451 xmlFAParseRegExp(ctxt, 1); 5452 if (CUR != 0) { 5453 ERROR("xmlFAParseRegExp: extra characters"); 5454 } 5455 if (ctxt->error != 0) { 5456 xmlRegFreeParserCtxt(ctxt); 5457 return(NULL); 5458 } 5459 ctxt->end = ctxt->state; 5460 ctxt->start->type = XML_REGEXP_START_STATE; 5461 ctxt->end->type = XML_REGEXP_FINAL_STATE; 5462 5463 /* remove the Epsilon except for counted transitions */ 5464 xmlFAEliminateEpsilonTransitions(ctxt); 5465 5466 5467 if (ctxt->error != 0) { 5468 xmlRegFreeParserCtxt(ctxt); 5469 return(NULL); 5470 } 5471 ret = xmlRegEpxFromParse(ctxt); 5472 xmlRegFreeParserCtxt(ctxt); 5473 return(ret); 5474 } 5475 5476 /** 5477 * xmlRegexpExec: 5478 * @comp: the compiled regular expression 5479 * @content: the value to check against the regular expression 5480 * 5481 * Check if the regular expression generates the value 5482 * 5483 * Returns 1 if it matches, 0 if not and a negative value in case of error 5484 */ 5485 int 5486 xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) { 5487 if ((comp == NULL) || (content == NULL)) 5488 return(-1); 5489 return(xmlFARegExec(comp, content)); 5490 } 5491 5492 /** 5493 * xmlRegexpIsDeterminist: 5494 * @comp: the compiled regular expression 5495 * 5496 * Check if the regular expression is determinist 5497 * 5498 * Returns 1 if it yes, 0 if not and a negative value in case of error 5499 */ 5500 int 5501 xmlRegexpIsDeterminist(xmlRegexpPtr comp) { 5502 xmlAutomataPtr am; 5503 int ret; 5504 5505 if (comp == NULL) 5506 return(-1); 5507 if (comp->determinist != -1) 5508 return(comp->determinist); 5509 5510 am = xmlNewAutomata(); 5511 if (am->states != NULL) { 5512 int i; 5513 5514 for (i = 0;i < am->nbStates;i++) 5515 xmlRegFreeState(am->states[i]); 5516 xmlFree(am->states); 5517 } 5518 am->nbAtoms = comp->nbAtoms; 5519 am->atoms = comp->atoms; 5520 am->nbStates = comp->nbStates; 5521 am->states = comp->states; 5522 am->determinist = -1; 5523 am->flags = comp->flags; 5524 ret = xmlFAComputesDeterminism(am); 5525 am->atoms = NULL; 5526 am->states = NULL; 5527 xmlFreeAutomata(am); 5528 comp->determinist = ret; 5529 return(ret); 5530 } 5531 5532 /** 5533 * xmlRegFreeRegexp: 5534 * @regexp: the regexp 5535 * 5536 * Free a regexp 5537 */ 5538 void 5539 xmlRegFreeRegexp(xmlRegexpPtr regexp) { 5540 int i; 5541 if (regexp == NULL) 5542 return; 5543 5544 if (regexp->string != NULL) 5545 xmlFree(regexp->string); 5546 if (regexp->states != NULL) { 5547 for (i = 0;i < regexp->nbStates;i++) 5548 xmlRegFreeState(regexp->states[i]); 5549 xmlFree(regexp->states); 5550 } 5551 if (regexp->atoms != NULL) { 5552 for (i = 0;i < regexp->nbAtoms;i++) 5553 xmlRegFreeAtom(regexp->atoms[i]); 5554 xmlFree(regexp->atoms); 5555 } 5556 if (regexp->counters != NULL) 5557 xmlFree(regexp->counters); 5558 if (regexp->compact != NULL) 5559 xmlFree(regexp->compact); 5560 if (regexp->transdata != NULL) 5561 xmlFree(regexp->transdata); 5562 if (regexp->stringMap != NULL) { 5563 for (i = 0; i < regexp->nbstrings;i++) 5564 xmlFree(regexp->stringMap[i]); 5565 xmlFree(regexp->stringMap); 5566 } 5567 5568 xmlFree(regexp); 5569 } 5570 5571 #ifdef LIBXML_AUTOMATA_ENABLED 5572 /************************************************************************ 5573 * * 5574 * The Automata interface * 5575 * * 5576 ************************************************************************/ 5577 5578 /** 5579 * xmlNewAutomata: 5580 * 5581 * Create a new automata 5582 * 5583 * Returns the new object or NULL in case of failure 5584 */ 5585 xmlAutomataPtr 5586 xmlNewAutomata(void) { 5587 xmlAutomataPtr ctxt; 5588 5589 ctxt = xmlRegNewParserCtxt(NULL); 5590 if (ctxt == NULL) 5591 return(NULL); 5592 5593 /* initialize the parser */ 5594 ctxt->end = NULL; 5595 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5596 if (ctxt->start == NULL) { 5597 xmlFreeAutomata(ctxt); 5598 return(NULL); 5599 } 5600 ctxt->start->type = XML_REGEXP_START_STATE; 5601 if (xmlRegStatePush(ctxt, ctxt->start) < 0) { 5602 xmlRegFreeState(ctxt->start); 5603 xmlFreeAutomata(ctxt); 5604 return(NULL); 5605 } 5606 ctxt->flags = 0; 5607 5608 return(ctxt); 5609 } 5610 5611 /** 5612 * xmlFreeAutomata: 5613 * @am: an automata 5614 * 5615 * Free an automata 5616 */ 5617 void 5618 xmlFreeAutomata(xmlAutomataPtr am) { 5619 if (am == NULL) 5620 return; 5621 xmlRegFreeParserCtxt(am); 5622 } 5623 5624 /** 5625 * xmlAutomataSetFlags: 5626 * @am: an automata 5627 * @flags: a set of internal flags 5628 * 5629 * Set some flags on the automata 5630 */ 5631 void 5632 xmlAutomataSetFlags(xmlAutomataPtr am, int flags) { 5633 if (am == NULL) 5634 return; 5635 am->flags |= flags; 5636 } 5637 5638 /** 5639 * xmlAutomataGetInitState: 5640 * @am: an automata 5641 * 5642 * Initial state lookup 5643 * 5644 * Returns the initial state of the automata 5645 */ 5646 xmlAutomataStatePtr 5647 xmlAutomataGetInitState(xmlAutomataPtr am) { 5648 if (am == NULL) 5649 return(NULL); 5650 return(am->start); 5651 } 5652 5653 /** 5654 * xmlAutomataSetFinalState: 5655 * @am: an automata 5656 * @state: a state in this automata 5657 * 5658 * Makes that state a final state 5659 * 5660 * Returns 0 or -1 in case of error 5661 */ 5662 int 5663 xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) { 5664 if ((am == NULL) || (state == NULL)) 5665 return(-1); 5666 state->type = XML_REGEXP_FINAL_STATE; 5667 return(0); 5668 } 5669 5670 /** 5671 * xmlAutomataNewTransition: 5672 * @am: an automata 5673 * @from: the starting point of the transition 5674 * @to: the target point of the transition or NULL 5675 * @token: the input string associated to that transition 5676 * @data: data passed to the callback function if the transition is activated 5677 * 5678 * If @to is NULL, this creates first a new target state in the automata 5679 * and then adds a transition from the @from state to the target state 5680 * activated by the value of @token 5681 * 5682 * Returns the target state or NULL in case of error 5683 */ 5684 xmlAutomataStatePtr 5685 xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from, 5686 xmlAutomataStatePtr to, const xmlChar *token, 5687 void *data) { 5688 xmlRegAtomPtr atom; 5689 5690 if ((am == NULL) || (from == NULL) || (token == NULL)) 5691 return(NULL); 5692 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5693 if (atom == NULL) 5694 return(NULL); 5695 atom->data = data; 5696 if (atom == NULL) 5697 return(NULL); 5698 atom->valuep = xmlStrdup(token); 5699 5700 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5701 xmlRegFreeAtom(atom); 5702 return(NULL); 5703 } 5704 if (to == NULL) 5705 return(am->state); 5706 return(to); 5707 } 5708 5709 /** 5710 * xmlAutomataNewTransition2: 5711 * @am: an automata 5712 * @from: the starting point of the transition 5713 * @to: the target point of the transition or NULL 5714 * @token: the first input string associated to that transition 5715 * @token2: the second input string associated to that transition 5716 * @data: data passed to the callback function if the transition is activated 5717 * 5718 * If @to is NULL, this creates first a new target state in the automata 5719 * and then adds a transition from the @from state to the target state 5720 * activated by the value of @token 5721 * 5722 * Returns the target state or NULL in case of error 5723 */ 5724 xmlAutomataStatePtr 5725 xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5726 xmlAutomataStatePtr to, const xmlChar *token, 5727 const xmlChar *token2, void *data) { 5728 xmlRegAtomPtr atom; 5729 5730 if ((am == NULL) || (from == NULL) || (token == NULL)) 5731 return(NULL); 5732 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5733 if (atom == NULL) 5734 return(NULL); 5735 atom->data = data; 5736 if ((token2 == NULL) || (*token2 == 0)) { 5737 atom->valuep = xmlStrdup(token); 5738 } else { 5739 int lenn, lenp; 5740 xmlChar *str; 5741 5742 lenn = strlen((char *) token2); 5743 lenp = strlen((char *) token); 5744 5745 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5746 if (str == NULL) { 5747 xmlRegFreeAtom(atom); 5748 return(NULL); 5749 } 5750 memcpy(&str[0], token, lenp); 5751 str[lenp] = '|'; 5752 memcpy(&str[lenp + 1], token2, lenn); 5753 str[lenn + lenp + 1] = 0; 5754 5755 atom->valuep = str; 5756 } 5757 5758 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5759 xmlRegFreeAtom(atom); 5760 return(NULL); 5761 } 5762 if (to == NULL) 5763 return(am->state); 5764 return(to); 5765 } 5766 5767 /** 5768 * xmlAutomataNewNegTrans: 5769 * @am: an automata 5770 * @from: the starting point of the transition 5771 * @to: the target point of the transition or NULL 5772 * @token: the first input string associated to that transition 5773 * @token2: the second input string associated to that transition 5774 * @data: data passed to the callback function if the transition is activated 5775 * 5776 * If @to is NULL, this creates first a new target state in the automata 5777 * and then adds a transition from the @from state to the target state 5778 * activated by any value except (@token,@token2) 5779 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow 5780 # the semantic of XSD ##other 5781 * 5782 * Returns the target state or NULL in case of error 5783 */ 5784 xmlAutomataStatePtr 5785 xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5786 xmlAutomataStatePtr to, const xmlChar *token, 5787 const xmlChar *token2, void *data) { 5788 xmlRegAtomPtr atom; 5789 xmlChar err_msg[200]; 5790 5791 if ((am == NULL) || (from == NULL) || (token == NULL)) 5792 return(NULL); 5793 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5794 if (atom == NULL) 5795 return(NULL); 5796 atom->data = data; 5797 atom->neg = 1; 5798 if ((token2 == NULL) || (*token2 == 0)) { 5799 atom->valuep = xmlStrdup(token); 5800 } else { 5801 int lenn, lenp; 5802 xmlChar *str; 5803 5804 lenn = strlen((char *) token2); 5805 lenp = strlen((char *) token); 5806 5807 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5808 if (str == NULL) { 5809 xmlRegFreeAtom(atom); 5810 return(NULL); 5811 } 5812 memcpy(&str[0], token, lenp); 5813 str[lenp] = '|'; 5814 memcpy(&str[lenp + 1], token2, lenn); 5815 str[lenn + lenp + 1] = 0; 5816 5817 atom->valuep = str; 5818 } 5819 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep); 5820 err_msg[199] = 0; 5821 atom->valuep2 = xmlStrdup(err_msg); 5822 5823 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5824 xmlRegFreeAtom(atom); 5825 return(NULL); 5826 } 5827 am->negs++; 5828 if (to == NULL) 5829 return(am->state); 5830 return(to); 5831 } 5832 5833 /** 5834 * xmlAutomataNewCountTrans2: 5835 * @am: an automata 5836 * @from: the starting point of the transition 5837 * @to: the target point of the transition or NULL 5838 * @token: the input string associated to that transition 5839 * @token2: the second input string associated to that transition 5840 * @min: the minimum successive occurences of token 5841 * @max: the maximum successive occurences of token 5842 * @data: data associated to the transition 5843 * 5844 * If @to is NULL, this creates first a new target state in the automata 5845 * and then adds a transition from the @from state to the target state 5846 * activated by a succession of input of value @token and @token2 and 5847 * whose number is between @min and @max 5848 * 5849 * Returns the target state or NULL in case of error 5850 */ 5851 xmlAutomataStatePtr 5852 xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5853 xmlAutomataStatePtr to, const xmlChar *token, 5854 const xmlChar *token2, 5855 int min, int max, void *data) { 5856 xmlRegAtomPtr atom; 5857 int counter; 5858 5859 if ((am == NULL) || (from == NULL) || (token == NULL)) 5860 return(NULL); 5861 if (min < 0) 5862 return(NULL); 5863 if ((max < min) || (max < 1)) 5864 return(NULL); 5865 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5866 if (atom == NULL) 5867 return(NULL); 5868 if ((token2 == NULL) || (*token2 == 0)) { 5869 atom->valuep = xmlStrdup(token); 5870 } else { 5871 int lenn, lenp; 5872 xmlChar *str; 5873 5874 lenn = strlen((char *) token2); 5875 lenp = strlen((char *) token); 5876 5877 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5878 if (str == NULL) { 5879 xmlRegFreeAtom(atom); 5880 return(NULL); 5881 } 5882 memcpy(&str[0], token, lenp); 5883 str[lenp] = '|'; 5884 memcpy(&str[lenp + 1], token2, lenn); 5885 str[lenn + lenp + 1] = 0; 5886 5887 atom->valuep = str; 5888 } 5889 atom->data = data; 5890 if (min == 0) 5891 atom->min = 1; 5892 else 5893 atom->min = min; 5894 atom->max = max; 5895 5896 /* 5897 * associate a counter to the transition. 5898 */ 5899 counter = xmlRegGetCounter(am); 5900 am->counters[counter].min = min; 5901 am->counters[counter].max = max; 5902 5903 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5904 if (to == NULL) { 5905 to = xmlRegNewState(am); 5906 xmlRegStatePush(am, to); 5907 } 5908 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5909 xmlRegAtomPush(am, atom); 5910 am->state = to; 5911 5912 if (to == NULL) 5913 to = am->state; 5914 if (to == NULL) 5915 return(NULL); 5916 if (min == 0) 5917 xmlFAGenerateEpsilonTransition(am, from, to); 5918 return(to); 5919 } 5920 5921 /** 5922 * xmlAutomataNewCountTrans: 5923 * @am: an automata 5924 * @from: the starting point of the transition 5925 * @to: the target point of the transition or NULL 5926 * @token: the input string associated to that transition 5927 * @min: the minimum successive occurences of token 5928 * @max: the maximum successive occurences of token 5929 * @data: data associated to the transition 5930 * 5931 * If @to is NULL, this creates first a new target state in the automata 5932 * and then adds a transition from the @from state to the target state 5933 * activated by a succession of input of value @token and whose number 5934 * is between @min and @max 5935 * 5936 * Returns the target state or NULL in case of error 5937 */ 5938 xmlAutomataStatePtr 5939 xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5940 xmlAutomataStatePtr to, const xmlChar *token, 5941 int min, int max, void *data) { 5942 xmlRegAtomPtr atom; 5943 int counter; 5944 5945 if ((am == NULL) || (from == NULL) || (token == NULL)) 5946 return(NULL); 5947 if (min < 0) 5948 return(NULL); 5949 if ((max < min) || (max < 1)) 5950 return(NULL); 5951 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5952 if (atom == NULL) 5953 return(NULL); 5954 atom->valuep = xmlStrdup(token); 5955 atom->data = data; 5956 if (min == 0) 5957 atom->min = 1; 5958 else 5959 atom->min = min; 5960 atom->max = max; 5961 5962 /* 5963 * associate a counter to the transition. 5964 */ 5965 counter = xmlRegGetCounter(am); 5966 am->counters[counter].min = min; 5967 am->counters[counter].max = max; 5968 5969 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5970 if (to == NULL) { 5971 to = xmlRegNewState(am); 5972 xmlRegStatePush(am, to); 5973 } 5974 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5975 xmlRegAtomPush(am, atom); 5976 am->state = to; 5977 5978 if (to == NULL) 5979 to = am->state; 5980 if (to == NULL) 5981 return(NULL); 5982 if (min == 0) 5983 xmlFAGenerateEpsilonTransition(am, from, to); 5984 return(to); 5985 } 5986 5987 /** 5988 * xmlAutomataNewOnceTrans2: 5989 * @am: an automata 5990 * @from: the starting point of the transition 5991 * @to: the target point of the transition or NULL 5992 * @token: the input string associated to that transition 5993 * @token2: the second input string associated to that transition 5994 * @min: the minimum successive occurences of token 5995 * @max: the maximum successive occurences of token 5996 * @data: data associated to the transition 5997 * 5998 * If @to is NULL, this creates first a new target state in the automata 5999 * and then adds a transition from the @from state to the target state 6000 * activated by a succession of input of value @token and @token2 and whose 6001 * number is between @min and @max, moreover that transition can only be 6002 * crossed once. 6003 * 6004 * Returns the target state or NULL in case of error 6005 */ 6006 xmlAutomataStatePtr 6007 xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 6008 xmlAutomataStatePtr to, const xmlChar *token, 6009 const xmlChar *token2, 6010 int min, int max, void *data) { 6011 xmlRegAtomPtr atom; 6012 int counter; 6013 6014 if ((am == NULL) || (from == NULL) || (token == NULL)) 6015 return(NULL); 6016 if (min < 1) 6017 return(NULL); 6018 if ((max < min) || (max < 1)) 6019 return(NULL); 6020 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 6021 if (atom == NULL) 6022 return(NULL); 6023 if ((token2 == NULL) || (*token2 == 0)) { 6024 atom->valuep = xmlStrdup(token); 6025 } else { 6026 int lenn, lenp; 6027 xmlChar *str; 6028 6029 lenn = strlen((char *) token2); 6030 lenp = strlen((char *) token); 6031 6032 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 6033 if (str == NULL) { 6034 xmlRegFreeAtom(atom); 6035 return(NULL); 6036 } 6037 memcpy(&str[0], token, lenp); 6038 str[lenp] = '|'; 6039 memcpy(&str[lenp + 1], token2, lenn); 6040 str[lenn + lenp + 1] = 0; 6041 6042 atom->valuep = str; 6043 } 6044 atom->data = data; 6045 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 6046 atom->min = min; 6047 atom->max = max; 6048 /* 6049 * associate a counter to the transition. 6050 */ 6051 counter = xmlRegGetCounter(am); 6052 am->counters[counter].min = 1; 6053 am->counters[counter].max = 1; 6054 6055 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6056 if (to == NULL) { 6057 to = xmlRegNewState(am); 6058 xmlRegStatePush(am, to); 6059 } 6060 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6061 xmlRegAtomPush(am, atom); 6062 am->state = to; 6063 return(to); 6064 } 6065 6066 6067 6068 /** 6069 * xmlAutomataNewOnceTrans: 6070 * @am: an automata 6071 * @from: the starting point of the transition 6072 * @to: the target point of the transition or NULL 6073 * @token: the input string associated to that transition 6074 * @min: the minimum successive occurences of token 6075 * @max: the maximum successive occurences of token 6076 * @data: data associated to the transition 6077 * 6078 * If @to is NULL, this creates first a new target state in the automata 6079 * and then adds a transition from the @from state to the target state 6080 * activated by a succession of input of value @token and whose number 6081 * is between @min and @max, moreover that transition can only be crossed 6082 * once. 6083 * 6084 * Returns the target state or NULL in case of error 6085 */ 6086 xmlAutomataStatePtr 6087 xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6088 xmlAutomataStatePtr to, const xmlChar *token, 6089 int min, int max, void *data) { 6090 xmlRegAtomPtr atom; 6091 int counter; 6092 6093 if ((am == NULL) || (from == NULL) || (token == NULL)) 6094 return(NULL); 6095 if (min < 1) 6096 return(NULL); 6097 if ((max < min) || (max < 1)) 6098 return(NULL); 6099 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 6100 if (atom == NULL) 6101 return(NULL); 6102 atom->valuep = xmlStrdup(token); 6103 atom->data = data; 6104 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 6105 atom->min = min; 6106 atom->max = max; 6107 /* 6108 * associate a counter to the transition. 6109 */ 6110 counter = xmlRegGetCounter(am); 6111 am->counters[counter].min = 1; 6112 am->counters[counter].max = 1; 6113 6114 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6115 if (to == NULL) { 6116 to = xmlRegNewState(am); 6117 xmlRegStatePush(am, to); 6118 } 6119 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6120 xmlRegAtomPush(am, atom); 6121 am->state = to; 6122 return(to); 6123 } 6124 6125 /** 6126 * xmlAutomataNewState: 6127 * @am: an automata 6128 * 6129 * Create a new disconnected state in the automata 6130 * 6131 * Returns the new state or NULL in case of error 6132 */ 6133 xmlAutomataStatePtr 6134 xmlAutomataNewState(xmlAutomataPtr am) { 6135 xmlAutomataStatePtr to; 6136 6137 if (am == NULL) 6138 return(NULL); 6139 to = xmlRegNewState(am); 6140 xmlRegStatePush(am, to); 6141 return(to); 6142 } 6143 6144 /** 6145 * xmlAutomataNewEpsilon: 6146 * @am: an automata 6147 * @from: the starting point of the transition 6148 * @to: the target point of the transition or NULL 6149 * 6150 * If @to is NULL, this creates first a new target state in the automata 6151 * and then adds an epsilon transition from the @from state to the 6152 * target state 6153 * 6154 * Returns the target state or NULL in case of error 6155 */ 6156 xmlAutomataStatePtr 6157 xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from, 6158 xmlAutomataStatePtr to) { 6159 if ((am == NULL) || (from == NULL)) 6160 return(NULL); 6161 xmlFAGenerateEpsilonTransition(am, from, to); 6162 if (to == NULL) 6163 return(am->state); 6164 return(to); 6165 } 6166 6167 /** 6168 * xmlAutomataNewAllTrans: 6169 * @am: an automata 6170 * @from: the starting point of the transition 6171 * @to: the target point of the transition or NULL 6172 * @lax: allow to transition if not all all transitions have been activated 6173 * 6174 * If @to is NULL, this creates first a new target state in the automata 6175 * and then adds a an ALL transition from the @from state to the 6176 * target state. That transition is an epsilon transition allowed only when 6177 * all transitions from the @from node have been activated. 6178 * 6179 * Returns the target state or NULL in case of error 6180 */ 6181 xmlAutomataStatePtr 6182 xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6183 xmlAutomataStatePtr to, int lax) { 6184 if ((am == NULL) || (from == NULL)) 6185 return(NULL); 6186 xmlFAGenerateAllTransition(am, from, to, lax); 6187 if (to == NULL) 6188 return(am->state); 6189 return(to); 6190 } 6191 6192 /** 6193 * xmlAutomataNewCounter: 6194 * @am: an automata 6195 * @min: the minimal value on the counter 6196 * @max: the maximal value on the counter 6197 * 6198 * Create a new counter 6199 * 6200 * Returns the counter number or -1 in case of error 6201 */ 6202 int 6203 xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) { 6204 int ret; 6205 6206 if (am == NULL) 6207 return(-1); 6208 6209 ret = xmlRegGetCounter(am); 6210 if (ret < 0) 6211 return(-1); 6212 am->counters[ret].min = min; 6213 am->counters[ret].max = max; 6214 return(ret); 6215 } 6216 6217 /** 6218 * xmlAutomataNewCountedTrans: 6219 * @am: an automata 6220 * @from: the starting point of the transition 6221 * @to: the target point of the transition or NULL 6222 * @counter: the counter associated to that transition 6223 * 6224 * If @to is NULL, this creates first a new target state in the automata 6225 * and then adds an epsilon transition from the @from state to the target state 6226 * which will increment the counter provided 6227 * 6228 * Returns the target state or NULL in case of error 6229 */ 6230 xmlAutomataStatePtr 6231 xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6232 xmlAutomataStatePtr to, int counter) { 6233 if ((am == NULL) || (from == NULL) || (counter < 0)) 6234 return(NULL); 6235 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter); 6236 if (to == NULL) 6237 return(am->state); 6238 return(to); 6239 } 6240 6241 /** 6242 * xmlAutomataNewCounterTrans: 6243 * @am: an automata 6244 * @from: the starting point of the transition 6245 * @to: the target point of the transition or NULL 6246 * @counter: the counter associated to that transition 6247 * 6248 * If @to is NULL, this creates first a new target state in the automata 6249 * and then adds an epsilon transition from the @from state to the target state 6250 * which will be allowed only if the counter is within the right range. 6251 * 6252 * Returns the target state or NULL in case of error 6253 */ 6254 xmlAutomataStatePtr 6255 xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6256 xmlAutomataStatePtr to, int counter) { 6257 if ((am == NULL) || (from == NULL) || (counter < 0)) 6258 return(NULL); 6259 xmlFAGenerateCountedTransition(am, from, to, counter); 6260 if (to == NULL) 6261 return(am->state); 6262 return(to); 6263 } 6264 6265 /** 6266 * xmlAutomataCompile: 6267 * @am: an automata 6268 * 6269 * Compile the automata into a Reg Exp ready for being executed. 6270 * The automata should be free after this point. 6271 * 6272 * Returns the compiled regexp or NULL in case of error 6273 */ 6274 xmlRegexpPtr 6275 xmlAutomataCompile(xmlAutomataPtr am) { 6276 xmlRegexpPtr ret; 6277 6278 if ((am == NULL) || (am->error != 0)) return(NULL); 6279 xmlFAEliminateEpsilonTransitions(am); 6280 /* xmlFAComputesDeterminism(am); */ 6281 ret = xmlRegEpxFromParse(am); 6282 6283 return(ret); 6284 } 6285 6286 /** 6287 * xmlAutomataIsDeterminist: 6288 * @am: an automata 6289 * 6290 * Checks if an automata is determinist. 6291 * 6292 * Returns 1 if true, 0 if not, and -1 in case of error 6293 */ 6294 int 6295 xmlAutomataIsDeterminist(xmlAutomataPtr am) { 6296 int ret; 6297 6298 if (am == NULL) 6299 return(-1); 6300 6301 ret = xmlFAComputesDeterminism(am); 6302 return(ret); 6303 } 6304 #endif /* LIBXML_AUTOMATA_ENABLED */ 6305 6306 #ifdef LIBXML_EXPR_ENABLED 6307 /************************************************************************ 6308 * * 6309 * Formal Expression handling code * 6310 * * 6311 ************************************************************************/ 6312 /************************************************************************ 6313 * * 6314 * Expression handling context * 6315 * * 6316 ************************************************************************/ 6317 6318 struct _xmlExpCtxt { 6319 xmlDictPtr dict; 6320 xmlExpNodePtr *table; 6321 int size; 6322 int nbElems; 6323 int nb_nodes; 6324 int maxNodes; 6325 const char *expr; 6326 const char *cur; 6327 int nb_cons; 6328 int tabSize; 6329 }; 6330 6331 /** 6332 * xmlExpNewCtxt: 6333 * @maxNodes: the maximum number of nodes 6334 * @dict: optional dictionnary to use internally 6335 * 6336 * Creates a new context for manipulating expressions 6337 * 6338 * Returns the context or NULL in case of error 6339 */ 6340 xmlExpCtxtPtr 6341 xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) { 6342 xmlExpCtxtPtr ret; 6343 int size = 256; 6344 6345 if (maxNodes <= 4096) 6346 maxNodes = 4096; 6347 6348 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt)); 6349 if (ret == NULL) 6350 return(NULL); 6351 memset(ret, 0, sizeof(xmlExpCtxt)); 6352 ret->size = size; 6353 ret->nbElems = 0; 6354 ret->maxNodes = maxNodes; 6355 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr)); 6356 if (ret->table == NULL) { 6357 xmlFree(ret); 6358 return(NULL); 6359 } 6360 memset(ret->table, 0, size * sizeof(xmlExpNodePtr)); 6361 if (dict == NULL) { 6362 ret->dict = xmlDictCreate(); 6363 if (ret->dict == NULL) { 6364 xmlFree(ret->table); 6365 xmlFree(ret); 6366 return(NULL); 6367 } 6368 } else { 6369 ret->dict = dict; 6370 xmlDictReference(ret->dict); 6371 } 6372 return(ret); 6373 } 6374 6375 /** 6376 * xmlExpFreeCtxt: 6377 * @ctxt: an expression context 6378 * 6379 * Free an expression context 6380 */ 6381 void 6382 xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) { 6383 if (ctxt == NULL) 6384 return; 6385 xmlDictFree(ctxt->dict); 6386 if (ctxt->table != NULL) 6387 xmlFree(ctxt->table); 6388 xmlFree(ctxt); 6389 } 6390 6391 /************************************************************************ 6392 * * 6393 * Structure associated to an expression node * 6394 * * 6395 ************************************************************************/ 6396 #define MAX_NODES 10000 6397 6398 /* #define DEBUG_DERIV */ 6399 6400 /* 6401 * TODO: 6402 * - Wildcards 6403 * - public API for creation 6404 * 6405 * Started 6406 * - regression testing 6407 * 6408 * Done 6409 * - split into module and test tool 6410 * - memleaks 6411 */ 6412 6413 typedef enum { 6414 XML_EXP_NILABLE = (1 << 0) 6415 } xmlExpNodeInfo; 6416 6417 #define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE) 6418 6419 struct _xmlExpNode { 6420 unsigned char type;/* xmlExpNodeType */ 6421 unsigned char info;/* OR of xmlExpNodeInfo */ 6422 unsigned short key; /* the hash key */ 6423 unsigned int ref; /* The number of references */ 6424 int c_max; /* the maximum length it can consume */ 6425 xmlExpNodePtr exp_left; 6426 xmlExpNodePtr next;/* the next node in the hash table or free list */ 6427 union { 6428 struct { 6429 int f_min; 6430 int f_max; 6431 } count; 6432 struct { 6433 xmlExpNodePtr f_right; 6434 } children; 6435 const xmlChar *f_str; 6436 } field; 6437 }; 6438 6439 #define exp_min field.count.f_min 6440 #define exp_max field.count.f_max 6441 /* #define exp_left field.children.f_left */ 6442 #define exp_right field.children.f_right 6443 #define exp_str field.f_str 6444 6445 static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type); 6446 static xmlExpNode forbiddenExpNode = { 6447 XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6448 }; 6449 xmlExpNodePtr forbiddenExp = &forbiddenExpNode; 6450 static xmlExpNode emptyExpNode = { 6451 XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6452 }; 6453 xmlExpNodePtr emptyExp = &emptyExpNode; 6454 6455 /************************************************************************ 6456 * * 6457 * The custom hash table for unicity and canonicalization * 6458 * of sub-expressions pointers * 6459 * * 6460 ************************************************************************/ 6461 /* 6462 * xmlExpHashNameComputeKey: 6463 * Calculate the hash key for a token 6464 */ 6465 static unsigned short 6466 xmlExpHashNameComputeKey(const xmlChar *name) { 6467 unsigned short value = 0L; 6468 char ch; 6469 6470 if (name != NULL) { 6471 value += 30 * (*name); 6472 while ((ch = *name++) != 0) { 6473 value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch); 6474 } 6475 } 6476 return (value); 6477 } 6478 6479 /* 6480 * xmlExpHashComputeKey: 6481 * Calculate the hash key for a compound expression 6482 */ 6483 static unsigned short 6484 xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left, 6485 xmlExpNodePtr right) { 6486 unsigned long value; 6487 unsigned short ret; 6488 6489 switch (type) { 6490 case XML_EXP_SEQ: 6491 value = left->key; 6492 value += right->key; 6493 value *= 3; 6494 ret = (unsigned short) value; 6495 break; 6496 case XML_EXP_OR: 6497 value = left->key; 6498 value += right->key; 6499 value *= 7; 6500 ret = (unsigned short) value; 6501 break; 6502 case XML_EXP_COUNT: 6503 value = left->key; 6504 value += right->key; 6505 ret = (unsigned short) value; 6506 break; 6507 default: 6508 ret = 0; 6509 } 6510 return(ret); 6511 } 6512 6513 6514 static xmlExpNodePtr 6515 xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) { 6516 xmlExpNodePtr ret; 6517 6518 if (ctxt->nb_nodes >= MAX_NODES) 6519 return(NULL); 6520 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode)); 6521 if (ret == NULL) 6522 return(NULL); 6523 memset(ret, 0, sizeof(xmlExpNode)); 6524 ret->type = type; 6525 ret->next = NULL; 6526 ctxt->nb_nodes++; 6527 ctxt->nb_cons++; 6528 return(ret); 6529 } 6530 6531 /** 6532 * xmlExpHashGetEntry: 6533 * @table: the hash table 6534 * 6535 * Get the unique entry from the hash table. The entry is created if 6536 * needed. @left and @right are consumed, i.e. their ref count will 6537 * be decremented by the operation. 6538 * 6539 * Returns the pointer or NULL in case of error 6540 */ 6541 static xmlExpNodePtr 6542 xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type, 6543 xmlExpNodePtr left, xmlExpNodePtr right, 6544 const xmlChar *name, int min, int max) { 6545 unsigned short kbase, key; 6546 xmlExpNodePtr entry; 6547 xmlExpNodePtr insert; 6548 6549 if (ctxt == NULL) 6550 return(NULL); 6551 6552 /* 6553 * Check for duplicate and insertion location. 6554 */ 6555 if (type == XML_EXP_ATOM) { 6556 kbase = xmlExpHashNameComputeKey(name); 6557 } else if (type == XML_EXP_COUNT) { 6558 /* COUNT reduction rule 1 */ 6559 /* a{1} -> a */ 6560 if (min == max) { 6561 if (min == 1) { 6562 return(left); 6563 } 6564 if (min == 0) { 6565 xmlExpFree(ctxt, left); 6566 return(emptyExp); 6567 } 6568 } 6569 if (min < 0) { 6570 xmlExpFree(ctxt, left); 6571 return(forbiddenExp); 6572 } 6573 if (max == -1) 6574 kbase = min + 79; 6575 else 6576 kbase = max - min; 6577 kbase += left->key; 6578 } else if (type == XML_EXP_OR) { 6579 /* Forbid reduction rules */ 6580 if (left->type == XML_EXP_FORBID) { 6581 xmlExpFree(ctxt, left); 6582 return(right); 6583 } 6584 if (right->type == XML_EXP_FORBID) { 6585 xmlExpFree(ctxt, right); 6586 return(left); 6587 } 6588 6589 /* OR reduction rule 1 */ 6590 /* a | a reduced to a */ 6591 if (left == right) { 6592 left->ref--; 6593 return(left); 6594 } 6595 /* OR canonicalization rule 1 */ 6596 /* linearize (a | b) | c into a | (b | c) */ 6597 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) { 6598 xmlExpNodePtr tmp = left; 6599 left = right; 6600 right = tmp; 6601 } 6602 /* OR reduction rule 2 */ 6603 /* a | (a | b) and b | (a | b) are reduced to a | b */ 6604 if (right->type == XML_EXP_OR) { 6605 if ((left == right->exp_left) || 6606 (left == right->exp_right)) { 6607 xmlExpFree(ctxt, left); 6608 return(right); 6609 } 6610 } 6611 /* OR canonicalization rule 2 */ 6612 /* linearize (a | b) | c into a | (b | c) */ 6613 if (left->type == XML_EXP_OR) { 6614 xmlExpNodePtr tmp; 6615 6616 /* OR canonicalization rule 2 */ 6617 if ((left->exp_right->type != XML_EXP_OR) && 6618 (left->exp_right->key < left->exp_left->key)) { 6619 tmp = left->exp_right; 6620 left->exp_right = left->exp_left; 6621 left->exp_left = tmp; 6622 } 6623 left->exp_right->ref++; 6624 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right, 6625 NULL, 0, 0); 6626 left->exp_left->ref++; 6627 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp, 6628 NULL, 0, 0); 6629 6630 xmlExpFree(ctxt, left); 6631 return(tmp); 6632 } 6633 if (right->type == XML_EXP_OR) { 6634 /* Ordering in the tree */ 6635 /* C | (A | B) -> A | (B | C) */ 6636 if (left->key > right->exp_right->key) { 6637 xmlExpNodePtr tmp; 6638 right->exp_right->ref++; 6639 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right, 6640 left, NULL, 0, 0); 6641 right->exp_left->ref++; 6642 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6643 tmp, NULL, 0, 0); 6644 xmlExpFree(ctxt, right); 6645 return(tmp); 6646 } 6647 /* Ordering in the tree */ 6648 /* B | (A | C) -> A | (B | C) */ 6649 if (left->key > right->exp_left->key) { 6650 xmlExpNodePtr tmp; 6651 right->exp_right->ref++; 6652 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, 6653 right->exp_right, NULL, 0, 0); 6654 right->exp_left->ref++; 6655 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6656 tmp, NULL, 0, 0); 6657 xmlExpFree(ctxt, right); 6658 return(tmp); 6659 } 6660 } 6661 /* we know both types are != XML_EXP_OR here */ 6662 else if (left->key > right->key) { 6663 xmlExpNodePtr tmp = left; 6664 left = right; 6665 right = tmp; 6666 } 6667 kbase = xmlExpHashComputeKey(type, left, right); 6668 } else if (type == XML_EXP_SEQ) { 6669 /* Forbid reduction rules */ 6670 if (left->type == XML_EXP_FORBID) { 6671 xmlExpFree(ctxt, right); 6672 return(left); 6673 } 6674 if (right->type == XML_EXP_FORBID) { 6675 xmlExpFree(ctxt, left); 6676 return(right); 6677 } 6678 /* Empty reduction rules */ 6679 if (right->type == XML_EXP_EMPTY) { 6680 return(left); 6681 } 6682 if (left->type == XML_EXP_EMPTY) { 6683 return(right); 6684 } 6685 kbase = xmlExpHashComputeKey(type, left, right); 6686 } else 6687 return(NULL); 6688 6689 key = kbase % ctxt->size; 6690 if (ctxt->table[key] != NULL) { 6691 for (insert = ctxt->table[key]; insert != NULL; 6692 insert = insert->next) { 6693 if ((insert->key == kbase) && 6694 (insert->type == type)) { 6695 if (type == XML_EXP_ATOM) { 6696 if (name == insert->exp_str) { 6697 insert->ref++; 6698 return(insert); 6699 } 6700 } else if (type == XML_EXP_COUNT) { 6701 if ((insert->exp_min == min) && (insert->exp_max == max) && 6702 (insert->exp_left == left)) { 6703 insert->ref++; 6704 left->ref--; 6705 return(insert); 6706 } 6707 } else if ((insert->exp_left == left) && 6708 (insert->exp_right == right)) { 6709 insert->ref++; 6710 left->ref--; 6711 right->ref--; 6712 return(insert); 6713 } 6714 } 6715 } 6716 } 6717 6718 entry = xmlExpNewNode(ctxt, type); 6719 if (entry == NULL) 6720 return(NULL); 6721 entry->key = kbase; 6722 if (type == XML_EXP_ATOM) { 6723 entry->exp_str = name; 6724 entry->c_max = 1; 6725 } else if (type == XML_EXP_COUNT) { 6726 entry->exp_min = min; 6727 entry->exp_max = max; 6728 entry->exp_left = left; 6729 if ((min == 0) || (IS_NILLABLE(left))) 6730 entry->info |= XML_EXP_NILABLE; 6731 if (max < 0) 6732 entry->c_max = -1; 6733 else 6734 entry->c_max = max * entry->exp_left->c_max; 6735 } else { 6736 entry->exp_left = left; 6737 entry->exp_right = right; 6738 if (type == XML_EXP_OR) { 6739 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right))) 6740 entry->info |= XML_EXP_NILABLE; 6741 if ((entry->exp_left->c_max == -1) || 6742 (entry->exp_right->c_max == -1)) 6743 entry->c_max = -1; 6744 else if (entry->exp_left->c_max > entry->exp_right->c_max) 6745 entry->c_max = entry->exp_left->c_max; 6746 else 6747 entry->c_max = entry->exp_right->c_max; 6748 } else { 6749 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right))) 6750 entry->info |= XML_EXP_NILABLE; 6751 if ((entry->exp_left->c_max == -1) || 6752 (entry->exp_right->c_max == -1)) 6753 entry->c_max = -1; 6754 else 6755 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max; 6756 } 6757 } 6758 entry->ref = 1; 6759 if (ctxt->table[key] != NULL) 6760 entry->next = ctxt->table[key]; 6761 6762 ctxt->table[key] = entry; 6763 ctxt->nbElems++; 6764 6765 return(entry); 6766 } 6767 6768 /** 6769 * xmlExpFree: 6770 * @ctxt: the expression context 6771 * @exp: the expression 6772 * 6773 * Dereference the expression 6774 */ 6775 void 6776 xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) { 6777 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp)) 6778 return; 6779 exp->ref--; 6780 if (exp->ref == 0) { 6781 unsigned short key; 6782 6783 /* Unlink it first from the hash table */ 6784 key = exp->key % ctxt->size; 6785 if (ctxt->table[key] == exp) { 6786 ctxt->table[key] = exp->next; 6787 } else { 6788 xmlExpNodePtr tmp; 6789 6790 tmp = ctxt->table[key]; 6791 while (tmp != NULL) { 6792 if (tmp->next == exp) { 6793 tmp->next = exp->next; 6794 break; 6795 } 6796 tmp = tmp->next; 6797 } 6798 } 6799 6800 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) { 6801 xmlExpFree(ctxt, exp->exp_left); 6802 xmlExpFree(ctxt, exp->exp_right); 6803 } else if (exp->type == XML_EXP_COUNT) { 6804 xmlExpFree(ctxt, exp->exp_left); 6805 } 6806 xmlFree(exp); 6807 ctxt->nb_nodes--; 6808 } 6809 } 6810 6811 /** 6812 * xmlExpRef: 6813 * @exp: the expression 6814 * 6815 * Increase the reference count of the expression 6816 */ 6817 void 6818 xmlExpRef(xmlExpNodePtr exp) { 6819 if (exp != NULL) 6820 exp->ref++; 6821 } 6822 6823 /** 6824 * xmlExpNewAtom: 6825 * @ctxt: the expression context 6826 * @name: the atom name 6827 * @len: the atom name lenght in byte (or -1); 6828 * 6829 * Get the atom associated to this name from that context 6830 * 6831 * Returns the node or NULL in case of error 6832 */ 6833 xmlExpNodePtr 6834 xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) { 6835 if ((ctxt == NULL) || (name == NULL)) 6836 return(NULL); 6837 name = xmlDictLookup(ctxt->dict, name, len); 6838 if (name == NULL) 6839 return(NULL); 6840 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0)); 6841 } 6842 6843 /** 6844 * xmlExpNewOr: 6845 * @ctxt: the expression context 6846 * @left: left expression 6847 * @right: right expression 6848 * 6849 * Get the atom associated to the choice @left | @right 6850 * Note that @left and @right are consumed in the operation, to keep 6851 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6852 * this is true even in case of failure (unless ctxt == NULL). 6853 * 6854 * Returns the node or NULL in case of error 6855 */ 6856 xmlExpNodePtr 6857 xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6858 if (ctxt == NULL) 6859 return(NULL); 6860 if ((left == NULL) || (right == NULL)) { 6861 xmlExpFree(ctxt, left); 6862 xmlExpFree(ctxt, right); 6863 return(NULL); 6864 } 6865 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0)); 6866 } 6867 6868 /** 6869 * xmlExpNewSeq: 6870 * @ctxt: the expression context 6871 * @left: left expression 6872 * @right: right expression 6873 * 6874 * Get the atom associated to the sequence @left , @right 6875 * Note that @left and @right are consumed in the operation, to keep 6876 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6877 * this is true even in case of failure (unless ctxt == NULL). 6878 * 6879 * Returns the node or NULL in case of error 6880 */ 6881 xmlExpNodePtr 6882 xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6883 if (ctxt == NULL) 6884 return(NULL); 6885 if ((left == NULL) || (right == NULL)) { 6886 xmlExpFree(ctxt, left); 6887 xmlExpFree(ctxt, right); 6888 return(NULL); 6889 } 6890 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0)); 6891 } 6892 6893 /** 6894 * xmlExpNewRange: 6895 * @ctxt: the expression context 6896 * @subset: the expression to be repeated 6897 * @min: the lower bound for the repetition 6898 * @max: the upper bound for the repetition, -1 means infinite 6899 * 6900 * Get the atom associated to the range (@subset){@min, @max} 6901 * Note that @subset is consumed in the operation, to keep 6902 * an handle on it use xmlExpRef() and use xmlExpFree() to release it, 6903 * this is true even in case of failure (unless ctxt == NULL). 6904 * 6905 * Returns the node or NULL in case of error 6906 */ 6907 xmlExpNodePtr 6908 xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) { 6909 if (ctxt == NULL) 6910 return(NULL); 6911 if ((subset == NULL) || (min < 0) || (max < -1) || 6912 ((max >= 0) && (min > max))) { 6913 xmlExpFree(ctxt, subset); 6914 return(NULL); 6915 } 6916 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset, 6917 NULL, NULL, min, max)); 6918 } 6919 6920 /************************************************************************ 6921 * * 6922 * Public API for operations on expressions * 6923 * * 6924 ************************************************************************/ 6925 6926 static int 6927 xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6928 const xmlChar**list, int len, int nb) { 6929 int tmp, tmp2; 6930 tail: 6931 switch (exp->type) { 6932 case XML_EXP_EMPTY: 6933 return(0); 6934 case XML_EXP_ATOM: 6935 for (tmp = 0;tmp < nb;tmp++) 6936 if (list[tmp] == exp->exp_str) 6937 return(0); 6938 if (nb >= len) 6939 return(-2); 6940 list[nb] = exp->exp_str; 6941 return(1); 6942 case XML_EXP_COUNT: 6943 exp = exp->exp_left; 6944 goto tail; 6945 case XML_EXP_SEQ: 6946 case XML_EXP_OR: 6947 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb); 6948 if (tmp < 0) 6949 return(tmp); 6950 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len, 6951 nb + tmp); 6952 if (tmp2 < 0) 6953 return(tmp2); 6954 return(tmp + tmp2); 6955 } 6956 return(-1); 6957 } 6958 6959 /** 6960 * xmlExpGetLanguage: 6961 * @ctxt: the expression context 6962 * @exp: the expression 6963 * @langList: where to store the tokens 6964 * @len: the allocated lenght of @list 6965 * 6966 * Find all the strings used in @exp and store them in @list 6967 * 6968 * Returns the number of unique strings found, -1 in case of errors and 6969 * -2 if there is more than @len strings 6970 */ 6971 int 6972 xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6973 const xmlChar**langList, int len) { 6974 if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0)) 6975 return(-1); 6976 return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0)); 6977 } 6978 6979 static int 6980 xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6981 const xmlChar**list, int len, int nb) { 6982 int tmp, tmp2; 6983 tail: 6984 switch (exp->type) { 6985 case XML_EXP_FORBID: 6986 return(0); 6987 case XML_EXP_EMPTY: 6988 return(0); 6989 case XML_EXP_ATOM: 6990 for (tmp = 0;tmp < nb;tmp++) 6991 if (list[tmp] == exp->exp_str) 6992 return(0); 6993 if (nb >= len) 6994 return(-2); 6995 list[nb] = exp->exp_str; 6996 return(1); 6997 case XML_EXP_COUNT: 6998 exp = exp->exp_left; 6999 goto tail; 7000 case XML_EXP_SEQ: 7001 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 7002 if (tmp < 0) 7003 return(tmp); 7004 if (IS_NILLABLE(exp->exp_left)) { 7005 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 7006 nb + tmp); 7007 if (tmp2 < 0) 7008 return(tmp2); 7009 tmp += tmp2; 7010 } 7011 return(tmp); 7012 case XML_EXP_OR: 7013 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 7014 if (tmp < 0) 7015 return(tmp); 7016 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 7017 nb + tmp); 7018 if (tmp2 < 0) 7019 return(tmp2); 7020 return(tmp + tmp2); 7021 } 7022 return(-1); 7023 } 7024 7025 /** 7026 * xmlExpGetStart: 7027 * @ctxt: the expression context 7028 * @exp: the expression 7029 * @tokList: where to store the tokens 7030 * @len: the allocated lenght of @list 7031 * 7032 * Find all the strings that appears at the start of the languages 7033 * accepted by @exp and store them in @list. E.g. for (a, b) | c 7034 * it will return the list [a, c] 7035 * 7036 * Returns the number of unique strings found, -1 in case of errors and 7037 * -2 if there is more than @len strings 7038 */ 7039 int 7040 xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7041 const xmlChar**tokList, int len) { 7042 if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0)) 7043 return(-1); 7044 return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0)); 7045 } 7046 7047 /** 7048 * xmlExpIsNillable: 7049 * @exp: the expression 7050 * 7051 * Finds if the expression is nillable, i.e. if it accepts the empty sequqnce 7052 * 7053 * Returns 1 if nillable, 0 if not and -1 in case of error 7054 */ 7055 int 7056 xmlExpIsNillable(xmlExpNodePtr exp) { 7057 if (exp == NULL) 7058 return(-1); 7059 return(IS_NILLABLE(exp) != 0); 7060 } 7061 7062 static xmlExpNodePtr 7063 xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str) 7064 { 7065 xmlExpNodePtr ret; 7066 7067 switch (exp->type) { 7068 case XML_EXP_EMPTY: 7069 return(forbiddenExp); 7070 case XML_EXP_FORBID: 7071 return(forbiddenExp); 7072 case XML_EXP_ATOM: 7073 if (exp->exp_str == str) { 7074 #ifdef DEBUG_DERIV 7075 printf("deriv atom: equal => Empty\n"); 7076 #endif 7077 ret = emptyExp; 7078 } else { 7079 #ifdef DEBUG_DERIV 7080 printf("deriv atom: mismatch => forbid\n"); 7081 #endif 7082 /* TODO wildcards here */ 7083 ret = forbiddenExp; 7084 } 7085 return(ret); 7086 case XML_EXP_OR: { 7087 xmlExpNodePtr tmp; 7088 7089 #ifdef DEBUG_DERIV 7090 printf("deriv or: => or(derivs)\n"); 7091 #endif 7092 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7093 if (tmp == NULL) { 7094 return(NULL); 7095 } 7096 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 7097 if (ret == NULL) { 7098 xmlExpFree(ctxt, tmp); 7099 return(NULL); 7100 } 7101 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, 7102 NULL, 0, 0); 7103 return(ret); 7104 } 7105 case XML_EXP_SEQ: 7106 #ifdef DEBUG_DERIV 7107 printf("deriv seq: starting with left\n"); 7108 #endif 7109 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7110 if (ret == NULL) { 7111 return(NULL); 7112 } else if (ret == forbiddenExp) { 7113 if (IS_NILLABLE(exp->exp_left)) { 7114 #ifdef DEBUG_DERIV 7115 printf("deriv seq: left failed but nillable\n"); 7116 #endif 7117 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 7118 } 7119 } else { 7120 #ifdef DEBUG_DERIV 7121 printf("deriv seq: left match => sequence\n"); 7122 #endif 7123 exp->exp_right->ref++; 7124 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right, 7125 NULL, 0, 0); 7126 } 7127 return(ret); 7128 case XML_EXP_COUNT: { 7129 int min, max; 7130 xmlExpNodePtr tmp; 7131 7132 if (exp->exp_max == 0) 7133 return(forbiddenExp); 7134 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7135 if (ret == NULL) 7136 return(NULL); 7137 if (ret == forbiddenExp) { 7138 #ifdef DEBUG_DERIV 7139 printf("deriv count: pattern mismatch => forbid\n"); 7140 #endif 7141 return(ret); 7142 } 7143 if (exp->exp_max == 1) 7144 return(ret); 7145 if (exp->exp_max < 0) /* unbounded */ 7146 max = -1; 7147 else 7148 max = exp->exp_max - 1; 7149 if (exp->exp_min > 0) 7150 min = exp->exp_min - 1; 7151 else 7152 min = 0; 7153 exp->exp_left->ref++; 7154 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL, 7155 NULL, min, max); 7156 if (ret == emptyExp) { 7157 #ifdef DEBUG_DERIV 7158 printf("deriv count: match to empty => new count\n"); 7159 #endif 7160 return(tmp); 7161 } 7162 #ifdef DEBUG_DERIV 7163 printf("deriv count: match => sequence with new count\n"); 7164 #endif 7165 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp, 7166 NULL, 0, 0)); 7167 } 7168 } 7169 return(NULL); 7170 } 7171 7172 /** 7173 * xmlExpStringDerive: 7174 * @ctxt: the expression context 7175 * @exp: the expression 7176 * @str: the string 7177 * @len: the string len in bytes if available 7178 * 7179 * Do one step of Brzozowski derivation of the expression @exp with 7180 * respect to the input string 7181 * 7182 * Returns the resulting expression or NULL in case of internal error 7183 */ 7184 xmlExpNodePtr 7185 xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7186 const xmlChar *str, int len) { 7187 const xmlChar *input; 7188 7189 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) { 7190 return(NULL); 7191 } 7192 /* 7193 * check the string is in the dictionnary, if yes use an interned 7194 * copy, otherwise we know it's not an acceptable input 7195 */ 7196 input = xmlDictExists(ctxt->dict, str, len); 7197 if (input == NULL) { 7198 return(forbiddenExp); 7199 } 7200 return(xmlExpStringDeriveInt(ctxt, exp, input)); 7201 } 7202 7203 static int 7204 xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) { 7205 int ret = 1; 7206 7207 if (sub->c_max == -1) { 7208 if (exp->c_max != -1) 7209 ret = 0; 7210 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) { 7211 ret = 0; 7212 } 7213 #if 0 7214 if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp))) 7215 ret = 0; 7216 #endif 7217 return(ret); 7218 } 7219 7220 static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7221 xmlExpNodePtr sub); 7222 /** 7223 * xmlExpDivide: 7224 * @ctxt: the expressions context 7225 * @exp: the englobing expression 7226 * @sub: the subexpression 7227 * @mult: the multiple expression 7228 * @remain: the remain from the derivation of the multiple 7229 * 7230 * Check if exp is a multiple of sub, i.e. if there is a finite number n 7231 * so that sub{n} subsume exp 7232 * 7233 * Returns the multiple value if successful, 0 if it is not a multiple 7234 * and -1 in case of internel error. 7235 */ 7236 7237 static int 7238 xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub, 7239 xmlExpNodePtr *mult, xmlExpNodePtr *remain) { 7240 int i; 7241 xmlExpNodePtr tmp, tmp2; 7242 7243 if (mult != NULL) *mult = NULL; 7244 if (remain != NULL) *remain = NULL; 7245 if (exp->c_max == -1) return(0); 7246 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0); 7247 7248 for (i = 1;i <= exp->c_max;i++) { 7249 sub->ref++; 7250 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7251 sub, NULL, NULL, i, i); 7252 if (tmp == NULL) { 7253 return(-1); 7254 } 7255 if (!xmlExpCheckCard(tmp, exp)) { 7256 xmlExpFree(ctxt, tmp); 7257 continue; 7258 } 7259 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp); 7260 if (tmp2 == NULL) { 7261 xmlExpFree(ctxt, tmp); 7262 return(-1); 7263 } 7264 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) { 7265 if (remain != NULL) 7266 *remain = tmp2; 7267 else 7268 xmlExpFree(ctxt, tmp2); 7269 if (mult != NULL) 7270 *mult = tmp; 7271 else 7272 xmlExpFree(ctxt, tmp); 7273 #ifdef DEBUG_DERIV 7274 printf("Divide succeeded %d\n", i); 7275 #endif 7276 return(i); 7277 } 7278 xmlExpFree(ctxt, tmp); 7279 xmlExpFree(ctxt, tmp2); 7280 } 7281 #ifdef DEBUG_DERIV 7282 printf("Divide failed\n"); 7283 #endif 7284 return(0); 7285 } 7286 7287 /** 7288 * xmlExpExpDeriveInt: 7289 * @ctxt: the expressions context 7290 * @exp: the englobing expression 7291 * @sub: the subexpression 7292 * 7293 * Try to do a step of Brzozowski derivation but at a higher level 7294 * the input being a subexpression. 7295 * 7296 * Returns the resulting expression or NULL in case of internal error 7297 */ 7298 static xmlExpNodePtr 7299 xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7300 xmlExpNodePtr ret, tmp, tmp2, tmp3; 7301 const xmlChar **tab; 7302 int len, i; 7303 7304 /* 7305 * In case of equality and if the expression can only consume a finite 7306 * amount, then the derivation is empty 7307 */ 7308 if ((exp == sub) && (exp->c_max >= 0)) { 7309 #ifdef DEBUG_DERIV 7310 printf("Equal(exp, sub) and finite -> Empty\n"); 7311 #endif 7312 return(emptyExp); 7313 } 7314 /* 7315 * decompose sub sequence first 7316 */ 7317 if (sub->type == XML_EXP_EMPTY) { 7318 #ifdef DEBUG_DERIV 7319 printf("Empty(sub) -> Empty\n"); 7320 #endif 7321 exp->ref++; 7322 return(exp); 7323 } 7324 if (sub->type == XML_EXP_SEQ) { 7325 #ifdef DEBUG_DERIV 7326 printf("Seq(sub) -> decompose\n"); 7327 #endif 7328 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7329 if (tmp == NULL) 7330 return(NULL); 7331 if (tmp == forbiddenExp) 7332 return(tmp); 7333 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right); 7334 xmlExpFree(ctxt, tmp); 7335 return(ret); 7336 } 7337 if (sub->type == XML_EXP_OR) { 7338 #ifdef DEBUG_DERIV 7339 printf("Or(sub) -> decompose\n"); 7340 #endif 7341 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7342 if (tmp == forbiddenExp) 7343 return(tmp); 7344 if (tmp == NULL) 7345 return(NULL); 7346 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right); 7347 if ((ret == NULL) || (ret == forbiddenExp)) { 7348 xmlExpFree(ctxt, tmp); 7349 return(ret); 7350 } 7351 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0)); 7352 } 7353 if (!xmlExpCheckCard(exp, sub)) { 7354 #ifdef DEBUG_DERIV 7355 printf("CheckCard(exp, sub) failed -> Forbid\n"); 7356 #endif 7357 return(forbiddenExp); 7358 } 7359 switch (exp->type) { 7360 case XML_EXP_EMPTY: 7361 if (sub == emptyExp) 7362 return(emptyExp); 7363 #ifdef DEBUG_DERIV 7364 printf("Empty(exp) -> Forbid\n"); 7365 #endif 7366 return(forbiddenExp); 7367 case XML_EXP_FORBID: 7368 #ifdef DEBUG_DERIV 7369 printf("Forbid(exp) -> Forbid\n"); 7370 #endif 7371 return(forbiddenExp); 7372 case XML_EXP_ATOM: 7373 if (sub->type == XML_EXP_ATOM) { 7374 /* TODO: handle wildcards */ 7375 if (exp->exp_str == sub->exp_str) { 7376 #ifdef DEBUG_DERIV 7377 printf("Atom match -> Empty\n"); 7378 #endif 7379 return(emptyExp); 7380 } 7381 #ifdef DEBUG_DERIV 7382 printf("Atom mismatch -> Forbid\n"); 7383 #endif 7384 return(forbiddenExp); 7385 } 7386 if ((sub->type == XML_EXP_COUNT) && 7387 (sub->exp_max == 1) && 7388 (sub->exp_left->type == XML_EXP_ATOM)) { 7389 /* TODO: handle wildcards */ 7390 if (exp->exp_str == sub->exp_left->exp_str) { 7391 #ifdef DEBUG_DERIV 7392 printf("Atom match -> Empty\n"); 7393 #endif 7394 return(emptyExp); 7395 } 7396 #ifdef DEBUG_DERIV 7397 printf("Atom mismatch -> Forbid\n"); 7398 #endif 7399 return(forbiddenExp); 7400 } 7401 #ifdef DEBUG_DERIV 7402 printf("Compex exp vs Atom -> Forbid\n"); 7403 #endif 7404 return(forbiddenExp); 7405 case XML_EXP_SEQ: 7406 /* try to get the sequence consumed only if possible */ 7407 if (xmlExpCheckCard(exp->exp_left, sub)) { 7408 /* See if the sequence can be consumed directly */ 7409 #ifdef DEBUG_DERIV 7410 printf("Seq trying left only\n"); 7411 #endif 7412 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7413 if ((ret != forbiddenExp) && (ret != NULL)) { 7414 #ifdef DEBUG_DERIV 7415 printf("Seq trying left only worked\n"); 7416 #endif 7417 /* 7418 * TODO: assumption here that we are determinist 7419 * i.e. we won't get to a nillable exp left 7420 * subset which could be matched by the right 7421 * part too. 7422 * e.g.: (a | b)+,(a | c) and 'a+,a' 7423 */ 7424 exp->exp_right->ref++; 7425 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7426 exp->exp_right, NULL, 0, 0)); 7427 } 7428 #ifdef DEBUG_DERIV 7429 } else { 7430 printf("Seq: left too short\n"); 7431 #endif 7432 } 7433 /* Try instead to decompose */ 7434 if (sub->type == XML_EXP_COUNT) { 7435 int min, max; 7436 7437 #ifdef DEBUG_DERIV 7438 printf("Seq: sub is a count\n"); 7439 #endif 7440 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7441 if (ret == NULL) 7442 return(NULL); 7443 if (ret != forbiddenExp) { 7444 #ifdef DEBUG_DERIV 7445 printf("Seq , Count match on left\n"); 7446 #endif 7447 if (sub->exp_max < 0) 7448 max = -1; 7449 else 7450 max = sub->exp_max -1; 7451 if (sub->exp_min > 0) 7452 min = sub->exp_min -1; 7453 else 7454 min = 0; 7455 exp->exp_right->ref++; 7456 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7457 exp->exp_right, NULL, 0, 0); 7458 if (tmp == NULL) 7459 return(NULL); 7460 7461 sub->exp_left->ref++; 7462 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7463 sub->exp_left, NULL, NULL, min, max); 7464 if (tmp2 == NULL) { 7465 xmlExpFree(ctxt, tmp); 7466 return(NULL); 7467 } 7468 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7469 xmlExpFree(ctxt, tmp); 7470 xmlExpFree(ctxt, tmp2); 7471 return(ret); 7472 } 7473 } 7474 /* we made no progress on structured operations */ 7475 break; 7476 case XML_EXP_OR: 7477 #ifdef DEBUG_DERIV 7478 printf("Or , trying both side\n"); 7479 #endif 7480 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7481 if (ret == NULL) 7482 return(NULL); 7483 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub); 7484 if (tmp == NULL) { 7485 xmlExpFree(ctxt, ret); 7486 return(NULL); 7487 } 7488 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0)); 7489 case XML_EXP_COUNT: { 7490 int min, max; 7491 7492 if (sub->type == XML_EXP_COUNT) { 7493 /* 7494 * Try to see if the loop is completely subsumed 7495 */ 7496 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7497 if (tmp == NULL) 7498 return(NULL); 7499 if (tmp == forbiddenExp) { 7500 int mult; 7501 7502 #ifdef DEBUG_DERIV 7503 printf("Count, Count inner don't subsume\n"); 7504 #endif 7505 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left, 7506 NULL, &tmp); 7507 if (mult <= 0) { 7508 #ifdef DEBUG_DERIV 7509 printf("Count, Count not multiple => forbidden\n"); 7510 #endif 7511 return(forbiddenExp); 7512 } 7513 if (sub->exp_max == -1) { 7514 max = -1; 7515 if (exp->exp_max == -1) { 7516 if (exp->exp_min <= sub->exp_min * mult) 7517 min = 0; 7518 else 7519 min = exp->exp_min - sub->exp_min * mult; 7520 } else { 7521 #ifdef DEBUG_DERIV 7522 printf("Count, Count finite can't subsume infinite\n"); 7523 #endif 7524 xmlExpFree(ctxt, tmp); 7525 return(forbiddenExp); 7526 } 7527 } else { 7528 if (exp->exp_max == -1) { 7529 #ifdef DEBUG_DERIV 7530 printf("Infinite loop consume mult finite loop\n"); 7531 #endif 7532 if (exp->exp_min > sub->exp_min * mult) { 7533 max = -1; 7534 min = exp->exp_min - sub->exp_min * mult; 7535 } else { 7536 max = -1; 7537 min = 0; 7538 } 7539 } else { 7540 if (exp->exp_max < sub->exp_max * mult) { 7541 #ifdef DEBUG_DERIV 7542 printf("loops max mult mismatch => forbidden\n"); 7543 #endif 7544 xmlExpFree(ctxt, tmp); 7545 return(forbiddenExp); 7546 } 7547 if (sub->exp_max * mult > exp->exp_min) 7548 min = 0; 7549 else 7550 min = exp->exp_min - sub->exp_max * mult; 7551 max = exp->exp_max - sub->exp_max * mult; 7552 } 7553 } 7554 } else if (!IS_NILLABLE(tmp)) { 7555 /* 7556 * TODO: loop here to try to grow if working on finite 7557 * blocks. 7558 */ 7559 #ifdef DEBUG_DERIV 7560 printf("Count, Count remain not nillable => forbidden\n"); 7561 #endif 7562 xmlExpFree(ctxt, tmp); 7563 return(forbiddenExp); 7564 } else if (sub->exp_max == -1) { 7565 if (exp->exp_max == -1) { 7566 if (exp->exp_min <= sub->exp_min) { 7567 #ifdef DEBUG_DERIV 7568 printf("Infinite loops Okay => COUNT(0,Inf)\n"); 7569 #endif 7570 max = -1; 7571 min = 0; 7572 } else { 7573 #ifdef DEBUG_DERIV 7574 printf("Infinite loops min => Count(X,Inf)\n"); 7575 #endif 7576 max = -1; 7577 min = exp->exp_min - sub->exp_min; 7578 } 7579 } else if (exp->exp_min > sub->exp_min) { 7580 #ifdef DEBUG_DERIV 7581 printf("loops min mismatch 1 => forbidden ???\n"); 7582 #endif 7583 xmlExpFree(ctxt, tmp); 7584 return(forbiddenExp); 7585 } else { 7586 max = -1; 7587 min = 0; 7588 } 7589 } else { 7590 if (exp->exp_max == -1) { 7591 #ifdef DEBUG_DERIV 7592 printf("Infinite loop consume finite loop\n"); 7593 #endif 7594 if (exp->exp_min > sub->exp_min) { 7595 max = -1; 7596 min = exp->exp_min - sub->exp_min; 7597 } else { 7598 max = -1; 7599 min = 0; 7600 } 7601 } else { 7602 if (exp->exp_max < sub->exp_max) { 7603 #ifdef DEBUG_DERIV 7604 printf("loops max mismatch => forbidden\n"); 7605 #endif 7606 xmlExpFree(ctxt, tmp); 7607 return(forbiddenExp); 7608 } 7609 if (sub->exp_max > exp->exp_min) 7610 min = 0; 7611 else 7612 min = exp->exp_min - sub->exp_max; 7613 max = exp->exp_max - sub->exp_max; 7614 } 7615 } 7616 #ifdef DEBUG_DERIV 7617 printf("loops match => SEQ(COUNT())\n"); 7618 #endif 7619 exp->exp_left->ref++; 7620 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7621 NULL, NULL, min, max); 7622 if (tmp2 == NULL) { 7623 return(NULL); 7624 } 7625 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7626 NULL, 0, 0); 7627 return(ret); 7628 } 7629 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7630 if (tmp == NULL) 7631 return(NULL); 7632 if (tmp == forbiddenExp) { 7633 #ifdef DEBUG_DERIV 7634 printf("loop mismatch => forbidden\n"); 7635 #endif 7636 return(forbiddenExp); 7637 } 7638 if (exp->exp_min > 0) 7639 min = exp->exp_min - 1; 7640 else 7641 min = 0; 7642 if (exp->exp_max < 0) 7643 max = -1; 7644 else 7645 max = exp->exp_max - 1; 7646 7647 #ifdef DEBUG_DERIV 7648 printf("loop match => SEQ(COUNT())\n"); 7649 #endif 7650 exp->exp_left->ref++; 7651 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7652 NULL, NULL, min, max); 7653 if (tmp2 == NULL) 7654 return(NULL); 7655 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7656 NULL, 0, 0); 7657 return(ret); 7658 } 7659 } 7660 7661 #ifdef DEBUG_DERIV 7662 printf("Fallback to derivative\n"); 7663 #endif 7664 if (IS_NILLABLE(sub)) { 7665 if (!(IS_NILLABLE(exp))) 7666 return(forbiddenExp); 7667 else 7668 ret = emptyExp; 7669 } else 7670 ret = NULL; 7671 /* 7672 * here the structured derivation made no progress so 7673 * we use the default token based derivation to force one more step 7674 */ 7675 if (ctxt->tabSize == 0) 7676 ctxt->tabSize = 40; 7677 7678 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize * 7679 sizeof(const xmlChar *)); 7680 if (tab == NULL) { 7681 return(NULL); 7682 } 7683 7684 /* 7685 * collect all the strings accepted by the subexpression on input 7686 */ 7687 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7688 while (len < 0) { 7689 const xmlChar **temp; 7690 temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 * 7691 sizeof(const xmlChar *)); 7692 if (temp == NULL) { 7693 xmlFree((xmlChar **) tab); 7694 return(NULL); 7695 } 7696 tab = temp; 7697 ctxt->tabSize *= 2; 7698 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7699 } 7700 for (i = 0;i < len;i++) { 7701 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]); 7702 if ((tmp == NULL) || (tmp == forbiddenExp)) { 7703 xmlExpFree(ctxt, ret); 7704 xmlFree((xmlChar **) tab); 7705 return(tmp); 7706 } 7707 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]); 7708 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) { 7709 xmlExpFree(ctxt, tmp); 7710 xmlExpFree(ctxt, ret); 7711 xmlFree((xmlChar **) tab); 7712 return(tmp); 7713 } 7714 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7715 xmlExpFree(ctxt, tmp); 7716 xmlExpFree(ctxt, tmp2); 7717 7718 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) { 7719 xmlExpFree(ctxt, ret); 7720 xmlFree((xmlChar **) tab); 7721 return(tmp3); 7722 } 7723 7724 if (ret == NULL) 7725 ret = tmp3; 7726 else { 7727 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0); 7728 if (ret == NULL) { 7729 xmlFree((xmlChar **) tab); 7730 return(NULL); 7731 } 7732 } 7733 } 7734 xmlFree((xmlChar **) tab); 7735 return(ret); 7736 } 7737 7738 /** 7739 * xmlExpExpDerive: 7740 * @ctxt: the expressions context 7741 * @exp: the englobing expression 7742 * @sub: the subexpression 7743 * 7744 * Evaluates the expression resulting from @exp consuming a sub expression @sub 7745 * Based on algebraic derivation and sometimes direct Brzozowski derivation 7746 * it usually tatkes less than linear time and can handle expressions generating 7747 * infinite languages. 7748 * 7749 * Returns the resulting expression or NULL in case of internal error, the 7750 * result must be freed 7751 */ 7752 xmlExpNodePtr 7753 xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7754 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7755 return(NULL); 7756 7757 /* 7758 * O(1) speedups 7759 */ 7760 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7761 #ifdef DEBUG_DERIV 7762 printf("Sub nillable and not exp : can't subsume\n"); 7763 #endif 7764 return(forbiddenExp); 7765 } 7766 if (xmlExpCheckCard(exp, sub) == 0) { 7767 #ifdef DEBUG_DERIV 7768 printf("sub generate longuer sequances than exp : can't subsume\n"); 7769 #endif 7770 return(forbiddenExp); 7771 } 7772 return(xmlExpExpDeriveInt(ctxt, exp, sub)); 7773 } 7774 7775 /** 7776 * xmlExpSubsume: 7777 * @ctxt: the expressions context 7778 * @exp: the englobing expression 7779 * @sub: the subexpression 7780 * 7781 * Check whether @exp accepts all the languages accexpted by @sub 7782 * the input being a subexpression. 7783 * 7784 * Returns 1 if true 0 if false and -1 in case of failure. 7785 */ 7786 int 7787 xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7788 xmlExpNodePtr tmp; 7789 7790 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7791 return(-1); 7792 7793 /* 7794 * TODO: speedup by checking the language of sub is a subset of the 7795 * language of exp 7796 */ 7797 /* 7798 * O(1) speedups 7799 */ 7800 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7801 #ifdef DEBUG_DERIV 7802 printf("Sub nillable and not exp : can't subsume\n"); 7803 #endif 7804 return(0); 7805 } 7806 if (xmlExpCheckCard(exp, sub) == 0) { 7807 #ifdef DEBUG_DERIV 7808 printf("sub generate longuer sequances than exp : can't subsume\n"); 7809 #endif 7810 return(0); 7811 } 7812 tmp = xmlExpExpDeriveInt(ctxt, exp, sub); 7813 #ifdef DEBUG_DERIV 7814 printf("Result derivation :\n"); 7815 PRINT_EXP(tmp); 7816 #endif 7817 if (tmp == NULL) 7818 return(-1); 7819 if (tmp == forbiddenExp) 7820 return(0); 7821 if (tmp == emptyExp) 7822 return(1); 7823 if ((tmp != NULL) && (IS_NILLABLE(tmp))) { 7824 xmlExpFree(ctxt, tmp); 7825 return(1); 7826 } 7827 xmlExpFree(ctxt, tmp); 7828 return(0); 7829 } 7830 7831 /************************************************************************ 7832 * * 7833 * Parsing expression * 7834 * * 7835 ************************************************************************/ 7836 7837 static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt); 7838 7839 #undef CUR 7840 #define CUR (*ctxt->cur) 7841 #undef NEXT 7842 #define NEXT ctxt->cur++; 7843 #undef IS_BLANK 7844 #define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t')) 7845 #define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++; 7846 7847 static int 7848 xmlExpParseNumber(xmlExpCtxtPtr ctxt) { 7849 int ret = 0; 7850 7851 SKIP_BLANKS 7852 if (CUR == '*') { 7853 NEXT 7854 return(-1); 7855 } 7856 if ((CUR < '0') || (CUR > '9')) 7857 return(-1); 7858 while ((CUR >= '0') && (CUR <= '9')) { 7859 ret = ret * 10 + (CUR - '0'); 7860 NEXT 7861 } 7862 return(ret); 7863 } 7864 7865 static xmlExpNodePtr 7866 xmlExpParseOr(xmlExpCtxtPtr ctxt) { 7867 const char *base; 7868 xmlExpNodePtr ret; 7869 const xmlChar *val; 7870 7871 SKIP_BLANKS 7872 base = ctxt->cur; 7873 if (*ctxt->cur == '(') { 7874 NEXT 7875 ret = xmlExpParseExpr(ctxt); 7876 SKIP_BLANKS 7877 if (*ctxt->cur != ')') { 7878 fprintf(stderr, "unbalanced '(' : %s\n", base); 7879 xmlExpFree(ctxt, ret); 7880 return(NULL); 7881 } 7882 NEXT; 7883 SKIP_BLANKS 7884 goto parse_quantifier; 7885 } 7886 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') && 7887 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') && 7888 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}')) 7889 NEXT; 7890 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base); 7891 if (val == NULL) 7892 return(NULL); 7893 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0); 7894 if (ret == NULL) 7895 return(NULL); 7896 SKIP_BLANKS 7897 parse_quantifier: 7898 if (CUR == '{') { 7899 int min, max; 7900 7901 NEXT 7902 min = xmlExpParseNumber(ctxt); 7903 if (min < 0) { 7904 xmlExpFree(ctxt, ret); 7905 return(NULL); 7906 } 7907 SKIP_BLANKS 7908 if (CUR == ',') { 7909 NEXT 7910 max = xmlExpParseNumber(ctxt); 7911 SKIP_BLANKS 7912 } else 7913 max = min; 7914 if (CUR != '}') { 7915 xmlExpFree(ctxt, ret); 7916 return(NULL); 7917 } 7918 NEXT 7919 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7920 min, max); 7921 SKIP_BLANKS 7922 } else if (CUR == '?') { 7923 NEXT 7924 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7925 0, 1); 7926 SKIP_BLANKS 7927 } else if (CUR == '+') { 7928 NEXT 7929 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7930 1, -1); 7931 SKIP_BLANKS 7932 } else if (CUR == '*') { 7933 NEXT 7934 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7935 0, -1); 7936 SKIP_BLANKS 7937 } 7938 return(ret); 7939 } 7940 7941 7942 static xmlExpNodePtr 7943 xmlExpParseSeq(xmlExpCtxtPtr ctxt) { 7944 xmlExpNodePtr ret, right; 7945 7946 ret = xmlExpParseOr(ctxt); 7947 SKIP_BLANKS 7948 while (CUR == '|') { 7949 NEXT 7950 right = xmlExpParseOr(ctxt); 7951 if (right == NULL) { 7952 xmlExpFree(ctxt, ret); 7953 return(NULL); 7954 } 7955 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0); 7956 if (ret == NULL) 7957 return(NULL); 7958 } 7959 return(ret); 7960 } 7961 7962 static xmlExpNodePtr 7963 xmlExpParseExpr(xmlExpCtxtPtr ctxt) { 7964 xmlExpNodePtr ret, right; 7965 7966 ret = xmlExpParseSeq(ctxt); 7967 SKIP_BLANKS 7968 while (CUR == ',') { 7969 NEXT 7970 right = xmlExpParseSeq(ctxt); 7971 if (right == NULL) { 7972 xmlExpFree(ctxt, ret); 7973 return(NULL); 7974 } 7975 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0); 7976 if (ret == NULL) 7977 return(NULL); 7978 } 7979 return(ret); 7980 } 7981 7982 /** 7983 * xmlExpParse: 7984 * @ctxt: the expressions context 7985 * @expr: the 0 terminated string 7986 * 7987 * Minimal parser for regexps, it understand the following constructs 7988 * - string terminals 7989 * - choice operator | 7990 * - sequence operator , 7991 * - subexpressions (...) 7992 * - usual cardinality operators + * and ? 7993 * - finite sequences { min, max } 7994 * - infinite sequences { min, * } 7995 * There is minimal checkings made especially no checking on strings values 7996 * 7997 * Returns a new expression or NULL in case of failure 7998 */ 7999 xmlExpNodePtr 8000 xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) { 8001 xmlExpNodePtr ret; 8002 8003 ctxt->expr = expr; 8004 ctxt->cur = expr; 8005 8006 ret = xmlExpParseExpr(ctxt); 8007 SKIP_BLANKS 8008 if (*ctxt->cur != 0) { 8009 xmlExpFree(ctxt, ret); 8010 return(NULL); 8011 } 8012 return(ret); 8013 } 8014 8015 static void 8016 xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) { 8017 xmlExpNodePtr c; 8018 8019 if (expr == NULL) return; 8020 if (glob) xmlBufferWriteChar(buf, "("); 8021 switch (expr->type) { 8022 case XML_EXP_EMPTY: 8023 xmlBufferWriteChar(buf, "empty"); 8024 break; 8025 case XML_EXP_FORBID: 8026 xmlBufferWriteChar(buf, "forbidden"); 8027 break; 8028 case XML_EXP_ATOM: 8029 xmlBufferWriteCHAR(buf, expr->exp_str); 8030 break; 8031 case XML_EXP_SEQ: 8032 c = expr->exp_left; 8033 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8034 xmlExpDumpInt(buf, c, 1); 8035 else 8036 xmlExpDumpInt(buf, c, 0); 8037 xmlBufferWriteChar(buf, " , "); 8038 c = expr->exp_right; 8039 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8040 xmlExpDumpInt(buf, c, 1); 8041 else 8042 xmlExpDumpInt(buf, c, 0); 8043 break; 8044 case XML_EXP_OR: 8045 c = expr->exp_left; 8046 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8047 xmlExpDumpInt(buf, c, 1); 8048 else 8049 xmlExpDumpInt(buf, c, 0); 8050 xmlBufferWriteChar(buf, " | "); 8051 c = expr->exp_right; 8052 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8053 xmlExpDumpInt(buf, c, 1); 8054 else 8055 xmlExpDumpInt(buf, c, 0); 8056 break; 8057 case XML_EXP_COUNT: { 8058 char rep[40]; 8059 8060 c = expr->exp_left; 8061 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8062 xmlExpDumpInt(buf, c, 1); 8063 else 8064 xmlExpDumpInt(buf, c, 0); 8065 if ((expr->exp_min == 0) && (expr->exp_max == 1)) { 8066 rep[0] = '?'; 8067 rep[1] = 0; 8068 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) { 8069 rep[0] = '*'; 8070 rep[1] = 0; 8071 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) { 8072 rep[0] = '+'; 8073 rep[1] = 0; 8074 } else if (expr->exp_max == expr->exp_min) { 8075 snprintf(rep, 39, "{%d}", expr->exp_min); 8076 } else if (expr->exp_max < 0) { 8077 snprintf(rep, 39, "{%d,inf}", expr->exp_min); 8078 } else { 8079 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max); 8080 } 8081 rep[39] = 0; 8082 xmlBufferWriteChar(buf, rep); 8083 break; 8084 } 8085 default: 8086 fprintf(stderr, "Error in tree\n"); 8087 } 8088 if (glob) 8089 xmlBufferWriteChar(buf, ")"); 8090 } 8091 /** 8092 * xmlExpDump: 8093 * @buf: a buffer to receive the output 8094 * @expr: the compiled expression 8095 * 8096 * Serialize the expression as compiled to the buffer 8097 */ 8098 void 8099 xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) { 8100 if ((buf == NULL) || (expr == NULL)) 8101 return; 8102 xmlExpDumpInt(buf, expr, 0); 8103 } 8104 8105 /** 8106 * xmlExpMaxToken: 8107 * @expr: a compiled expression 8108 * 8109 * Indicate the maximum number of input a expression can accept 8110 * 8111 * Returns the maximum length or -1 in case of error 8112 */ 8113 int 8114 xmlExpMaxToken(xmlExpNodePtr expr) { 8115 if (expr == NULL) 8116 return(-1); 8117 return(expr->c_max); 8118 } 8119 8120 /** 8121 * xmlExpCtxtNbNodes: 8122 * @ctxt: an expression context 8123 * 8124 * Debugging facility provides the number of allocated nodes at a that point 8125 * 8126 * Returns the number of nodes in use or -1 in case of error 8127 */ 8128 int 8129 xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) { 8130 if (ctxt == NULL) 8131 return(-1); 8132 return(ctxt->nb_nodes); 8133 } 8134 8135 /** 8136 * xmlExpCtxtNbCons: 8137 * @ctxt: an expression context 8138 * 8139 * Debugging facility provides the number of allocated nodes over lifetime 8140 * 8141 * Returns the number of nodes ever allocated or -1 in case of error 8142 */ 8143 int 8144 xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) { 8145 if (ctxt == NULL) 8146 return(-1); 8147 return(ctxt->nb_cons); 8148 } 8149 8150 #endif /* LIBXML_EXPR_ENABLED */ 8151 #define bottom_xmlregexp 8152 #include "elfgcchack.h" 8153 #endif /* LIBXML_REGEXP_ENABLED */ 8154