1 /* 2 * regexp.c: generic and extensible Regular Expression engine 3 * 4 * Basically designed with the purpose of compiling regexps for 5 * the variety of validation/shemas mechanisms now available in 6 * XML related specifications these include: 7 * - XML-1.0 DTD validation 8 * - XML Schemas structure part 1 9 * - XML Schemas Datatypes part 2 especially Appendix F 10 * - RELAX-NG/TREX i.e. the counter proposal 11 * 12 * See Copyright for the status of this software. 13 * 14 * Daniel Veillard <veillard (at) redhat.com> 15 */ 16 17 #define IN_LIBXML 18 #include "libxml.h" 19 20 #ifdef LIBXML_REGEXP_ENABLED 21 22 /* #define DEBUG_ERR */ 23 24 #include <stdio.h> 25 #include <string.h> 26 #ifdef HAVE_LIMITS_H 27 #include <limits.h> 28 #endif 29 30 #include <libxml/tree.h> 31 #include <libxml/parserInternals.h> 32 #include <libxml/xmlregexp.h> 33 #include <libxml/xmlautomata.h> 34 #include <libxml/xmlunicode.h> 35 36 #ifndef INT_MAX 37 #define INT_MAX 123456789 /* easy to flag and big enough for our needs */ 38 #endif 39 40 /* #define DEBUG_REGEXP_GRAPH */ 41 /* #define DEBUG_REGEXP_EXEC */ 42 /* #define DEBUG_PUSH */ 43 /* #define DEBUG_COMPACTION */ 44 45 #define MAX_PUSH 10000000 46 47 #define ERROR(str) \ 48 ctxt->error = XML_REGEXP_COMPILE_ERROR; \ 49 xmlRegexpErrCompile(ctxt, str); 50 #define NEXT ctxt->cur++ 51 #define CUR (*(ctxt->cur)) 52 #define NXT(index) (ctxt->cur[index]) 53 54 #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l) 55 #define NEXTL(l) ctxt->cur += l; 56 #define XML_REG_STRING_SEPARATOR '|' 57 /* 58 * Need PREV to check on a '-' within a Character Group. May only be used 59 * when it's guaranteed that cur is not at the beginning of ctxt->string! 60 */ 61 #define PREV (ctxt->cur[-1]) 62 63 /** 64 * TODO: 65 * 66 * macro to flag unimplemented blocks 67 */ 68 #define TODO \ 69 xmlGenericError(xmlGenericErrorContext, \ 70 "Unimplemented block at %s:%d\n", \ 71 __FILE__, __LINE__); 72 73 /************************************************************************ 74 * * 75 * Datatypes and structures * 76 * * 77 ************************************************************************/ 78 79 /* 80 * Note: the order of the enums below is significant, do not shuffle 81 */ 82 typedef enum { 83 XML_REGEXP_EPSILON = 1, 84 XML_REGEXP_CHARVAL, 85 XML_REGEXP_RANGES, 86 XML_REGEXP_SUBREG, /* used for () sub regexps */ 87 XML_REGEXP_STRING, 88 XML_REGEXP_ANYCHAR, /* . */ 89 XML_REGEXP_ANYSPACE, /* \s */ 90 XML_REGEXP_NOTSPACE, /* \S */ 91 XML_REGEXP_INITNAME, /* \l */ 92 XML_REGEXP_NOTINITNAME, /* \L */ 93 XML_REGEXP_NAMECHAR, /* \c */ 94 XML_REGEXP_NOTNAMECHAR, /* \C */ 95 XML_REGEXP_DECIMAL, /* \d */ 96 XML_REGEXP_NOTDECIMAL, /* \D */ 97 XML_REGEXP_REALCHAR, /* \w */ 98 XML_REGEXP_NOTREALCHAR, /* \W */ 99 XML_REGEXP_LETTER = 100, 100 XML_REGEXP_LETTER_UPPERCASE, 101 XML_REGEXP_LETTER_LOWERCASE, 102 XML_REGEXP_LETTER_TITLECASE, 103 XML_REGEXP_LETTER_MODIFIER, 104 XML_REGEXP_LETTER_OTHERS, 105 XML_REGEXP_MARK, 106 XML_REGEXP_MARK_NONSPACING, 107 XML_REGEXP_MARK_SPACECOMBINING, 108 XML_REGEXP_MARK_ENCLOSING, 109 XML_REGEXP_NUMBER, 110 XML_REGEXP_NUMBER_DECIMAL, 111 XML_REGEXP_NUMBER_LETTER, 112 XML_REGEXP_NUMBER_OTHERS, 113 XML_REGEXP_PUNCT, 114 XML_REGEXP_PUNCT_CONNECTOR, 115 XML_REGEXP_PUNCT_DASH, 116 XML_REGEXP_PUNCT_OPEN, 117 XML_REGEXP_PUNCT_CLOSE, 118 XML_REGEXP_PUNCT_INITQUOTE, 119 XML_REGEXP_PUNCT_FINQUOTE, 120 XML_REGEXP_PUNCT_OTHERS, 121 XML_REGEXP_SEPAR, 122 XML_REGEXP_SEPAR_SPACE, 123 XML_REGEXP_SEPAR_LINE, 124 XML_REGEXP_SEPAR_PARA, 125 XML_REGEXP_SYMBOL, 126 XML_REGEXP_SYMBOL_MATH, 127 XML_REGEXP_SYMBOL_CURRENCY, 128 XML_REGEXP_SYMBOL_MODIFIER, 129 XML_REGEXP_SYMBOL_OTHERS, 130 XML_REGEXP_OTHER, 131 XML_REGEXP_OTHER_CONTROL, 132 XML_REGEXP_OTHER_FORMAT, 133 XML_REGEXP_OTHER_PRIVATE, 134 XML_REGEXP_OTHER_NA, 135 XML_REGEXP_BLOCK_NAME 136 } xmlRegAtomType; 137 138 typedef enum { 139 XML_REGEXP_QUANT_EPSILON = 1, 140 XML_REGEXP_QUANT_ONCE, 141 XML_REGEXP_QUANT_OPT, 142 XML_REGEXP_QUANT_MULT, 143 XML_REGEXP_QUANT_PLUS, 144 XML_REGEXP_QUANT_ONCEONLY, 145 XML_REGEXP_QUANT_ALL, 146 XML_REGEXP_QUANT_RANGE 147 } xmlRegQuantType; 148 149 typedef enum { 150 XML_REGEXP_START_STATE = 1, 151 XML_REGEXP_FINAL_STATE, 152 XML_REGEXP_TRANS_STATE, 153 XML_REGEXP_SINK_STATE, 154 XML_REGEXP_UNREACH_STATE 155 } xmlRegStateType; 156 157 typedef enum { 158 XML_REGEXP_MARK_NORMAL = 0, 159 XML_REGEXP_MARK_START, 160 XML_REGEXP_MARK_VISITED 161 } xmlRegMarkedType; 162 163 typedef struct _xmlRegRange xmlRegRange; 164 typedef xmlRegRange *xmlRegRangePtr; 165 166 struct _xmlRegRange { 167 int neg; /* 0 normal, 1 not, 2 exclude */ 168 xmlRegAtomType type; 169 int start; 170 int end; 171 xmlChar *blockName; 172 }; 173 174 typedef struct _xmlRegAtom xmlRegAtom; 175 typedef xmlRegAtom *xmlRegAtomPtr; 176 177 typedef struct _xmlAutomataState xmlRegState; 178 typedef xmlRegState *xmlRegStatePtr; 179 180 struct _xmlRegAtom { 181 int no; 182 xmlRegAtomType type; 183 xmlRegQuantType quant; 184 int min; 185 int max; 186 187 void *valuep; 188 void *valuep2; 189 int neg; 190 int codepoint; 191 xmlRegStatePtr start; 192 xmlRegStatePtr start0; 193 xmlRegStatePtr stop; 194 int maxRanges; 195 int nbRanges; 196 xmlRegRangePtr *ranges; 197 void *data; 198 }; 199 200 typedef struct _xmlRegCounter xmlRegCounter; 201 typedef xmlRegCounter *xmlRegCounterPtr; 202 203 struct _xmlRegCounter { 204 int min; 205 int max; 206 }; 207 208 typedef struct _xmlRegTrans xmlRegTrans; 209 typedef xmlRegTrans *xmlRegTransPtr; 210 211 struct _xmlRegTrans { 212 xmlRegAtomPtr atom; 213 int to; 214 int counter; 215 int count; 216 int nd; 217 }; 218 219 struct _xmlAutomataState { 220 xmlRegStateType type; 221 xmlRegMarkedType mark; 222 xmlRegMarkedType reached; 223 int no; 224 int maxTrans; 225 int nbTrans; 226 xmlRegTrans *trans; 227 /* knowing states ponting to us can speed things up */ 228 int maxTransTo; 229 int nbTransTo; 230 int *transTo; 231 }; 232 233 typedef struct _xmlAutomata xmlRegParserCtxt; 234 typedef xmlRegParserCtxt *xmlRegParserCtxtPtr; 235 236 struct _xmlAutomata { 237 xmlChar *string; 238 xmlChar *cur; 239 240 int error; 241 int neg; 242 243 xmlRegStatePtr start; 244 xmlRegStatePtr end; 245 xmlRegStatePtr state; 246 247 xmlRegAtomPtr atom; 248 249 int maxAtoms; 250 int nbAtoms; 251 xmlRegAtomPtr *atoms; 252 253 int maxStates; 254 int nbStates; 255 xmlRegStatePtr *states; 256 257 int maxCounters; 258 int nbCounters; 259 xmlRegCounter *counters; 260 261 int determinist; 262 int negs; 263 }; 264 265 struct _xmlRegexp { 266 xmlChar *string; 267 int nbStates; 268 xmlRegStatePtr *states; 269 int nbAtoms; 270 xmlRegAtomPtr *atoms; 271 int nbCounters; 272 xmlRegCounter *counters; 273 int determinist; 274 /* 275 * That's the compact form for determinists automatas 276 */ 277 int nbstates; 278 int *compact; 279 void **transdata; 280 int nbstrings; 281 xmlChar **stringMap; 282 }; 283 284 typedef struct _xmlRegExecRollback xmlRegExecRollback; 285 typedef xmlRegExecRollback *xmlRegExecRollbackPtr; 286 287 struct _xmlRegExecRollback { 288 xmlRegStatePtr state;/* the current state */ 289 int index; /* the index in the input stack */ 290 int nextbranch; /* the next transition to explore in that state */ 291 int *counts; /* save the automata state if it has some */ 292 }; 293 294 typedef struct _xmlRegInputToken xmlRegInputToken; 295 typedef xmlRegInputToken *xmlRegInputTokenPtr; 296 297 struct _xmlRegInputToken { 298 xmlChar *value; 299 void *data; 300 }; 301 302 struct _xmlRegExecCtxt { 303 int status; /* execution status != 0 indicate an error */ 304 int determinist; /* did we find an indeterministic behaviour */ 305 xmlRegexpPtr comp; /* the compiled regexp */ 306 xmlRegExecCallbacks callback; 307 void *data; 308 309 xmlRegStatePtr state;/* the current state */ 310 int transno; /* the current transition on that state */ 311 int transcount; /* the number of chars in char counted transitions */ 312 313 /* 314 * A stack of rollback states 315 */ 316 int maxRollbacks; 317 int nbRollbacks; 318 xmlRegExecRollback *rollbacks; 319 320 /* 321 * The state of the automata if any 322 */ 323 int *counts; 324 325 /* 326 * The input stack 327 */ 328 int inputStackMax; 329 int inputStackNr; 330 int index; 331 int *charStack; 332 const xmlChar *inputString; /* when operating on characters */ 333 xmlRegInputTokenPtr inputStack;/* when operating on strings */ 334 335 /* 336 * error handling 337 */ 338 int errStateNo; /* the error state number */ 339 xmlRegStatePtr errState; /* the error state */ 340 xmlChar *errString; /* the string raising the error */ 341 int *errCounts; /* counters at the error state */ 342 int nbPush; 343 }; 344 345 #define REGEXP_ALL_COUNTER 0x123456 346 #define REGEXP_ALL_LAX_COUNTER 0x123457 347 348 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); 349 static void xmlRegFreeState(xmlRegStatePtr state); 350 static void xmlRegFreeAtom(xmlRegAtomPtr atom); 351 static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr); 352 static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint); 353 static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, 354 int neg, int start, int end, const xmlChar *blockName); 355 356 /************************************************************************ 357 * * 358 * Regexp memory error handler * 359 * * 360 ************************************************************************/ 361 /** 362 * xmlRegexpErrMemory: 363 * @extra: extra information 364 * 365 * Handle an out of memory condition 366 */ 367 static void 368 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra) 369 { 370 const char *regexp = NULL; 371 if (ctxt != NULL) { 372 regexp = (const char *) ctxt->string; 373 ctxt->error = XML_ERR_NO_MEMORY; 374 } 375 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 376 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 377 regexp, NULL, 0, 0, 378 "Memory allocation failed : %s\n", extra); 379 } 380 381 /** 382 * xmlRegexpErrCompile: 383 * @extra: extra information 384 * 385 * Handle a compilation failure 386 */ 387 static void 388 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra) 389 { 390 const char *regexp = NULL; 391 int idx = 0; 392 393 if (ctxt != NULL) { 394 regexp = (const char *) ctxt->string; 395 idx = ctxt->cur - ctxt->string; 396 ctxt->error = XML_REGEXP_COMPILE_ERROR; 397 } 398 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 399 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra, 400 regexp, NULL, idx, 0, 401 "failed to compile: %s\n", extra); 402 } 403 404 /************************************************************************ 405 * * 406 * Allocation/Deallocation * 407 * * 408 ************************************************************************/ 409 410 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt); 411 /** 412 * xmlRegEpxFromParse: 413 * @ctxt: the parser context used to build it 414 * 415 * Allocate a new regexp and fill it with the result from the parser 416 * 417 * Returns the new regexp or NULL in case of error 418 */ 419 static xmlRegexpPtr 420 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { 421 xmlRegexpPtr ret; 422 423 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp)); 424 if (ret == NULL) { 425 xmlRegexpErrMemory(ctxt, "compiling regexp"); 426 return(NULL); 427 } 428 memset(ret, 0, sizeof(xmlRegexp)); 429 ret->string = ctxt->string; 430 ret->nbStates = ctxt->nbStates; 431 ret->states = ctxt->states; 432 ret->nbAtoms = ctxt->nbAtoms; 433 ret->atoms = ctxt->atoms; 434 ret->nbCounters = ctxt->nbCounters; 435 ret->counters = ctxt->counters; 436 ret->determinist = ctxt->determinist; 437 if (ret->determinist == -1) { 438 xmlRegexpIsDeterminist(ret); 439 } 440 441 if ((ret->determinist != 0) && 442 (ret->nbCounters == 0) && 443 (ctxt->negs == 0) && 444 (ret->atoms != NULL) && 445 (ret->atoms[0] != NULL) && 446 (ret->atoms[0]->type == XML_REGEXP_STRING)) { 447 int i, j, nbstates = 0, nbatoms = 0; 448 int *stateRemap; 449 int *stringRemap; 450 int *transitions; 451 void **transdata; 452 xmlChar **stringMap; 453 xmlChar *value; 454 455 /* 456 * Switch to a compact representation 457 * 1/ counting the effective number of states left 458 * 2/ counting the unique number of atoms, and check that 459 * they are all of the string type 460 * 3/ build a table state x atom for the transitions 461 */ 462 463 stateRemap = xmlMalloc(ret->nbStates * sizeof(int)); 464 if (stateRemap == NULL) { 465 xmlRegexpErrMemory(ctxt, "compiling regexp"); 466 xmlFree(ret); 467 return(NULL); 468 } 469 for (i = 0;i < ret->nbStates;i++) { 470 if (ret->states[i] != NULL) { 471 stateRemap[i] = nbstates; 472 nbstates++; 473 } else { 474 stateRemap[i] = -1; 475 } 476 } 477 #ifdef DEBUG_COMPACTION 478 printf("Final: %d states\n", nbstates); 479 #endif 480 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *)); 481 if (stringMap == NULL) { 482 xmlRegexpErrMemory(ctxt, "compiling regexp"); 483 xmlFree(stateRemap); 484 xmlFree(ret); 485 return(NULL); 486 } 487 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int)); 488 if (stringRemap == NULL) { 489 xmlRegexpErrMemory(ctxt, "compiling regexp"); 490 xmlFree(stringMap); 491 xmlFree(stateRemap); 492 xmlFree(ret); 493 return(NULL); 494 } 495 for (i = 0;i < ret->nbAtoms;i++) { 496 if ((ret->atoms[i]->type == XML_REGEXP_STRING) && 497 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) { 498 value = ret->atoms[i]->valuep; 499 for (j = 0;j < nbatoms;j++) { 500 if (xmlStrEqual(stringMap[j], value)) { 501 stringRemap[i] = j; 502 break; 503 } 504 } 505 if (j >= nbatoms) { 506 stringRemap[i] = nbatoms; 507 stringMap[nbatoms] = xmlStrdup(value); 508 if (stringMap[nbatoms] == NULL) { 509 for (i = 0;i < nbatoms;i++) 510 xmlFree(stringMap[i]); 511 xmlFree(stringRemap); 512 xmlFree(stringMap); 513 xmlFree(stateRemap); 514 xmlFree(ret); 515 return(NULL); 516 } 517 nbatoms++; 518 } 519 } else { 520 xmlFree(stateRemap); 521 xmlFree(stringRemap); 522 for (i = 0;i < nbatoms;i++) 523 xmlFree(stringMap[i]); 524 xmlFree(stringMap); 525 xmlFree(ret); 526 return(NULL); 527 } 528 } 529 #ifdef DEBUG_COMPACTION 530 printf("Final: %d atoms\n", nbatoms); 531 #endif 532 transitions = (int *) xmlMalloc((nbstates + 1) * 533 (nbatoms + 1) * sizeof(int)); 534 if (transitions == NULL) { 535 xmlFree(stateRemap); 536 xmlFree(stringRemap); 537 xmlFree(stringMap); 538 xmlFree(ret); 539 return(NULL); 540 } 541 memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int)); 542 543 /* 544 * Allocate the transition table. The first entry for each 545 * state corresponds to the state type. 546 */ 547 transdata = NULL; 548 549 for (i = 0;i < ret->nbStates;i++) { 550 int stateno, atomno, targetno, prev; 551 xmlRegStatePtr state; 552 xmlRegTransPtr trans; 553 554 stateno = stateRemap[i]; 555 if (stateno == -1) 556 continue; 557 state = ret->states[i]; 558 559 transitions[stateno * (nbatoms + 1)] = state->type; 560 561 for (j = 0;j < state->nbTrans;j++) { 562 trans = &(state->trans[j]); 563 if ((trans->to == -1) || (trans->atom == NULL)) 564 continue; 565 atomno = stringRemap[trans->atom->no]; 566 if ((trans->atom->data != NULL) && (transdata == NULL)) { 567 transdata = (void **) xmlMalloc(nbstates * nbatoms * 568 sizeof(void *)); 569 if (transdata != NULL) 570 memset(transdata, 0, 571 nbstates * nbatoms * sizeof(void *)); 572 else { 573 xmlRegexpErrMemory(ctxt, "compiling regexp"); 574 break; 575 } 576 } 577 targetno = stateRemap[trans->to]; 578 /* 579 * if the same atom can generate transitions to 2 different 580 * states then it means the automata is not determinist and 581 * the compact form can't be used ! 582 */ 583 prev = transitions[stateno * (nbatoms + 1) + atomno + 1]; 584 if (prev != 0) { 585 if (prev != targetno + 1) { 586 ret->determinist = 0; 587 #ifdef DEBUG_COMPACTION 588 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n", 589 i, j, trans->atom->no, trans->to, atomno, targetno); 590 printf(" previous to is %d\n", prev); 591 #endif 592 if (transdata != NULL) 593 xmlFree(transdata); 594 xmlFree(transitions); 595 xmlFree(stateRemap); 596 xmlFree(stringRemap); 597 for (i = 0;i < nbatoms;i++) 598 xmlFree(stringMap[i]); 599 xmlFree(stringMap); 600 goto not_determ; 601 } 602 } else { 603 #if 0 604 printf("State %d trans %d: atom %d to %d : %d to %d\n", 605 i, j, trans->atom->no, trans->to, atomno, targetno); 606 #endif 607 transitions[stateno * (nbatoms + 1) + atomno + 1] = 608 targetno + 1; /* to avoid 0 */ 609 if (transdata != NULL) 610 transdata[stateno * nbatoms + atomno] = 611 trans->atom->data; 612 } 613 } 614 } 615 ret->determinist = 1; 616 #ifdef DEBUG_COMPACTION 617 /* 618 * Debug 619 */ 620 for (i = 0;i < nbstates;i++) { 621 for (j = 0;j < nbatoms + 1;j++) { 622 printf("%02d ", transitions[i * (nbatoms + 1) + j]); 623 } 624 printf("\n"); 625 } 626 printf("\n"); 627 #endif 628 /* 629 * Cleanup of the old data 630 */ 631 if (ret->states != NULL) { 632 for (i = 0;i < ret->nbStates;i++) 633 xmlRegFreeState(ret->states[i]); 634 xmlFree(ret->states); 635 } 636 ret->states = NULL; 637 ret->nbStates = 0; 638 if (ret->atoms != NULL) { 639 for (i = 0;i < ret->nbAtoms;i++) 640 xmlRegFreeAtom(ret->atoms[i]); 641 xmlFree(ret->atoms); 642 } 643 ret->atoms = NULL; 644 ret->nbAtoms = 0; 645 646 ret->compact = transitions; 647 ret->transdata = transdata; 648 ret->stringMap = stringMap; 649 ret->nbstrings = nbatoms; 650 ret->nbstates = nbstates; 651 xmlFree(stateRemap); 652 xmlFree(stringRemap); 653 } 654 not_determ: 655 ctxt->string = NULL; 656 ctxt->nbStates = 0; 657 ctxt->states = NULL; 658 ctxt->nbAtoms = 0; 659 ctxt->atoms = NULL; 660 ctxt->nbCounters = 0; 661 ctxt->counters = NULL; 662 return(ret); 663 } 664 665 /** 666 * xmlRegNewParserCtxt: 667 * @string: the string to parse 668 * 669 * Allocate a new regexp parser context 670 * 671 * Returns the new context or NULL in case of error 672 */ 673 static xmlRegParserCtxtPtr 674 xmlRegNewParserCtxt(const xmlChar *string) { 675 xmlRegParserCtxtPtr ret; 676 677 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt)); 678 if (ret == NULL) 679 return(NULL); 680 memset(ret, 0, sizeof(xmlRegParserCtxt)); 681 if (string != NULL) 682 ret->string = xmlStrdup(string); 683 ret->cur = ret->string; 684 ret->neg = 0; 685 ret->negs = 0; 686 ret->error = 0; 687 ret->determinist = -1; 688 return(ret); 689 } 690 691 /** 692 * xmlRegNewRange: 693 * @ctxt: the regexp parser context 694 * @neg: is that negative 695 * @type: the type of range 696 * @start: the start codepoint 697 * @end: the end codepoint 698 * 699 * Allocate a new regexp range 700 * 701 * Returns the new range or NULL in case of error 702 */ 703 static xmlRegRangePtr 704 xmlRegNewRange(xmlRegParserCtxtPtr ctxt, 705 int neg, xmlRegAtomType type, int start, int end) { 706 xmlRegRangePtr ret; 707 708 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange)); 709 if (ret == NULL) { 710 xmlRegexpErrMemory(ctxt, "allocating range"); 711 return(NULL); 712 } 713 ret->neg = neg; 714 ret->type = type; 715 ret->start = start; 716 ret->end = end; 717 return(ret); 718 } 719 720 /** 721 * xmlRegFreeRange: 722 * @range: the regexp range 723 * 724 * Free a regexp range 725 */ 726 static void 727 xmlRegFreeRange(xmlRegRangePtr range) { 728 if (range == NULL) 729 return; 730 731 if (range->blockName != NULL) 732 xmlFree(range->blockName); 733 xmlFree(range); 734 } 735 736 /** 737 * xmlRegCopyRange: 738 * @range: the regexp range 739 * 740 * Copy a regexp range 741 * 742 * Returns the new copy or NULL in case of error. 743 */ 744 static xmlRegRangePtr 745 xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) { 746 xmlRegRangePtr ret; 747 748 if (range == NULL) 749 return(NULL); 750 751 ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start, 752 range->end); 753 if (ret == NULL) 754 return(NULL); 755 if (range->blockName != NULL) { 756 ret->blockName = xmlStrdup(range->blockName); 757 if (ret->blockName == NULL) { 758 xmlRegexpErrMemory(ctxt, "allocating range"); 759 xmlRegFreeRange(ret); 760 return(NULL); 761 } 762 } 763 return(ret); 764 } 765 766 /** 767 * xmlRegNewAtom: 768 * @ctxt: the regexp parser context 769 * @type: the type of atom 770 * 771 * Allocate a new atom 772 * 773 * Returns the new atom or NULL in case of error 774 */ 775 static xmlRegAtomPtr 776 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) { 777 xmlRegAtomPtr ret; 778 779 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 780 if (ret == NULL) { 781 xmlRegexpErrMemory(ctxt, "allocating atom"); 782 return(NULL); 783 } 784 memset(ret, 0, sizeof(xmlRegAtom)); 785 ret->type = type; 786 ret->quant = XML_REGEXP_QUANT_ONCE; 787 ret->min = 0; 788 ret->max = 0; 789 return(ret); 790 } 791 792 /** 793 * xmlRegFreeAtom: 794 * @atom: the regexp atom 795 * 796 * Free a regexp atom 797 */ 798 static void 799 xmlRegFreeAtom(xmlRegAtomPtr atom) { 800 int i; 801 802 if (atom == NULL) 803 return; 804 805 for (i = 0;i < atom->nbRanges;i++) 806 xmlRegFreeRange(atom->ranges[i]); 807 if (atom->ranges != NULL) 808 xmlFree(atom->ranges); 809 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL)) 810 xmlFree(atom->valuep); 811 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL)) 812 xmlFree(atom->valuep2); 813 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL)) 814 xmlFree(atom->valuep); 815 xmlFree(atom); 816 } 817 818 /** 819 * xmlRegCopyAtom: 820 * @ctxt: the regexp parser context 821 * @atom: the oiginal atom 822 * 823 * Allocate a new regexp range 824 * 825 * Returns the new atom or NULL in case of error 826 */ 827 static xmlRegAtomPtr 828 xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 829 xmlRegAtomPtr ret; 830 831 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 832 if (ret == NULL) { 833 xmlRegexpErrMemory(ctxt, "copying atom"); 834 return(NULL); 835 } 836 memset(ret, 0, sizeof(xmlRegAtom)); 837 ret->type = atom->type; 838 ret->quant = atom->quant; 839 ret->min = atom->min; 840 ret->max = atom->max; 841 if (atom->nbRanges > 0) { 842 int i; 843 844 ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) * 845 atom->nbRanges); 846 if (ret->ranges == NULL) { 847 xmlRegexpErrMemory(ctxt, "copying atom"); 848 goto error; 849 } 850 for (i = 0;i < atom->nbRanges;i++) { 851 ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]); 852 if (ret->ranges[i] == NULL) 853 goto error; 854 ret->nbRanges = i + 1; 855 } 856 } 857 return(ret); 858 859 error: 860 xmlRegFreeAtom(ret); 861 return(NULL); 862 } 863 864 static xmlRegStatePtr 865 xmlRegNewState(xmlRegParserCtxtPtr ctxt) { 866 xmlRegStatePtr ret; 867 868 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState)); 869 if (ret == NULL) { 870 xmlRegexpErrMemory(ctxt, "allocating state"); 871 return(NULL); 872 } 873 memset(ret, 0, sizeof(xmlRegState)); 874 ret->type = XML_REGEXP_TRANS_STATE; 875 ret->mark = XML_REGEXP_MARK_NORMAL; 876 return(ret); 877 } 878 879 /** 880 * xmlRegFreeState: 881 * @state: the regexp state 882 * 883 * Free a regexp state 884 */ 885 static void 886 xmlRegFreeState(xmlRegStatePtr state) { 887 if (state == NULL) 888 return; 889 890 if (state->trans != NULL) 891 xmlFree(state->trans); 892 if (state->transTo != NULL) 893 xmlFree(state->transTo); 894 xmlFree(state); 895 } 896 897 /** 898 * xmlRegFreeParserCtxt: 899 * @ctxt: the regexp parser context 900 * 901 * Free a regexp parser context 902 */ 903 static void 904 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) { 905 int i; 906 if (ctxt == NULL) 907 return; 908 909 if (ctxt->string != NULL) 910 xmlFree(ctxt->string); 911 if (ctxt->states != NULL) { 912 for (i = 0;i < ctxt->nbStates;i++) 913 xmlRegFreeState(ctxt->states[i]); 914 xmlFree(ctxt->states); 915 } 916 if (ctxt->atoms != NULL) { 917 for (i = 0;i < ctxt->nbAtoms;i++) 918 xmlRegFreeAtom(ctxt->atoms[i]); 919 xmlFree(ctxt->atoms); 920 } 921 if (ctxt->counters != NULL) 922 xmlFree(ctxt->counters); 923 xmlFree(ctxt); 924 } 925 926 /************************************************************************ 927 * * 928 * Display of Data structures * 929 * * 930 ************************************************************************/ 931 932 static void 933 xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) { 934 switch (type) { 935 case XML_REGEXP_EPSILON: 936 fprintf(output, "epsilon "); break; 937 case XML_REGEXP_CHARVAL: 938 fprintf(output, "charval "); break; 939 case XML_REGEXP_RANGES: 940 fprintf(output, "ranges "); break; 941 case XML_REGEXP_SUBREG: 942 fprintf(output, "subexpr "); break; 943 case XML_REGEXP_STRING: 944 fprintf(output, "string "); break; 945 case XML_REGEXP_ANYCHAR: 946 fprintf(output, "anychar "); break; 947 case XML_REGEXP_ANYSPACE: 948 fprintf(output, "anyspace "); break; 949 case XML_REGEXP_NOTSPACE: 950 fprintf(output, "notspace "); break; 951 case XML_REGEXP_INITNAME: 952 fprintf(output, "initname "); break; 953 case XML_REGEXP_NOTINITNAME: 954 fprintf(output, "notinitname "); break; 955 case XML_REGEXP_NAMECHAR: 956 fprintf(output, "namechar "); break; 957 case XML_REGEXP_NOTNAMECHAR: 958 fprintf(output, "notnamechar "); break; 959 case XML_REGEXP_DECIMAL: 960 fprintf(output, "decimal "); break; 961 case XML_REGEXP_NOTDECIMAL: 962 fprintf(output, "notdecimal "); break; 963 case XML_REGEXP_REALCHAR: 964 fprintf(output, "realchar "); break; 965 case XML_REGEXP_NOTREALCHAR: 966 fprintf(output, "notrealchar "); break; 967 case XML_REGEXP_LETTER: 968 fprintf(output, "LETTER "); break; 969 case XML_REGEXP_LETTER_UPPERCASE: 970 fprintf(output, "LETTER_UPPERCASE "); break; 971 case XML_REGEXP_LETTER_LOWERCASE: 972 fprintf(output, "LETTER_LOWERCASE "); break; 973 case XML_REGEXP_LETTER_TITLECASE: 974 fprintf(output, "LETTER_TITLECASE "); break; 975 case XML_REGEXP_LETTER_MODIFIER: 976 fprintf(output, "LETTER_MODIFIER "); break; 977 case XML_REGEXP_LETTER_OTHERS: 978 fprintf(output, "LETTER_OTHERS "); break; 979 case XML_REGEXP_MARK: 980 fprintf(output, "MARK "); break; 981 case XML_REGEXP_MARK_NONSPACING: 982 fprintf(output, "MARK_NONSPACING "); break; 983 case XML_REGEXP_MARK_SPACECOMBINING: 984 fprintf(output, "MARK_SPACECOMBINING "); break; 985 case XML_REGEXP_MARK_ENCLOSING: 986 fprintf(output, "MARK_ENCLOSING "); break; 987 case XML_REGEXP_NUMBER: 988 fprintf(output, "NUMBER "); break; 989 case XML_REGEXP_NUMBER_DECIMAL: 990 fprintf(output, "NUMBER_DECIMAL "); break; 991 case XML_REGEXP_NUMBER_LETTER: 992 fprintf(output, "NUMBER_LETTER "); break; 993 case XML_REGEXP_NUMBER_OTHERS: 994 fprintf(output, "NUMBER_OTHERS "); break; 995 case XML_REGEXP_PUNCT: 996 fprintf(output, "PUNCT "); break; 997 case XML_REGEXP_PUNCT_CONNECTOR: 998 fprintf(output, "PUNCT_CONNECTOR "); break; 999 case XML_REGEXP_PUNCT_DASH: 1000 fprintf(output, "PUNCT_DASH "); break; 1001 case XML_REGEXP_PUNCT_OPEN: 1002 fprintf(output, "PUNCT_OPEN "); break; 1003 case XML_REGEXP_PUNCT_CLOSE: 1004 fprintf(output, "PUNCT_CLOSE "); break; 1005 case XML_REGEXP_PUNCT_INITQUOTE: 1006 fprintf(output, "PUNCT_INITQUOTE "); break; 1007 case XML_REGEXP_PUNCT_FINQUOTE: 1008 fprintf(output, "PUNCT_FINQUOTE "); break; 1009 case XML_REGEXP_PUNCT_OTHERS: 1010 fprintf(output, "PUNCT_OTHERS "); break; 1011 case XML_REGEXP_SEPAR: 1012 fprintf(output, "SEPAR "); break; 1013 case XML_REGEXP_SEPAR_SPACE: 1014 fprintf(output, "SEPAR_SPACE "); break; 1015 case XML_REGEXP_SEPAR_LINE: 1016 fprintf(output, "SEPAR_LINE "); break; 1017 case XML_REGEXP_SEPAR_PARA: 1018 fprintf(output, "SEPAR_PARA "); break; 1019 case XML_REGEXP_SYMBOL: 1020 fprintf(output, "SYMBOL "); break; 1021 case XML_REGEXP_SYMBOL_MATH: 1022 fprintf(output, "SYMBOL_MATH "); break; 1023 case XML_REGEXP_SYMBOL_CURRENCY: 1024 fprintf(output, "SYMBOL_CURRENCY "); break; 1025 case XML_REGEXP_SYMBOL_MODIFIER: 1026 fprintf(output, "SYMBOL_MODIFIER "); break; 1027 case XML_REGEXP_SYMBOL_OTHERS: 1028 fprintf(output, "SYMBOL_OTHERS "); break; 1029 case XML_REGEXP_OTHER: 1030 fprintf(output, "OTHER "); break; 1031 case XML_REGEXP_OTHER_CONTROL: 1032 fprintf(output, "OTHER_CONTROL "); break; 1033 case XML_REGEXP_OTHER_FORMAT: 1034 fprintf(output, "OTHER_FORMAT "); break; 1035 case XML_REGEXP_OTHER_PRIVATE: 1036 fprintf(output, "OTHER_PRIVATE "); break; 1037 case XML_REGEXP_OTHER_NA: 1038 fprintf(output, "OTHER_NA "); break; 1039 case XML_REGEXP_BLOCK_NAME: 1040 fprintf(output, "BLOCK "); break; 1041 } 1042 } 1043 1044 static void 1045 xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) { 1046 switch (type) { 1047 case XML_REGEXP_QUANT_EPSILON: 1048 fprintf(output, "epsilon "); break; 1049 case XML_REGEXP_QUANT_ONCE: 1050 fprintf(output, "once "); break; 1051 case XML_REGEXP_QUANT_OPT: 1052 fprintf(output, "? "); break; 1053 case XML_REGEXP_QUANT_MULT: 1054 fprintf(output, "* "); break; 1055 case XML_REGEXP_QUANT_PLUS: 1056 fprintf(output, "+ "); break; 1057 case XML_REGEXP_QUANT_RANGE: 1058 fprintf(output, "range "); break; 1059 case XML_REGEXP_QUANT_ONCEONLY: 1060 fprintf(output, "onceonly "); break; 1061 case XML_REGEXP_QUANT_ALL: 1062 fprintf(output, "all "); break; 1063 } 1064 } 1065 static void 1066 xmlRegPrintRange(FILE *output, xmlRegRangePtr range) { 1067 fprintf(output, " range: "); 1068 if (range->neg) 1069 fprintf(output, "negative "); 1070 xmlRegPrintAtomType(output, range->type); 1071 fprintf(output, "%c - %c\n", range->start, range->end); 1072 } 1073 1074 static void 1075 xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) { 1076 fprintf(output, " atom: "); 1077 if (atom == NULL) { 1078 fprintf(output, "NULL\n"); 1079 return; 1080 } 1081 if (atom->neg) 1082 fprintf(output, "not "); 1083 xmlRegPrintAtomType(output, atom->type); 1084 xmlRegPrintQuantType(output, atom->quant); 1085 if (atom->quant == XML_REGEXP_QUANT_RANGE) 1086 fprintf(output, "%d-%d ", atom->min, atom->max); 1087 if (atom->type == XML_REGEXP_STRING) 1088 fprintf(output, "'%s' ", (char *) atom->valuep); 1089 if (atom->type == XML_REGEXP_CHARVAL) 1090 fprintf(output, "char %c\n", atom->codepoint); 1091 else if (atom->type == XML_REGEXP_RANGES) { 1092 int i; 1093 fprintf(output, "%d entries\n", atom->nbRanges); 1094 for (i = 0; i < atom->nbRanges;i++) 1095 xmlRegPrintRange(output, atom->ranges[i]); 1096 } else if (atom->type == XML_REGEXP_SUBREG) { 1097 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no); 1098 } else { 1099 fprintf(output, "\n"); 1100 } 1101 } 1102 1103 static void 1104 xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) { 1105 fprintf(output, " trans: "); 1106 if (trans == NULL) { 1107 fprintf(output, "NULL\n"); 1108 return; 1109 } 1110 if (trans->to < 0) { 1111 fprintf(output, "removed\n"); 1112 return; 1113 } 1114 if (trans->nd != 0) { 1115 if (trans->nd == 2) 1116 fprintf(output, "last not determinist, "); 1117 else 1118 fprintf(output, "not determinist, "); 1119 } 1120 if (trans->counter >= 0) { 1121 fprintf(output, "counted %d, ", trans->counter); 1122 } 1123 if (trans->count == REGEXP_ALL_COUNTER) { 1124 fprintf(output, "all transition, "); 1125 } else if (trans->count >= 0) { 1126 fprintf(output, "count based %d, ", trans->count); 1127 } 1128 if (trans->atom == NULL) { 1129 fprintf(output, "epsilon to %d\n", trans->to); 1130 return; 1131 } 1132 if (trans->atom->type == XML_REGEXP_CHARVAL) 1133 fprintf(output, "char %c ", trans->atom->codepoint); 1134 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to); 1135 } 1136 1137 static void 1138 xmlRegPrintState(FILE *output, xmlRegStatePtr state) { 1139 int i; 1140 1141 fprintf(output, " state: "); 1142 if (state == NULL) { 1143 fprintf(output, "NULL\n"); 1144 return; 1145 } 1146 if (state->type == XML_REGEXP_START_STATE) 1147 fprintf(output, "START "); 1148 if (state->type == XML_REGEXP_FINAL_STATE) 1149 fprintf(output, "FINAL "); 1150 1151 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans); 1152 for (i = 0;i < state->nbTrans; i++) { 1153 xmlRegPrintTrans(output, &(state->trans[i])); 1154 } 1155 } 1156 1157 #ifdef DEBUG_REGEXP_GRAPH 1158 static void 1159 xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { 1160 int i; 1161 1162 fprintf(output, " ctxt: "); 1163 if (ctxt == NULL) { 1164 fprintf(output, "NULL\n"); 1165 return; 1166 } 1167 fprintf(output, "'%s' ", ctxt->string); 1168 if (ctxt->error) 1169 fprintf(output, "error "); 1170 if (ctxt->neg) 1171 fprintf(output, "neg "); 1172 fprintf(output, "\n"); 1173 fprintf(output, "%d atoms:\n", ctxt->nbAtoms); 1174 for (i = 0;i < ctxt->nbAtoms; i++) { 1175 fprintf(output, " %02d ", i); 1176 xmlRegPrintAtom(output, ctxt->atoms[i]); 1177 } 1178 if (ctxt->atom != NULL) { 1179 fprintf(output, "current atom:\n"); 1180 xmlRegPrintAtom(output, ctxt->atom); 1181 } 1182 fprintf(output, "%d states:", ctxt->nbStates); 1183 if (ctxt->start != NULL) 1184 fprintf(output, " start: %d", ctxt->start->no); 1185 if (ctxt->end != NULL) 1186 fprintf(output, " end: %d", ctxt->end->no); 1187 fprintf(output, "\n"); 1188 for (i = 0;i < ctxt->nbStates; i++) { 1189 xmlRegPrintState(output, ctxt->states[i]); 1190 } 1191 fprintf(output, "%d counters:\n", ctxt->nbCounters); 1192 for (i = 0;i < ctxt->nbCounters; i++) { 1193 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min, 1194 ctxt->counters[i].max); 1195 } 1196 } 1197 #endif 1198 1199 /************************************************************************ 1200 * * 1201 * Finite Automata structures manipulations * 1202 * * 1203 ************************************************************************/ 1204 1205 static void 1206 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom, 1207 int neg, xmlRegAtomType type, int start, int end, 1208 xmlChar *blockName) { 1209 xmlRegRangePtr range; 1210 1211 if (atom == NULL) { 1212 ERROR("add range: atom is NULL"); 1213 return; 1214 } 1215 if (atom->type != XML_REGEXP_RANGES) { 1216 ERROR("add range: atom is not ranges"); 1217 return; 1218 } 1219 if (atom->maxRanges == 0) { 1220 atom->maxRanges = 4; 1221 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges * 1222 sizeof(xmlRegRangePtr)); 1223 if (atom->ranges == NULL) { 1224 xmlRegexpErrMemory(ctxt, "adding ranges"); 1225 atom->maxRanges = 0; 1226 return; 1227 } 1228 } else if (atom->nbRanges >= atom->maxRanges) { 1229 xmlRegRangePtr *tmp; 1230 atom->maxRanges *= 2; 1231 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges * 1232 sizeof(xmlRegRangePtr)); 1233 if (tmp == NULL) { 1234 xmlRegexpErrMemory(ctxt, "adding ranges"); 1235 atom->maxRanges /= 2; 1236 return; 1237 } 1238 atom->ranges = tmp; 1239 } 1240 range = xmlRegNewRange(ctxt, neg, type, start, end); 1241 if (range == NULL) 1242 return; 1243 range->blockName = blockName; 1244 atom->ranges[atom->nbRanges++] = range; 1245 1246 } 1247 1248 static int 1249 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) { 1250 if (ctxt->maxCounters == 0) { 1251 ctxt->maxCounters = 4; 1252 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters * 1253 sizeof(xmlRegCounter)); 1254 if (ctxt->counters == NULL) { 1255 xmlRegexpErrMemory(ctxt, "allocating counter"); 1256 ctxt->maxCounters = 0; 1257 return(-1); 1258 } 1259 } else if (ctxt->nbCounters >= ctxt->maxCounters) { 1260 xmlRegCounter *tmp; 1261 ctxt->maxCounters *= 2; 1262 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters * 1263 sizeof(xmlRegCounter)); 1264 if (tmp == NULL) { 1265 xmlRegexpErrMemory(ctxt, "allocating counter"); 1266 ctxt->maxCounters /= 2; 1267 return(-1); 1268 } 1269 ctxt->counters = tmp; 1270 } 1271 ctxt->counters[ctxt->nbCounters].min = -1; 1272 ctxt->counters[ctxt->nbCounters].max = -1; 1273 return(ctxt->nbCounters++); 1274 } 1275 1276 static int 1277 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 1278 if (atom == NULL) { 1279 ERROR("atom push: atom is NULL"); 1280 return(-1); 1281 } 1282 if (ctxt->maxAtoms == 0) { 1283 ctxt->maxAtoms = 4; 1284 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms * 1285 sizeof(xmlRegAtomPtr)); 1286 if (ctxt->atoms == NULL) { 1287 xmlRegexpErrMemory(ctxt, "pushing atom"); 1288 ctxt->maxAtoms = 0; 1289 return(-1); 1290 } 1291 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) { 1292 xmlRegAtomPtr *tmp; 1293 ctxt->maxAtoms *= 2; 1294 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms * 1295 sizeof(xmlRegAtomPtr)); 1296 if (tmp == NULL) { 1297 xmlRegexpErrMemory(ctxt, "allocating counter"); 1298 ctxt->maxAtoms /= 2; 1299 return(-1); 1300 } 1301 ctxt->atoms = tmp; 1302 } 1303 atom->no = ctxt->nbAtoms; 1304 ctxt->atoms[ctxt->nbAtoms++] = atom; 1305 return(0); 1306 } 1307 1308 static void 1309 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target, 1310 int from) { 1311 if (target->maxTransTo == 0) { 1312 target->maxTransTo = 8; 1313 target->transTo = (int *) xmlMalloc(target->maxTransTo * 1314 sizeof(int)); 1315 if (target->transTo == NULL) { 1316 xmlRegexpErrMemory(ctxt, "adding transition"); 1317 target->maxTransTo = 0; 1318 return; 1319 } 1320 } else if (target->nbTransTo >= target->maxTransTo) { 1321 int *tmp; 1322 target->maxTransTo *= 2; 1323 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo * 1324 sizeof(int)); 1325 if (tmp == NULL) { 1326 xmlRegexpErrMemory(ctxt, "adding transition"); 1327 target->maxTransTo /= 2; 1328 return; 1329 } 1330 target->transTo = tmp; 1331 } 1332 target->transTo[target->nbTransTo] = from; 1333 target->nbTransTo++; 1334 } 1335 1336 static void 1337 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 1338 xmlRegAtomPtr atom, xmlRegStatePtr target, 1339 int counter, int count) { 1340 1341 int nrtrans; 1342 1343 if (state == NULL) { 1344 ERROR("add state: state is NULL"); 1345 return; 1346 } 1347 if (target == NULL) { 1348 ERROR("add state: target is NULL"); 1349 return; 1350 } 1351 /* 1352 * Other routines follow the philosophy 'When in doubt, add a transition' 1353 * so we check here whether such a transition is already present and, if 1354 * so, silently ignore this request. 1355 */ 1356 1357 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) { 1358 xmlRegTransPtr trans = &(state->trans[nrtrans]); 1359 if ((trans->atom == atom) && 1360 (trans->to == target->no) && 1361 (trans->counter == counter) && 1362 (trans->count == count)) { 1363 #ifdef DEBUG_REGEXP_GRAPH 1364 printf("Ignoring duplicate transition from %d to %d\n", 1365 state->no, target->no); 1366 #endif 1367 return; 1368 } 1369 } 1370 1371 if (state->maxTrans == 0) { 1372 state->maxTrans = 8; 1373 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans * 1374 sizeof(xmlRegTrans)); 1375 if (state->trans == NULL) { 1376 xmlRegexpErrMemory(ctxt, "adding transition"); 1377 state->maxTrans = 0; 1378 return; 1379 } 1380 } else if (state->nbTrans >= state->maxTrans) { 1381 xmlRegTrans *tmp; 1382 state->maxTrans *= 2; 1383 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans * 1384 sizeof(xmlRegTrans)); 1385 if (tmp == NULL) { 1386 xmlRegexpErrMemory(ctxt, "adding transition"); 1387 state->maxTrans /= 2; 1388 return; 1389 } 1390 state->trans = tmp; 1391 } 1392 #ifdef DEBUG_REGEXP_GRAPH 1393 printf("Add trans from %d to %d ", state->no, target->no); 1394 if (count == REGEXP_ALL_COUNTER) 1395 printf("all transition\n"); 1396 else if (count >= 0) 1397 printf("count based %d\n", count); 1398 else if (counter >= 0) 1399 printf("counted %d\n", counter); 1400 else if (atom == NULL) 1401 printf("epsilon transition\n"); 1402 else if (atom != NULL) 1403 xmlRegPrintAtom(stdout, atom); 1404 #endif 1405 1406 state->trans[state->nbTrans].atom = atom; 1407 state->trans[state->nbTrans].to = target->no; 1408 state->trans[state->nbTrans].counter = counter; 1409 state->trans[state->nbTrans].count = count; 1410 state->trans[state->nbTrans].nd = 0; 1411 state->nbTrans++; 1412 xmlRegStateAddTransTo(ctxt, target, state->no); 1413 } 1414 1415 static int 1416 xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { 1417 if (state == NULL) return(-1); 1418 if (ctxt->maxStates == 0) { 1419 ctxt->maxStates = 4; 1420 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates * 1421 sizeof(xmlRegStatePtr)); 1422 if (ctxt->states == NULL) { 1423 xmlRegexpErrMemory(ctxt, "adding state"); 1424 ctxt->maxStates = 0; 1425 return(-1); 1426 } 1427 } else if (ctxt->nbStates >= ctxt->maxStates) { 1428 xmlRegStatePtr *tmp; 1429 ctxt->maxStates *= 2; 1430 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates * 1431 sizeof(xmlRegStatePtr)); 1432 if (tmp == NULL) { 1433 xmlRegexpErrMemory(ctxt, "adding state"); 1434 ctxt->maxStates /= 2; 1435 return(-1); 1436 } 1437 ctxt->states = tmp; 1438 } 1439 state->no = ctxt->nbStates; 1440 ctxt->states[ctxt->nbStates++] = state; 1441 return(0); 1442 } 1443 1444 /** 1445 * xmlFAGenerateAllTransition: 1446 * @ctxt: a regexp parser context 1447 * @from: the from state 1448 * @to: the target state or NULL for building a new one 1449 * @lax: 1450 * 1451 */ 1452 static void 1453 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, 1454 xmlRegStatePtr from, xmlRegStatePtr to, 1455 int lax) { 1456 if (to == NULL) { 1457 to = xmlRegNewState(ctxt); 1458 xmlRegStatePush(ctxt, to); 1459 ctxt->state = to; 1460 } 1461 if (lax) 1462 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER); 1463 else 1464 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER); 1465 } 1466 1467 /** 1468 * xmlFAGenerateEpsilonTransition: 1469 * @ctxt: a regexp parser context 1470 * @from: the from state 1471 * @to: the target state or NULL for building a new one 1472 * 1473 */ 1474 static void 1475 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1476 xmlRegStatePtr from, xmlRegStatePtr to) { 1477 if (to == NULL) { 1478 to = xmlRegNewState(ctxt); 1479 xmlRegStatePush(ctxt, to); 1480 ctxt->state = to; 1481 } 1482 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1); 1483 } 1484 1485 /** 1486 * xmlFAGenerateCountedEpsilonTransition: 1487 * @ctxt: a regexp parser context 1488 * @from: the from state 1489 * @to: the target state or NULL for building a new one 1490 * counter: the counter for that transition 1491 * 1492 */ 1493 static void 1494 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1495 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1496 if (to == NULL) { 1497 to = xmlRegNewState(ctxt); 1498 xmlRegStatePush(ctxt, to); 1499 ctxt->state = to; 1500 } 1501 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1); 1502 } 1503 1504 /** 1505 * xmlFAGenerateCountedTransition: 1506 * @ctxt: a regexp parser context 1507 * @from: the from state 1508 * @to: the target state or NULL for building a new one 1509 * counter: the counter for that transition 1510 * 1511 */ 1512 static void 1513 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, 1514 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1515 if (to == NULL) { 1516 to = xmlRegNewState(ctxt); 1517 xmlRegStatePush(ctxt, to); 1518 ctxt->state = to; 1519 } 1520 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter); 1521 } 1522 1523 /** 1524 * xmlFAGenerateTransitions: 1525 * @ctxt: a regexp parser context 1526 * @from: the from state 1527 * @to: the target state or NULL for building a new one 1528 * @atom: the atom generating the transition 1529 * 1530 * Returns 0 if success and -1 in case of error. 1531 */ 1532 static int 1533 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, 1534 xmlRegStatePtr to, xmlRegAtomPtr atom) { 1535 xmlRegStatePtr end; 1536 1537 if (atom == NULL) { 1538 ERROR("genrate transition: atom == NULL"); 1539 return(-1); 1540 } 1541 if (atom->type == XML_REGEXP_SUBREG) { 1542 /* 1543 * this is a subexpression handling one should not need to 1544 * create a new node except for XML_REGEXP_QUANT_RANGE. 1545 */ 1546 if (xmlRegAtomPush(ctxt, atom) < 0) { 1547 return(-1); 1548 } 1549 if ((to != NULL) && (atom->stop != to) && 1550 (atom->quant != XML_REGEXP_QUANT_RANGE)) { 1551 /* 1552 * Generate an epsilon transition to link to the target 1553 */ 1554 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1555 #ifdef DV 1556 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) && 1557 (atom->quant != XML_REGEXP_QUANT_ONCE)) { 1558 to = xmlRegNewState(ctxt); 1559 xmlRegStatePush(ctxt, to); 1560 ctxt->state = to; 1561 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1562 #endif 1563 } 1564 switch (atom->quant) { 1565 case XML_REGEXP_QUANT_OPT: 1566 atom->quant = XML_REGEXP_QUANT_ONCE; 1567 /* 1568 * transition done to the state after end of atom. 1569 * 1. set transition from atom start to new state 1570 * 2. set transition from atom end to this state. 1571 */ 1572 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0); 1573 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, ctxt->state); 1574 break; 1575 case XML_REGEXP_QUANT_MULT: 1576 atom->quant = XML_REGEXP_QUANT_ONCE; 1577 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1578 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1579 break; 1580 case XML_REGEXP_QUANT_PLUS: 1581 atom->quant = XML_REGEXP_QUANT_ONCE; 1582 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1583 break; 1584 case XML_REGEXP_QUANT_RANGE: { 1585 int counter; 1586 xmlRegStatePtr inter, newstate; 1587 1588 /* 1589 * create the final state now if needed 1590 */ 1591 if (to != NULL) { 1592 newstate = to; 1593 } else { 1594 newstate = xmlRegNewState(ctxt); 1595 xmlRegStatePush(ctxt, newstate); 1596 } 1597 1598 /* 1599 * The principle here is to use counted transition 1600 * to avoid explosion in the number of states in the 1601 * graph. This is clearly more complex but should not 1602 * be exploitable at runtime. 1603 */ 1604 if ((atom->min == 0) && (atom->start0 == NULL)) { 1605 xmlRegAtomPtr copy; 1606 /* 1607 * duplicate a transition based on atom to count next 1608 * occurences after 1. We cannot loop to atom->start 1609 * directly because we need an epsilon transition to 1610 * newstate. 1611 */ 1612 /* ???? For some reason it seems we never reach that 1613 case, I suppose this got optimized out before when 1614 building the automata */ 1615 copy = xmlRegCopyAtom(ctxt, atom); 1616 if (copy == NULL) 1617 return(-1); 1618 copy->quant = XML_REGEXP_QUANT_ONCE; 1619 copy->min = 0; 1620 copy->max = 0; 1621 1622 if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy) 1623 < 0) 1624 return(-1); 1625 inter = ctxt->state; 1626 counter = xmlRegGetCounter(ctxt); 1627 ctxt->counters[counter].min = atom->min - 1; 1628 ctxt->counters[counter].max = atom->max - 1; 1629 /* count the number of times we see it again */ 1630 xmlFAGenerateCountedEpsilonTransition(ctxt, inter, 1631 atom->stop, counter); 1632 /* allow a way out based on the count */ 1633 xmlFAGenerateCountedTransition(ctxt, inter, 1634 newstate, counter); 1635 /* and also allow a direct exit for 0 */ 1636 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 1637 newstate); 1638 } else { 1639 /* 1640 * either we need the atom at least once or there 1641 * is an atom->start0 allowing to easilly plug the 1642 * epsilon transition. 1643 */ 1644 counter = xmlRegGetCounter(ctxt); 1645 ctxt->counters[counter].min = atom->min - 1; 1646 ctxt->counters[counter].max = atom->max - 1; 1647 /* count the number of times we see it again */ 1648 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, 1649 atom->start, counter); 1650 /* allow a way out based on the count */ 1651 xmlFAGenerateCountedTransition(ctxt, atom->stop, 1652 newstate, counter); 1653 /* and if needed allow a direct exit for 0 */ 1654 if (atom->min == 0) 1655 xmlFAGenerateEpsilonTransition(ctxt, atom->start0, 1656 newstate); 1657 1658 } 1659 atom->min = 0; 1660 atom->max = 0; 1661 atom->quant = XML_REGEXP_QUANT_ONCE; 1662 ctxt->state = newstate; 1663 } 1664 default: 1665 break; 1666 } 1667 return(0); 1668 } 1669 if ((atom->min == 0) && (atom->max == 0) && 1670 (atom->quant == XML_REGEXP_QUANT_RANGE)) { 1671 /* 1672 * we can discard the atom and generate an epsilon transition instead 1673 */ 1674 if (to == NULL) { 1675 to = xmlRegNewState(ctxt); 1676 if (to != NULL) 1677 xmlRegStatePush(ctxt, to); 1678 else { 1679 return(-1); 1680 } 1681 } 1682 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1683 ctxt->state = to; 1684 xmlRegFreeAtom(atom); 1685 return(0); 1686 } 1687 if (to == NULL) { 1688 to = xmlRegNewState(ctxt); 1689 if (to != NULL) 1690 xmlRegStatePush(ctxt, to); 1691 else { 1692 return(-1); 1693 } 1694 } 1695 end = to; 1696 if ((atom->quant == XML_REGEXP_QUANT_MULT) || 1697 (atom->quant == XML_REGEXP_QUANT_PLUS)) { 1698 /* 1699 * Do not pollute the target state by adding transitions from 1700 * it as it is likely to be the shared target of multiple branches. 1701 * So isolate with an epsilon transition. 1702 */ 1703 xmlRegStatePtr tmp; 1704 1705 tmp = xmlRegNewState(ctxt); 1706 if (tmp != NULL) 1707 xmlRegStatePush(ctxt, tmp); 1708 else { 1709 return(-1); 1710 } 1711 xmlFAGenerateEpsilonTransition(ctxt, tmp, to); 1712 to = tmp; 1713 } 1714 if (xmlRegAtomPush(ctxt, atom) < 0) { 1715 return(-1); 1716 } 1717 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1); 1718 ctxt->state = end; 1719 switch (atom->quant) { 1720 case XML_REGEXP_QUANT_OPT: 1721 atom->quant = XML_REGEXP_QUANT_ONCE; 1722 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1723 break; 1724 case XML_REGEXP_QUANT_MULT: 1725 atom->quant = XML_REGEXP_QUANT_ONCE; 1726 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1727 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1728 break; 1729 case XML_REGEXP_QUANT_PLUS: 1730 atom->quant = XML_REGEXP_QUANT_ONCE; 1731 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1732 break; 1733 case XML_REGEXP_QUANT_RANGE: 1734 #if DV_test 1735 if (atom->min == 0) { 1736 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1737 } 1738 #endif 1739 break; 1740 default: 1741 break; 1742 } 1743 return(0); 1744 } 1745 1746 /** 1747 * xmlFAReduceEpsilonTransitions: 1748 * @ctxt: a regexp parser context 1749 * @fromnr: the from state 1750 * @tonr: the to state 1751 * @counter: should that transition be associated to a counted 1752 * 1753 */ 1754 static void 1755 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, 1756 int tonr, int counter) { 1757 int transnr; 1758 xmlRegStatePtr from; 1759 xmlRegStatePtr to; 1760 1761 #ifdef DEBUG_REGEXP_GRAPH 1762 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr); 1763 #endif 1764 from = ctxt->states[fromnr]; 1765 if (from == NULL) 1766 return; 1767 to = ctxt->states[tonr]; 1768 if (to == NULL) 1769 return; 1770 if ((to->mark == XML_REGEXP_MARK_START) || 1771 (to->mark == XML_REGEXP_MARK_VISITED)) 1772 return; 1773 1774 to->mark = XML_REGEXP_MARK_VISITED; 1775 if (to->type == XML_REGEXP_FINAL_STATE) { 1776 #ifdef DEBUG_REGEXP_GRAPH 1777 printf("State %d is final, so %d becomes final\n", tonr, fromnr); 1778 #endif 1779 from->type = XML_REGEXP_FINAL_STATE; 1780 } 1781 for (transnr = 0;transnr < to->nbTrans;transnr++) { 1782 if (to->trans[transnr].to < 0) 1783 continue; 1784 if (to->trans[transnr].atom == NULL) { 1785 /* 1786 * Don't remove counted transitions 1787 * Don't loop either 1788 */ 1789 if (to->trans[transnr].to != fromnr) { 1790 if (to->trans[transnr].count >= 0) { 1791 int newto = to->trans[transnr].to; 1792 1793 xmlRegStateAddTrans(ctxt, from, NULL, 1794 ctxt->states[newto], 1795 -1, to->trans[transnr].count); 1796 } else { 1797 #ifdef DEBUG_REGEXP_GRAPH 1798 printf("Found epsilon trans %d from %d to %d\n", 1799 transnr, tonr, to->trans[transnr].to); 1800 #endif 1801 if (to->trans[transnr].counter >= 0) { 1802 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1803 to->trans[transnr].to, 1804 to->trans[transnr].counter); 1805 } else { 1806 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1807 to->trans[transnr].to, 1808 counter); 1809 } 1810 } 1811 } 1812 } else { 1813 int newto = to->trans[transnr].to; 1814 1815 if (to->trans[transnr].counter >= 0) { 1816 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1817 ctxt->states[newto], 1818 to->trans[transnr].counter, -1); 1819 } else { 1820 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1821 ctxt->states[newto], counter, -1); 1822 } 1823 } 1824 } 1825 to->mark = XML_REGEXP_MARK_NORMAL; 1826 } 1827 1828 /** 1829 * xmlFAEliminateSimpleEpsilonTransitions: 1830 * @ctxt: a regexp parser context 1831 * 1832 * Eliminating general epsilon transitions can get costly in the general 1833 * algorithm due to the large amount of generated new transitions and 1834 * associated comparisons. However for simple epsilon transition used just 1835 * to separate building blocks when generating the automata this can be 1836 * reduced to state elimination: 1837 * - if there exists an epsilon from X to Y 1838 * - if there is no other transition from X 1839 * then X and Y are semantically equivalent and X can be eliminated 1840 * If X is the start state then make Y the start state, else replace the 1841 * target of all transitions to X by transitions to Y. 1842 */ 1843 static void 1844 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1845 int statenr, i, j, newto; 1846 xmlRegStatePtr state, tmp; 1847 1848 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1849 state = ctxt->states[statenr]; 1850 if (state == NULL) 1851 continue; 1852 if (state->nbTrans != 1) 1853 continue; 1854 if (state->type == XML_REGEXP_UNREACH_STATE) 1855 continue; 1856 /* is the only transition out a basic transition */ 1857 if ((state->trans[0].atom == NULL) && 1858 (state->trans[0].to >= 0) && 1859 (state->trans[0].to != statenr) && 1860 (state->trans[0].counter < 0) && 1861 (state->trans[0].count < 0)) { 1862 newto = state->trans[0].to; 1863 1864 if (state->type == XML_REGEXP_START_STATE) { 1865 #ifdef DEBUG_REGEXP_GRAPH 1866 printf("Found simple epsilon trans from start %d to %d\n", 1867 statenr, newto); 1868 #endif 1869 } else { 1870 #ifdef DEBUG_REGEXP_GRAPH 1871 printf("Found simple epsilon trans from %d to %d\n", 1872 statenr, newto); 1873 #endif 1874 for (i = 0;i < state->nbTransTo;i++) { 1875 tmp = ctxt->states[state->transTo[i]]; 1876 for (j = 0;j < tmp->nbTrans;j++) { 1877 if (tmp->trans[j].to == statenr) { 1878 #ifdef DEBUG_REGEXP_GRAPH 1879 printf("Changed transition %d on %d to go to %d\n", 1880 j, tmp->no, newto); 1881 #endif 1882 tmp->trans[j].to = -1; 1883 xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom, 1884 ctxt->states[newto], 1885 tmp->trans[j].counter, 1886 tmp->trans[j].count); 1887 } 1888 } 1889 } 1890 if (state->type == XML_REGEXP_FINAL_STATE) 1891 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE; 1892 /* eliminate the transition completely */ 1893 state->nbTrans = 0; 1894 1895 state->type = XML_REGEXP_UNREACH_STATE; 1896 1897 } 1898 1899 } 1900 } 1901 } 1902 /** 1903 * xmlFAEliminateEpsilonTransitions: 1904 * @ctxt: a regexp parser context 1905 * 1906 */ 1907 static void 1908 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1909 int statenr, transnr; 1910 xmlRegStatePtr state; 1911 int has_epsilon; 1912 1913 if (ctxt->states == NULL) return; 1914 1915 /* 1916 * Eliminate simple epsilon transition and the associated unreachable 1917 * states. 1918 */ 1919 xmlFAEliminateSimpleEpsilonTransitions(ctxt); 1920 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1921 state = ctxt->states[statenr]; 1922 if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) { 1923 #ifdef DEBUG_REGEXP_GRAPH 1924 printf("Removed unreachable state %d\n", statenr); 1925 #endif 1926 xmlRegFreeState(state); 1927 ctxt->states[statenr] = NULL; 1928 } 1929 } 1930 1931 has_epsilon = 0; 1932 1933 /* 1934 * Build the completed transitions bypassing the epsilons 1935 * Use a marking algorithm to avoid loops 1936 * Mark sink states too. 1937 * Process from the latests states backward to the start when 1938 * there is long cascading epsilon chains this minimize the 1939 * recursions and transition compares when adding the new ones 1940 */ 1941 for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) { 1942 state = ctxt->states[statenr]; 1943 if (state == NULL) 1944 continue; 1945 if ((state->nbTrans == 0) && 1946 (state->type != XML_REGEXP_FINAL_STATE)) { 1947 state->type = XML_REGEXP_SINK_STATE; 1948 } 1949 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1950 if ((state->trans[transnr].atom == NULL) && 1951 (state->trans[transnr].to >= 0)) { 1952 if (state->trans[transnr].to == statenr) { 1953 state->trans[transnr].to = -1; 1954 #ifdef DEBUG_REGEXP_GRAPH 1955 printf("Removed loopback epsilon trans %d on %d\n", 1956 transnr, statenr); 1957 #endif 1958 } else if (state->trans[transnr].count < 0) { 1959 int newto = state->trans[transnr].to; 1960 1961 #ifdef DEBUG_REGEXP_GRAPH 1962 printf("Found epsilon trans %d from %d to %d\n", 1963 transnr, statenr, newto); 1964 #endif 1965 has_epsilon = 1; 1966 state->trans[transnr].to = -2; 1967 state->mark = XML_REGEXP_MARK_START; 1968 xmlFAReduceEpsilonTransitions(ctxt, statenr, 1969 newto, state->trans[transnr].counter); 1970 state->mark = XML_REGEXP_MARK_NORMAL; 1971 #ifdef DEBUG_REGEXP_GRAPH 1972 } else { 1973 printf("Found counted transition %d on %d\n", 1974 transnr, statenr); 1975 #endif 1976 } 1977 } 1978 } 1979 } 1980 /* 1981 * Eliminate the epsilon transitions 1982 */ 1983 if (has_epsilon) { 1984 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1985 state = ctxt->states[statenr]; 1986 if (state == NULL) 1987 continue; 1988 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1989 xmlRegTransPtr trans = &(state->trans[transnr]); 1990 if ((trans->atom == NULL) && 1991 (trans->count < 0) && 1992 (trans->to >= 0)) { 1993 trans->to = -1; 1994 } 1995 } 1996 } 1997 } 1998 1999 /* 2000 * Use this pass to detect unreachable states too 2001 */ 2002 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2003 state = ctxt->states[statenr]; 2004 if (state != NULL) 2005 state->reached = XML_REGEXP_MARK_NORMAL; 2006 } 2007 state = ctxt->states[0]; 2008 if (state != NULL) 2009 state->reached = XML_REGEXP_MARK_START; 2010 while (state != NULL) { 2011 xmlRegStatePtr target = NULL; 2012 state->reached = XML_REGEXP_MARK_VISITED; 2013 /* 2014 * Mark all states reachable from the current reachable state 2015 */ 2016 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2017 if ((state->trans[transnr].to >= 0) && 2018 ((state->trans[transnr].atom != NULL) || 2019 (state->trans[transnr].count >= 0))) { 2020 int newto = state->trans[transnr].to; 2021 2022 if (ctxt->states[newto] == NULL) 2023 continue; 2024 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) { 2025 ctxt->states[newto]->reached = XML_REGEXP_MARK_START; 2026 target = ctxt->states[newto]; 2027 } 2028 } 2029 } 2030 2031 /* 2032 * find the next accessible state not explored 2033 */ 2034 if (target == NULL) { 2035 for (statenr = 1;statenr < ctxt->nbStates;statenr++) { 2036 state = ctxt->states[statenr]; 2037 if ((state != NULL) && (state->reached == 2038 XML_REGEXP_MARK_START)) { 2039 target = state; 2040 break; 2041 } 2042 } 2043 } 2044 state = target; 2045 } 2046 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2047 state = ctxt->states[statenr]; 2048 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) { 2049 #ifdef DEBUG_REGEXP_GRAPH 2050 printf("Removed unreachable state %d\n", statenr); 2051 #endif 2052 xmlRegFreeState(state); 2053 ctxt->states[statenr] = NULL; 2054 } 2055 } 2056 2057 } 2058 2059 static int 2060 xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) { 2061 int ret = 0; 2062 2063 if ((range1->type == XML_REGEXP_RANGES) || 2064 (range2->type == XML_REGEXP_RANGES) || 2065 (range2->type == XML_REGEXP_SUBREG) || 2066 (range1->type == XML_REGEXP_SUBREG) || 2067 (range1->type == XML_REGEXP_STRING) || 2068 (range2->type == XML_REGEXP_STRING)) 2069 return(-1); 2070 2071 /* put them in order */ 2072 if (range1->type > range2->type) { 2073 xmlRegRangePtr tmp; 2074 2075 tmp = range1; 2076 range1 = range2; 2077 range2 = tmp; 2078 } 2079 if ((range1->type == XML_REGEXP_ANYCHAR) || 2080 (range2->type == XML_REGEXP_ANYCHAR)) { 2081 ret = 1; 2082 } else if ((range1->type == XML_REGEXP_EPSILON) || 2083 (range2->type == XML_REGEXP_EPSILON)) { 2084 return(0); 2085 } else if (range1->type == range2->type) { 2086 if ((range1->type != XML_REGEXP_CHARVAL) || 2087 (range1->end < range2->start) || 2088 (range2->end < range1->start)) 2089 ret = 1; 2090 else 2091 ret = 0; 2092 } else if (range1->type == XML_REGEXP_CHARVAL) { 2093 int codepoint; 2094 int neg = 0; 2095 2096 /* 2097 * just check all codepoints in the range for acceptance, 2098 * this is usually way cheaper since done only once at 2099 * compilation than testing over and over at runtime or 2100 * pushing too many states when evaluating. 2101 */ 2102 if (((range1->neg == 0) && (range2->neg != 0)) || 2103 ((range1->neg != 0) && (range2->neg == 0))) 2104 neg = 1; 2105 2106 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) { 2107 ret = xmlRegCheckCharacterRange(range2->type, codepoint, 2108 0, range2->start, range2->end, 2109 range2->blockName); 2110 if (ret < 0) 2111 return(-1); 2112 if (((neg == 1) && (ret == 0)) || 2113 ((neg == 0) && (ret == 1))) 2114 return(1); 2115 } 2116 return(0); 2117 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) || 2118 (range2->type == XML_REGEXP_BLOCK_NAME)) { 2119 if (range1->type == range2->type) { 2120 ret = xmlStrEqual(range1->blockName, range2->blockName); 2121 } else { 2122 /* 2123 * comparing a block range with anything else is way 2124 * too costly, and maintining the table is like too much 2125 * memory too, so let's force the automata to save state 2126 * here. 2127 */ 2128 return(1); 2129 } 2130 } else if ((range1->type < XML_REGEXP_LETTER) || 2131 (range2->type < XML_REGEXP_LETTER)) { 2132 if ((range1->type == XML_REGEXP_ANYSPACE) && 2133 (range2->type == XML_REGEXP_NOTSPACE)) 2134 ret = 0; 2135 else if ((range1->type == XML_REGEXP_INITNAME) && 2136 (range2->type == XML_REGEXP_NOTINITNAME)) 2137 ret = 0; 2138 else if ((range1->type == XML_REGEXP_NAMECHAR) && 2139 (range2->type == XML_REGEXP_NOTNAMECHAR)) 2140 ret = 0; 2141 else if ((range1->type == XML_REGEXP_DECIMAL) && 2142 (range2->type == XML_REGEXP_NOTDECIMAL)) 2143 ret = 0; 2144 else if ((range1->type == XML_REGEXP_REALCHAR) && 2145 (range2->type == XML_REGEXP_NOTREALCHAR)) 2146 ret = 0; 2147 else { 2148 /* same thing to limit complexity */ 2149 return(1); 2150 } 2151 } else { 2152 ret = 0; 2153 /* range1->type < range2->type here */ 2154 switch (range1->type) { 2155 case XML_REGEXP_LETTER: 2156 /* all disjoint except in the subgroups */ 2157 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) || 2158 (range2->type == XML_REGEXP_LETTER_LOWERCASE) || 2159 (range2->type == XML_REGEXP_LETTER_TITLECASE) || 2160 (range2->type == XML_REGEXP_LETTER_MODIFIER) || 2161 (range2->type == XML_REGEXP_LETTER_OTHERS)) 2162 ret = 1; 2163 break; 2164 case XML_REGEXP_MARK: 2165 if ((range2->type == XML_REGEXP_MARK_NONSPACING) || 2166 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) || 2167 (range2->type == XML_REGEXP_MARK_ENCLOSING)) 2168 ret = 1; 2169 break; 2170 case XML_REGEXP_NUMBER: 2171 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) || 2172 (range2->type == XML_REGEXP_NUMBER_LETTER) || 2173 (range2->type == XML_REGEXP_NUMBER_OTHERS)) 2174 ret = 1; 2175 break; 2176 case XML_REGEXP_PUNCT: 2177 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) || 2178 (range2->type == XML_REGEXP_PUNCT_DASH) || 2179 (range2->type == XML_REGEXP_PUNCT_OPEN) || 2180 (range2->type == XML_REGEXP_PUNCT_CLOSE) || 2181 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) || 2182 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) || 2183 (range2->type == XML_REGEXP_PUNCT_OTHERS)) 2184 ret = 1; 2185 break; 2186 case XML_REGEXP_SEPAR: 2187 if ((range2->type == XML_REGEXP_SEPAR_SPACE) || 2188 (range2->type == XML_REGEXP_SEPAR_LINE) || 2189 (range2->type == XML_REGEXP_SEPAR_PARA)) 2190 ret = 1; 2191 break; 2192 case XML_REGEXP_SYMBOL: 2193 if ((range2->type == XML_REGEXP_SYMBOL_MATH) || 2194 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) || 2195 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) || 2196 (range2->type == XML_REGEXP_SYMBOL_OTHERS)) 2197 ret = 1; 2198 break; 2199 case XML_REGEXP_OTHER: 2200 if ((range2->type == XML_REGEXP_OTHER_CONTROL) || 2201 (range2->type == XML_REGEXP_OTHER_FORMAT) || 2202 (range2->type == XML_REGEXP_OTHER_PRIVATE)) 2203 ret = 1; 2204 break; 2205 default: 2206 if ((range2->type >= XML_REGEXP_LETTER) && 2207 (range2->type < XML_REGEXP_BLOCK_NAME)) 2208 ret = 0; 2209 else { 2210 /* safety net ! */ 2211 return(1); 2212 } 2213 } 2214 } 2215 if (((range1->neg == 0) && (range2->neg != 0)) || 2216 ((range1->neg != 0) && (range2->neg == 0))) 2217 ret = !ret; 2218 return(1); 2219 } 2220 2221 /** 2222 * xmlFACompareAtomTypes: 2223 * @type1: an atom type 2224 * @type2: an atom type 2225 * 2226 * Compares two atoms type to check whether they intersect in some ways, 2227 * this is used by xmlFACompareAtoms only 2228 * 2229 * Returns 1 if they may intersect and 0 otherwise 2230 */ 2231 static int 2232 xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) { 2233 if ((type1 == XML_REGEXP_EPSILON) || 2234 (type1 == XML_REGEXP_CHARVAL) || 2235 (type1 == XML_REGEXP_RANGES) || 2236 (type1 == XML_REGEXP_SUBREG) || 2237 (type1 == XML_REGEXP_STRING) || 2238 (type1 == XML_REGEXP_ANYCHAR)) 2239 return(1); 2240 if ((type2 == XML_REGEXP_EPSILON) || 2241 (type2 == XML_REGEXP_CHARVAL) || 2242 (type2 == XML_REGEXP_RANGES) || 2243 (type2 == XML_REGEXP_SUBREG) || 2244 (type2 == XML_REGEXP_STRING) || 2245 (type2 == XML_REGEXP_ANYCHAR)) 2246 return(1); 2247 2248 if (type1 == type2) return(1); 2249 2250 /* simplify subsequent compares by making sure type1 < type2 */ 2251 if (type1 > type2) { 2252 xmlRegAtomType tmp = type1; 2253 type1 = type2; 2254 type2 = tmp; 2255 } 2256 switch (type1) { 2257 case XML_REGEXP_ANYSPACE: /* \s */ 2258 /* can't be a letter, number, mark, pontuation, symbol */ 2259 if ((type2 == XML_REGEXP_NOTSPACE) || 2260 ((type2 >= XML_REGEXP_LETTER) && 2261 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2262 ((type2 >= XML_REGEXP_NUMBER) && 2263 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2264 ((type2 >= XML_REGEXP_MARK) && 2265 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2266 ((type2 >= XML_REGEXP_PUNCT) && 2267 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2268 ((type2 >= XML_REGEXP_SYMBOL) && 2269 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) 2270 ) return(0); 2271 break; 2272 case XML_REGEXP_NOTSPACE: /* \S */ 2273 break; 2274 case XML_REGEXP_INITNAME: /* \l */ 2275 /* can't be a number, mark, separator, pontuation, symbol or other */ 2276 if ((type2 == XML_REGEXP_NOTINITNAME) || 2277 ((type2 >= XML_REGEXP_NUMBER) && 2278 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2279 ((type2 >= XML_REGEXP_MARK) && 2280 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2281 ((type2 >= XML_REGEXP_SEPAR) && 2282 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2283 ((type2 >= XML_REGEXP_PUNCT) && 2284 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2285 ((type2 >= XML_REGEXP_SYMBOL) && 2286 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2287 ((type2 >= XML_REGEXP_OTHER) && 2288 (type2 <= XML_REGEXP_OTHER_NA)) 2289 ) return(0); 2290 break; 2291 case XML_REGEXP_NOTINITNAME: /* \L */ 2292 break; 2293 case XML_REGEXP_NAMECHAR: /* \c */ 2294 /* can't be a mark, separator, pontuation, symbol or other */ 2295 if ((type2 == XML_REGEXP_NOTNAMECHAR) || 2296 ((type2 >= XML_REGEXP_MARK) && 2297 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2298 ((type2 >= XML_REGEXP_PUNCT) && 2299 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2300 ((type2 >= XML_REGEXP_SEPAR) && 2301 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2302 ((type2 >= XML_REGEXP_SYMBOL) && 2303 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2304 ((type2 >= XML_REGEXP_OTHER) && 2305 (type2 <= XML_REGEXP_OTHER_NA)) 2306 ) return(0); 2307 break; 2308 case XML_REGEXP_NOTNAMECHAR: /* \C */ 2309 break; 2310 case XML_REGEXP_DECIMAL: /* \d */ 2311 /* can't be a letter, mark, separator, pontuation, symbol or other */ 2312 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2313 (type2 == XML_REGEXP_REALCHAR) || 2314 ((type2 >= XML_REGEXP_LETTER) && 2315 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2316 ((type2 >= XML_REGEXP_MARK) && 2317 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2318 ((type2 >= XML_REGEXP_PUNCT) && 2319 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2320 ((type2 >= XML_REGEXP_SEPAR) && 2321 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2322 ((type2 >= XML_REGEXP_SYMBOL) && 2323 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2324 ((type2 >= XML_REGEXP_OTHER) && 2325 (type2 <= XML_REGEXP_OTHER_NA)) 2326 )return(0); 2327 break; 2328 case XML_REGEXP_NOTDECIMAL: /* \D */ 2329 break; 2330 case XML_REGEXP_REALCHAR: /* \w */ 2331 /* can't be a mark, separator, pontuation, symbol or other */ 2332 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2333 ((type2 >= XML_REGEXP_MARK) && 2334 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2335 ((type2 >= XML_REGEXP_PUNCT) && 2336 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2337 ((type2 >= XML_REGEXP_SEPAR) && 2338 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2339 ((type2 >= XML_REGEXP_SYMBOL) && 2340 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2341 ((type2 >= XML_REGEXP_OTHER) && 2342 (type2 <= XML_REGEXP_OTHER_NA)) 2343 )return(0); 2344 break; 2345 case XML_REGEXP_NOTREALCHAR: /* \W */ 2346 break; 2347 /* 2348 * at that point we know both type 1 and type2 are from 2349 * character categories are ordered and are different, 2350 * it becomes simple because this is a partition 2351 */ 2352 case XML_REGEXP_LETTER: 2353 if (type2 <= XML_REGEXP_LETTER_OTHERS) 2354 return(1); 2355 return(0); 2356 case XML_REGEXP_LETTER_UPPERCASE: 2357 case XML_REGEXP_LETTER_LOWERCASE: 2358 case XML_REGEXP_LETTER_TITLECASE: 2359 case XML_REGEXP_LETTER_MODIFIER: 2360 case XML_REGEXP_LETTER_OTHERS: 2361 return(0); 2362 case XML_REGEXP_MARK: 2363 if (type2 <= XML_REGEXP_MARK_ENCLOSING) 2364 return(1); 2365 return(0); 2366 case XML_REGEXP_MARK_NONSPACING: 2367 case XML_REGEXP_MARK_SPACECOMBINING: 2368 case XML_REGEXP_MARK_ENCLOSING: 2369 return(0); 2370 case XML_REGEXP_NUMBER: 2371 if (type2 <= XML_REGEXP_NUMBER_OTHERS) 2372 return(1); 2373 return(0); 2374 case XML_REGEXP_NUMBER_DECIMAL: 2375 case XML_REGEXP_NUMBER_LETTER: 2376 case XML_REGEXP_NUMBER_OTHERS: 2377 return(0); 2378 case XML_REGEXP_PUNCT: 2379 if (type2 <= XML_REGEXP_PUNCT_OTHERS) 2380 return(1); 2381 return(0); 2382 case XML_REGEXP_PUNCT_CONNECTOR: 2383 case XML_REGEXP_PUNCT_DASH: 2384 case XML_REGEXP_PUNCT_OPEN: 2385 case XML_REGEXP_PUNCT_CLOSE: 2386 case XML_REGEXP_PUNCT_INITQUOTE: 2387 case XML_REGEXP_PUNCT_FINQUOTE: 2388 case XML_REGEXP_PUNCT_OTHERS: 2389 return(0); 2390 case XML_REGEXP_SEPAR: 2391 if (type2 <= XML_REGEXP_SEPAR_PARA) 2392 return(1); 2393 return(0); 2394 case XML_REGEXP_SEPAR_SPACE: 2395 case XML_REGEXP_SEPAR_LINE: 2396 case XML_REGEXP_SEPAR_PARA: 2397 return(0); 2398 case XML_REGEXP_SYMBOL: 2399 if (type2 <= XML_REGEXP_SYMBOL_OTHERS) 2400 return(1); 2401 return(0); 2402 case XML_REGEXP_SYMBOL_MATH: 2403 case XML_REGEXP_SYMBOL_CURRENCY: 2404 case XML_REGEXP_SYMBOL_MODIFIER: 2405 case XML_REGEXP_SYMBOL_OTHERS: 2406 return(0); 2407 case XML_REGEXP_OTHER: 2408 if (type2 <= XML_REGEXP_OTHER_NA) 2409 return(1); 2410 return(0); 2411 case XML_REGEXP_OTHER_CONTROL: 2412 case XML_REGEXP_OTHER_FORMAT: 2413 case XML_REGEXP_OTHER_PRIVATE: 2414 case XML_REGEXP_OTHER_NA: 2415 return(0); 2416 default: 2417 break; 2418 } 2419 return(1); 2420 } 2421 2422 /** 2423 * xmlFAEqualAtoms: 2424 * @atom1: an atom 2425 * @atom2: an atom 2426 * 2427 * Compares two atoms to check whether they are the same exactly 2428 * this is used to remove equivalent transitions 2429 * 2430 * Returns 1 if same and 0 otherwise 2431 */ 2432 static int 2433 xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { 2434 int ret = 0; 2435 2436 if (atom1 == atom2) 2437 return(1); 2438 if ((atom1 == NULL) || (atom2 == NULL)) 2439 return(0); 2440 2441 if (atom1->type != atom2->type) 2442 return(0); 2443 switch (atom1->type) { 2444 case XML_REGEXP_EPSILON: 2445 ret = 0; 2446 break; 2447 case XML_REGEXP_STRING: 2448 ret = xmlStrEqual((xmlChar *)atom1->valuep, 2449 (xmlChar *)atom2->valuep); 2450 break; 2451 case XML_REGEXP_CHARVAL: 2452 ret = (atom1->codepoint == atom2->codepoint); 2453 break; 2454 case XML_REGEXP_RANGES: 2455 /* too hard to do in the general case */ 2456 ret = 0; 2457 default: 2458 break; 2459 } 2460 return(ret); 2461 } 2462 2463 /** 2464 * xmlFACompareAtoms: 2465 * @atom1: an atom 2466 * @atom2: an atom 2467 * 2468 * Compares two atoms to check whether they intersect in some ways, 2469 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only 2470 * 2471 * Returns 1 if yes and 0 otherwise 2472 */ 2473 static int 2474 xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { 2475 int ret = 1; 2476 2477 if (atom1 == atom2) 2478 return(1); 2479 if ((atom1 == NULL) || (atom2 == NULL)) 2480 return(0); 2481 2482 if ((atom1->type == XML_REGEXP_ANYCHAR) || 2483 (atom2->type == XML_REGEXP_ANYCHAR)) 2484 return(1); 2485 2486 if (atom1->type > atom2->type) { 2487 xmlRegAtomPtr tmp; 2488 tmp = atom1; 2489 atom1 = atom2; 2490 atom2 = tmp; 2491 } 2492 if (atom1->type != atom2->type) { 2493 ret = xmlFACompareAtomTypes(atom1->type, atom2->type); 2494 /* if they can't intersect at the type level break now */ 2495 if (ret == 0) 2496 return(0); 2497 } 2498 switch (atom1->type) { 2499 case XML_REGEXP_STRING: 2500 ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep, 2501 (xmlChar *)atom2->valuep); 2502 break; 2503 case XML_REGEXP_EPSILON: 2504 goto not_determinist; 2505 case XML_REGEXP_CHARVAL: 2506 if (atom2->type == XML_REGEXP_CHARVAL) { 2507 ret = (atom1->codepoint == atom2->codepoint); 2508 } else { 2509 ret = xmlRegCheckCharacter(atom2, atom1->codepoint); 2510 if (ret < 0) 2511 ret = 1; 2512 } 2513 break; 2514 case XML_REGEXP_RANGES: 2515 if (atom2->type == XML_REGEXP_RANGES) { 2516 int i, j, res; 2517 xmlRegRangePtr r1, r2; 2518 2519 /* 2520 * need to check that none of the ranges eventually matches 2521 */ 2522 for (i = 0;i < atom1->nbRanges;i++) { 2523 for (j = 0;j < atom2->nbRanges;j++) { 2524 r1 = atom1->ranges[i]; 2525 r2 = atom2->ranges[j]; 2526 res = xmlFACompareRanges(r1, r2); 2527 if (res == 1) { 2528 ret = 1; 2529 goto done; 2530 } 2531 } 2532 } 2533 ret = 0; 2534 } 2535 break; 2536 default: 2537 goto not_determinist; 2538 } 2539 done: 2540 if (atom1->neg != atom2->neg) { 2541 ret = !ret; 2542 } 2543 if (ret == 0) 2544 return(0); 2545 not_determinist: 2546 return(1); 2547 } 2548 2549 /** 2550 * xmlFARecurseDeterminism: 2551 * @ctxt: a regexp parser context 2552 * 2553 * Check whether the associated regexp is determinist, 2554 * should be called after xmlFAEliminateEpsilonTransitions() 2555 * 2556 */ 2557 static int 2558 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 2559 int to, xmlRegAtomPtr atom) { 2560 int ret = 1; 2561 int res; 2562 int transnr, nbTrans; 2563 xmlRegTransPtr t1; 2564 2565 if (state == NULL) 2566 return(ret); 2567 /* 2568 * don't recurse on transitions potentially added in the course of 2569 * the elimination. 2570 */ 2571 nbTrans = state->nbTrans; 2572 for (transnr = 0;transnr < nbTrans;transnr++) { 2573 t1 = &(state->trans[transnr]); 2574 /* 2575 * check transitions conflicting with the one looked at 2576 */ 2577 if (t1->atom == NULL) { 2578 if (t1->to < 0) 2579 continue; 2580 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2581 to, atom); 2582 if (res == 0) { 2583 ret = 0; 2584 /* t1->nd = 1; */ 2585 } 2586 continue; 2587 } 2588 if (t1->to != to) 2589 continue; 2590 if (xmlFACompareAtoms(t1->atom, atom)) { 2591 ret = 0; 2592 /* mark the transition as non-deterministic */ 2593 t1->nd = 1; 2594 } 2595 } 2596 return(ret); 2597 } 2598 2599 /** 2600 * xmlFAComputesDeterminism: 2601 * @ctxt: a regexp parser context 2602 * 2603 * Check whether the associated regexp is determinist, 2604 * should be called after xmlFAEliminateEpsilonTransitions() 2605 * 2606 */ 2607 static int 2608 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { 2609 int statenr, transnr; 2610 xmlRegStatePtr state; 2611 xmlRegTransPtr t1, t2, last; 2612 int i; 2613 int ret = 1; 2614 2615 #ifdef DEBUG_REGEXP_GRAPH 2616 printf("xmlFAComputesDeterminism\n"); 2617 xmlRegPrintCtxt(stdout, ctxt); 2618 #endif 2619 if (ctxt->determinist != -1) 2620 return(ctxt->determinist); 2621 2622 /* 2623 * First cleanup the automata removing cancelled transitions 2624 */ 2625 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2626 state = ctxt->states[statenr]; 2627 if (state == NULL) 2628 continue; 2629 if (state->nbTrans < 2) 2630 continue; 2631 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2632 t1 = &(state->trans[transnr]); 2633 /* 2634 * Determinism checks in case of counted or all transitions 2635 * will have to be handled separately 2636 */ 2637 if (t1->atom == NULL) { 2638 /* t1->nd = 1; */ 2639 continue; 2640 } 2641 if (t1->to == -1) /* eliminated */ 2642 continue; 2643 for (i = 0;i < transnr;i++) { 2644 t2 = &(state->trans[i]); 2645 if (t2->to == -1) /* eliminated */ 2646 continue; 2647 if (t2->atom != NULL) { 2648 if (t1->to == t2->to) { 2649 if (xmlFAEqualAtoms(t1->atom, t2->atom)) 2650 t2->to = -1; /* eliminated */ 2651 } 2652 } 2653 } 2654 } 2655 } 2656 2657 /* 2658 * Check for all states that there aren't 2 transitions 2659 * with the same atom and a different target. 2660 */ 2661 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2662 state = ctxt->states[statenr]; 2663 if (state == NULL) 2664 continue; 2665 if (state->nbTrans < 2) 2666 continue; 2667 last = NULL; 2668 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2669 t1 = &(state->trans[transnr]); 2670 /* 2671 * Determinism checks in case of counted or all transitions 2672 * will have to be handled separately 2673 */ 2674 if (t1->atom == NULL) { 2675 continue; 2676 } 2677 if (t1->to == -1) /* eliminated */ 2678 continue; 2679 for (i = 0;i < transnr;i++) { 2680 t2 = &(state->trans[i]); 2681 if (t2->to == -1) /* eliminated */ 2682 continue; 2683 if (t2->atom != NULL) { 2684 /* not determinist ! */ 2685 if (xmlFACompareAtoms(t1->atom, t2->atom)) { 2686 ret = 0; 2687 /* mark the transitions as non-deterministic ones */ 2688 t1->nd = 1; 2689 t2->nd = 1; 2690 last = t1; 2691 } 2692 } else if (t1->to != -1) { 2693 /* 2694 * do the closure in case of remaining specific 2695 * epsilon transitions like choices or all 2696 */ 2697 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2698 t2->to, t2->atom); 2699 /* don't shortcut the computation so all non deterministic 2700 transition get marked down 2701 if (ret == 0) 2702 return(0); 2703 */ 2704 if (ret == 0) { 2705 t1->nd = 1; 2706 /* t2->nd = 1; */ 2707 last = t1; 2708 } 2709 } 2710 } 2711 /* don't shortcut the computation so all non deterministic 2712 transition get marked down 2713 if (ret == 0) 2714 break; */ 2715 } 2716 2717 /* 2718 * mark specifically the last non-deterministic transition 2719 * from a state since there is no need to set-up rollback 2720 * from it 2721 */ 2722 if (last != NULL) { 2723 last->nd = 2; 2724 } 2725 2726 /* don't shortcut the computation so all non deterministic 2727 transition get marked down 2728 if (ret == 0) 2729 break; */ 2730 } 2731 2732 ctxt->determinist = ret; 2733 return(ret); 2734 } 2735 2736 /************************************************************************ 2737 * * 2738 * Routines to check input against transition atoms * 2739 * * 2740 ************************************************************************/ 2741 2742 static int 2743 xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg, 2744 int start, int end, const xmlChar *blockName) { 2745 int ret = 0; 2746 2747 switch (type) { 2748 case XML_REGEXP_STRING: 2749 case XML_REGEXP_SUBREG: 2750 case XML_REGEXP_RANGES: 2751 case XML_REGEXP_EPSILON: 2752 return(-1); 2753 case XML_REGEXP_ANYCHAR: 2754 ret = ((codepoint != '\n') && (codepoint != '\r')); 2755 break; 2756 case XML_REGEXP_CHARVAL: 2757 ret = ((codepoint >= start) && (codepoint <= end)); 2758 break; 2759 case XML_REGEXP_NOTSPACE: 2760 neg = !neg; 2761 case XML_REGEXP_ANYSPACE: 2762 ret = ((codepoint == '\n') || (codepoint == '\r') || 2763 (codepoint == '\t') || (codepoint == ' ')); 2764 break; 2765 case XML_REGEXP_NOTINITNAME: 2766 neg = !neg; 2767 case XML_REGEXP_INITNAME: 2768 ret = (IS_LETTER(codepoint) || 2769 (codepoint == '_') || (codepoint == ':')); 2770 break; 2771 case XML_REGEXP_NOTNAMECHAR: 2772 neg = !neg; 2773 case XML_REGEXP_NAMECHAR: 2774 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) || 2775 (codepoint == '.') || (codepoint == '-') || 2776 (codepoint == '_') || (codepoint == ':') || 2777 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint)); 2778 break; 2779 case XML_REGEXP_NOTDECIMAL: 2780 neg = !neg; 2781 case XML_REGEXP_DECIMAL: 2782 ret = xmlUCSIsCatNd(codepoint); 2783 break; 2784 case XML_REGEXP_REALCHAR: 2785 neg = !neg; 2786 case XML_REGEXP_NOTREALCHAR: 2787 ret = xmlUCSIsCatP(codepoint); 2788 if (ret == 0) 2789 ret = xmlUCSIsCatZ(codepoint); 2790 if (ret == 0) 2791 ret = xmlUCSIsCatC(codepoint); 2792 break; 2793 case XML_REGEXP_LETTER: 2794 ret = xmlUCSIsCatL(codepoint); 2795 break; 2796 case XML_REGEXP_LETTER_UPPERCASE: 2797 ret = xmlUCSIsCatLu(codepoint); 2798 break; 2799 case XML_REGEXP_LETTER_LOWERCASE: 2800 ret = xmlUCSIsCatLl(codepoint); 2801 break; 2802 case XML_REGEXP_LETTER_TITLECASE: 2803 ret = xmlUCSIsCatLt(codepoint); 2804 break; 2805 case XML_REGEXP_LETTER_MODIFIER: 2806 ret = xmlUCSIsCatLm(codepoint); 2807 break; 2808 case XML_REGEXP_LETTER_OTHERS: 2809 ret = xmlUCSIsCatLo(codepoint); 2810 break; 2811 case XML_REGEXP_MARK: 2812 ret = xmlUCSIsCatM(codepoint); 2813 break; 2814 case XML_REGEXP_MARK_NONSPACING: 2815 ret = xmlUCSIsCatMn(codepoint); 2816 break; 2817 case XML_REGEXP_MARK_SPACECOMBINING: 2818 ret = xmlUCSIsCatMc(codepoint); 2819 break; 2820 case XML_REGEXP_MARK_ENCLOSING: 2821 ret = xmlUCSIsCatMe(codepoint); 2822 break; 2823 case XML_REGEXP_NUMBER: 2824 ret = xmlUCSIsCatN(codepoint); 2825 break; 2826 case XML_REGEXP_NUMBER_DECIMAL: 2827 ret = xmlUCSIsCatNd(codepoint); 2828 break; 2829 case XML_REGEXP_NUMBER_LETTER: 2830 ret = xmlUCSIsCatNl(codepoint); 2831 break; 2832 case XML_REGEXP_NUMBER_OTHERS: 2833 ret = xmlUCSIsCatNo(codepoint); 2834 break; 2835 case XML_REGEXP_PUNCT: 2836 ret = xmlUCSIsCatP(codepoint); 2837 break; 2838 case XML_REGEXP_PUNCT_CONNECTOR: 2839 ret = xmlUCSIsCatPc(codepoint); 2840 break; 2841 case XML_REGEXP_PUNCT_DASH: 2842 ret = xmlUCSIsCatPd(codepoint); 2843 break; 2844 case XML_REGEXP_PUNCT_OPEN: 2845 ret = xmlUCSIsCatPs(codepoint); 2846 break; 2847 case XML_REGEXP_PUNCT_CLOSE: 2848 ret = xmlUCSIsCatPe(codepoint); 2849 break; 2850 case XML_REGEXP_PUNCT_INITQUOTE: 2851 ret = xmlUCSIsCatPi(codepoint); 2852 break; 2853 case XML_REGEXP_PUNCT_FINQUOTE: 2854 ret = xmlUCSIsCatPf(codepoint); 2855 break; 2856 case XML_REGEXP_PUNCT_OTHERS: 2857 ret = xmlUCSIsCatPo(codepoint); 2858 break; 2859 case XML_REGEXP_SEPAR: 2860 ret = xmlUCSIsCatZ(codepoint); 2861 break; 2862 case XML_REGEXP_SEPAR_SPACE: 2863 ret = xmlUCSIsCatZs(codepoint); 2864 break; 2865 case XML_REGEXP_SEPAR_LINE: 2866 ret = xmlUCSIsCatZl(codepoint); 2867 break; 2868 case XML_REGEXP_SEPAR_PARA: 2869 ret = xmlUCSIsCatZp(codepoint); 2870 break; 2871 case XML_REGEXP_SYMBOL: 2872 ret = xmlUCSIsCatS(codepoint); 2873 break; 2874 case XML_REGEXP_SYMBOL_MATH: 2875 ret = xmlUCSIsCatSm(codepoint); 2876 break; 2877 case XML_REGEXP_SYMBOL_CURRENCY: 2878 ret = xmlUCSIsCatSc(codepoint); 2879 break; 2880 case XML_REGEXP_SYMBOL_MODIFIER: 2881 ret = xmlUCSIsCatSk(codepoint); 2882 break; 2883 case XML_REGEXP_SYMBOL_OTHERS: 2884 ret = xmlUCSIsCatSo(codepoint); 2885 break; 2886 case XML_REGEXP_OTHER: 2887 ret = xmlUCSIsCatC(codepoint); 2888 break; 2889 case XML_REGEXP_OTHER_CONTROL: 2890 ret = xmlUCSIsCatCc(codepoint); 2891 break; 2892 case XML_REGEXP_OTHER_FORMAT: 2893 ret = xmlUCSIsCatCf(codepoint); 2894 break; 2895 case XML_REGEXP_OTHER_PRIVATE: 2896 ret = xmlUCSIsCatCo(codepoint); 2897 break; 2898 case XML_REGEXP_OTHER_NA: 2899 /* ret = xmlUCSIsCatCn(codepoint); */ 2900 /* Seems it doesn't exist anymore in recent Unicode releases */ 2901 ret = 0; 2902 break; 2903 case XML_REGEXP_BLOCK_NAME: 2904 ret = xmlUCSIsBlock(codepoint, (const char *) blockName); 2905 break; 2906 } 2907 if (neg) 2908 return(!ret); 2909 return(ret); 2910 } 2911 2912 static int 2913 xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) { 2914 int i, ret = 0; 2915 xmlRegRangePtr range; 2916 2917 if ((atom == NULL) || (!IS_CHAR(codepoint))) 2918 return(-1); 2919 2920 switch (atom->type) { 2921 case XML_REGEXP_SUBREG: 2922 case XML_REGEXP_EPSILON: 2923 return(-1); 2924 case XML_REGEXP_CHARVAL: 2925 return(codepoint == atom->codepoint); 2926 case XML_REGEXP_RANGES: { 2927 int accept = 0; 2928 2929 for (i = 0;i < atom->nbRanges;i++) { 2930 range = atom->ranges[i]; 2931 if (range->neg == 2) { 2932 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2933 0, range->start, range->end, 2934 range->blockName); 2935 if (ret != 0) 2936 return(0); /* excluded char */ 2937 } else if (range->neg) { 2938 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2939 0, range->start, range->end, 2940 range->blockName); 2941 if (ret == 0) 2942 accept = 1; 2943 else 2944 return(0); 2945 } else { 2946 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2947 0, range->start, range->end, 2948 range->blockName); 2949 if (ret != 0) 2950 accept = 1; /* might still be excluded */ 2951 } 2952 } 2953 return(accept); 2954 } 2955 case XML_REGEXP_STRING: 2956 printf("TODO: XML_REGEXP_STRING\n"); 2957 return(-1); 2958 case XML_REGEXP_ANYCHAR: 2959 case XML_REGEXP_ANYSPACE: 2960 case XML_REGEXP_NOTSPACE: 2961 case XML_REGEXP_INITNAME: 2962 case XML_REGEXP_NOTINITNAME: 2963 case XML_REGEXP_NAMECHAR: 2964 case XML_REGEXP_NOTNAMECHAR: 2965 case XML_REGEXP_DECIMAL: 2966 case XML_REGEXP_NOTDECIMAL: 2967 case XML_REGEXP_REALCHAR: 2968 case XML_REGEXP_NOTREALCHAR: 2969 case XML_REGEXP_LETTER: 2970 case XML_REGEXP_LETTER_UPPERCASE: 2971 case XML_REGEXP_LETTER_LOWERCASE: 2972 case XML_REGEXP_LETTER_TITLECASE: 2973 case XML_REGEXP_LETTER_MODIFIER: 2974 case XML_REGEXP_LETTER_OTHERS: 2975 case XML_REGEXP_MARK: 2976 case XML_REGEXP_MARK_NONSPACING: 2977 case XML_REGEXP_MARK_SPACECOMBINING: 2978 case XML_REGEXP_MARK_ENCLOSING: 2979 case XML_REGEXP_NUMBER: 2980 case XML_REGEXP_NUMBER_DECIMAL: 2981 case XML_REGEXP_NUMBER_LETTER: 2982 case XML_REGEXP_NUMBER_OTHERS: 2983 case XML_REGEXP_PUNCT: 2984 case XML_REGEXP_PUNCT_CONNECTOR: 2985 case XML_REGEXP_PUNCT_DASH: 2986 case XML_REGEXP_PUNCT_OPEN: 2987 case XML_REGEXP_PUNCT_CLOSE: 2988 case XML_REGEXP_PUNCT_INITQUOTE: 2989 case XML_REGEXP_PUNCT_FINQUOTE: 2990 case XML_REGEXP_PUNCT_OTHERS: 2991 case XML_REGEXP_SEPAR: 2992 case XML_REGEXP_SEPAR_SPACE: 2993 case XML_REGEXP_SEPAR_LINE: 2994 case XML_REGEXP_SEPAR_PARA: 2995 case XML_REGEXP_SYMBOL: 2996 case XML_REGEXP_SYMBOL_MATH: 2997 case XML_REGEXP_SYMBOL_CURRENCY: 2998 case XML_REGEXP_SYMBOL_MODIFIER: 2999 case XML_REGEXP_SYMBOL_OTHERS: 3000 case XML_REGEXP_OTHER: 3001 case XML_REGEXP_OTHER_CONTROL: 3002 case XML_REGEXP_OTHER_FORMAT: 3003 case XML_REGEXP_OTHER_PRIVATE: 3004 case XML_REGEXP_OTHER_NA: 3005 case XML_REGEXP_BLOCK_NAME: 3006 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0, 3007 (const xmlChar *)atom->valuep); 3008 if (atom->neg) 3009 ret = !ret; 3010 break; 3011 } 3012 return(ret); 3013 } 3014 3015 /************************************************************************ 3016 * * 3017 * Saving and restoring state of an execution context * 3018 * * 3019 ************************************************************************/ 3020 3021 #ifdef DEBUG_REGEXP_EXEC 3022 static void 3023 xmlFARegDebugExec(xmlRegExecCtxtPtr exec) { 3024 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index); 3025 if (exec->inputStack != NULL) { 3026 int i; 3027 printf(": "); 3028 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) 3029 printf("%s ", (const char *) 3030 exec->inputStack[exec->inputStackNr - (i + 1)].value); 3031 } else { 3032 printf(": %s", &(exec->inputString[exec->index])); 3033 } 3034 printf("\n"); 3035 } 3036 #endif 3037 3038 static void 3039 xmlFARegExecSave(xmlRegExecCtxtPtr exec) { 3040 #ifdef DEBUG_REGEXP_EXEC 3041 printf("saving "); 3042 exec->transno++; 3043 xmlFARegDebugExec(exec); 3044 exec->transno--; 3045 #endif 3046 #ifdef MAX_PUSH 3047 if (exec->nbPush > MAX_PUSH) { 3048 return; 3049 } 3050 exec->nbPush++; 3051 #endif 3052 3053 if (exec->maxRollbacks == 0) { 3054 exec->maxRollbacks = 4; 3055 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks * 3056 sizeof(xmlRegExecRollback)); 3057 if (exec->rollbacks == NULL) { 3058 xmlRegexpErrMemory(NULL, "saving regexp"); 3059 exec->maxRollbacks = 0; 3060 return; 3061 } 3062 memset(exec->rollbacks, 0, 3063 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 3064 } else if (exec->nbRollbacks >= exec->maxRollbacks) { 3065 xmlRegExecRollback *tmp; 3066 int len = exec->maxRollbacks; 3067 3068 exec->maxRollbacks *= 2; 3069 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks, 3070 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 3071 if (tmp == NULL) { 3072 xmlRegexpErrMemory(NULL, "saving regexp"); 3073 exec->maxRollbacks /= 2; 3074 return; 3075 } 3076 exec->rollbacks = tmp; 3077 tmp = &exec->rollbacks[len]; 3078 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback)); 3079 } 3080 exec->rollbacks[exec->nbRollbacks].state = exec->state; 3081 exec->rollbacks[exec->nbRollbacks].index = exec->index; 3082 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1; 3083 if (exec->comp->nbCounters > 0) { 3084 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3085 exec->rollbacks[exec->nbRollbacks].counts = (int *) 3086 xmlMalloc(exec->comp->nbCounters * sizeof(int)); 3087 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3088 xmlRegexpErrMemory(NULL, "saving regexp"); 3089 exec->status = -5; 3090 return; 3091 } 3092 } 3093 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts, 3094 exec->comp->nbCounters * sizeof(int)); 3095 } 3096 exec->nbRollbacks++; 3097 } 3098 3099 static void 3100 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) { 3101 if (exec->nbRollbacks <= 0) { 3102 exec->status = -1; 3103 #ifdef DEBUG_REGEXP_EXEC 3104 printf("rollback failed on empty stack\n"); 3105 #endif 3106 return; 3107 } 3108 exec->nbRollbacks--; 3109 exec->state = exec->rollbacks[exec->nbRollbacks].state; 3110 exec->index = exec->rollbacks[exec->nbRollbacks].index; 3111 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch; 3112 if (exec->comp->nbCounters > 0) { 3113 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3114 fprintf(stderr, "exec save: allocation failed"); 3115 exec->status = -6; 3116 return; 3117 } 3118 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts, 3119 exec->comp->nbCounters * sizeof(int)); 3120 } 3121 3122 #ifdef DEBUG_REGEXP_EXEC 3123 printf("restored "); 3124 xmlFARegDebugExec(exec); 3125 #endif 3126 } 3127 3128 /************************************************************************ 3129 * * 3130 * Verifier, running an input against a compiled regexp * 3131 * * 3132 ************************************************************************/ 3133 3134 static int 3135 xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { 3136 xmlRegExecCtxt execval; 3137 xmlRegExecCtxtPtr exec = &execval; 3138 int ret, codepoint = 0, len, deter; 3139 3140 exec->inputString = content; 3141 exec->index = 0; 3142 exec->nbPush = 0; 3143 exec->determinist = 1; 3144 exec->maxRollbacks = 0; 3145 exec->nbRollbacks = 0; 3146 exec->rollbacks = NULL; 3147 exec->status = 0; 3148 exec->comp = comp; 3149 exec->state = comp->states[0]; 3150 exec->transno = 0; 3151 exec->transcount = 0; 3152 exec->inputStack = NULL; 3153 exec->inputStackMax = 0; 3154 if (comp->nbCounters > 0) { 3155 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 3156 if (exec->counts == NULL) { 3157 xmlRegexpErrMemory(NULL, "running regexp"); 3158 return(-1); 3159 } 3160 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 3161 } else 3162 exec->counts = NULL; 3163 while ((exec->status == 0) && 3164 ((exec->inputString[exec->index] != 0) || 3165 ((exec->state != NULL) && 3166 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3167 xmlRegTransPtr trans; 3168 xmlRegAtomPtr atom; 3169 3170 /* 3171 * If end of input on non-terminal state, rollback, however we may 3172 * still have epsilon like transition for counted transitions 3173 * on counters, in that case don't break too early. Additionally, 3174 * if we are working on a range like "AB{0,2}", where B is not present, 3175 * we don't want to break. 3176 */ 3177 len = 1; 3178 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) { 3179 /* 3180 * if there is a transition, we must check if 3181 * atom allows minOccurs of 0 3182 */ 3183 if (exec->transno < exec->state->nbTrans) { 3184 trans = &exec->state->trans[exec->transno]; 3185 if (trans->to >=0) { 3186 atom = trans->atom; 3187 if (!((atom->min == 0) && (atom->max > 0))) 3188 goto rollback; 3189 } 3190 } else 3191 goto rollback; 3192 } 3193 3194 exec->transcount = 0; 3195 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3196 trans = &exec->state->trans[exec->transno]; 3197 if (trans->to < 0) 3198 continue; 3199 atom = trans->atom; 3200 ret = 0; 3201 deter = 1; 3202 if (trans->count >= 0) { 3203 int count; 3204 xmlRegCounterPtr counter; 3205 3206 if (exec->counts == NULL) { 3207 exec->status = -1; 3208 goto error; 3209 } 3210 /* 3211 * A counted transition. 3212 */ 3213 3214 count = exec->counts[trans->count]; 3215 counter = &exec->comp->counters[trans->count]; 3216 #ifdef DEBUG_REGEXP_EXEC 3217 printf("testing count %d: val %d, min %d, max %d\n", 3218 trans->count, count, counter->min, counter->max); 3219 #endif 3220 ret = ((count >= counter->min) && (count <= counter->max)); 3221 if ((ret) && (counter->min != counter->max)) 3222 deter = 0; 3223 } else if (atom == NULL) { 3224 fprintf(stderr, "epsilon transition left at runtime\n"); 3225 exec->status = -2; 3226 break; 3227 } else if (exec->inputString[exec->index] != 0) { 3228 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 3229 ret = xmlRegCheckCharacter(atom, codepoint); 3230 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) { 3231 xmlRegStatePtr to = comp->states[trans->to]; 3232 3233 /* 3234 * this is a multiple input sequence 3235 * If there is a counter associated increment it now. 3236 * before potentially saving and rollback 3237 * do not increment if the counter is already over the 3238 * maximum limit in which case get to next transition 3239 */ 3240 if (trans->counter >= 0) { 3241 xmlRegCounterPtr counter; 3242 3243 if ((exec->counts == NULL) || 3244 (exec->comp == NULL) || 3245 (exec->comp->counters == NULL)) { 3246 exec->status = -1; 3247 goto error; 3248 } 3249 counter = &exec->comp->counters[trans->counter]; 3250 if (exec->counts[trans->counter] >= counter->max) 3251 continue; /* for loop on transitions */ 3252 3253 #ifdef DEBUG_REGEXP_EXEC 3254 printf("Increasing count %d\n", trans->counter); 3255 #endif 3256 exec->counts[trans->counter]++; 3257 } 3258 if (exec->state->nbTrans > exec->transno + 1) { 3259 xmlFARegExecSave(exec); 3260 } 3261 exec->transcount = 1; 3262 do { 3263 /* 3264 * Try to progress as much as possible on the input 3265 */ 3266 if (exec->transcount == atom->max) { 3267 break; 3268 } 3269 exec->index += len; 3270 /* 3271 * End of input: stop here 3272 */ 3273 if (exec->inputString[exec->index] == 0) { 3274 exec->index -= len; 3275 break; 3276 } 3277 if (exec->transcount >= atom->min) { 3278 int transno = exec->transno; 3279 xmlRegStatePtr state = exec->state; 3280 3281 /* 3282 * The transition is acceptable save it 3283 */ 3284 exec->transno = -1; /* trick */ 3285 exec->state = to; 3286 xmlFARegExecSave(exec); 3287 exec->transno = transno; 3288 exec->state = state; 3289 } 3290 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 3291 len); 3292 ret = xmlRegCheckCharacter(atom, codepoint); 3293 exec->transcount++; 3294 } while (ret == 1); 3295 if (exec->transcount < atom->min) 3296 ret = 0; 3297 3298 /* 3299 * If the last check failed but one transition was found 3300 * possible, rollback 3301 */ 3302 if (ret < 0) 3303 ret = 0; 3304 if (ret == 0) { 3305 goto rollback; 3306 } 3307 if (trans->counter >= 0) { 3308 if (exec->counts == NULL) { 3309 exec->status = -1; 3310 goto error; 3311 } 3312 #ifdef DEBUG_REGEXP_EXEC 3313 printf("Decreasing count %d\n", trans->counter); 3314 #endif 3315 exec->counts[trans->counter]--; 3316 } 3317 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) { 3318 /* 3319 * we don't match on the codepoint, but minOccurs of 0 3320 * says that's ok. Setting len to 0 inhibits stepping 3321 * over the codepoint. 3322 */ 3323 exec->transcount = 1; 3324 len = 0; 3325 ret = 1; 3326 } 3327 } else if ((atom->min == 0) && (atom->max > 0)) { 3328 /* another spot to match when minOccurs is 0 */ 3329 exec->transcount = 1; 3330 len = 0; 3331 ret = 1; 3332 } 3333 if (ret == 1) { 3334 if ((trans->nd == 1) || 3335 ((trans->count >= 0) && (deter == 0) && 3336 (exec->state->nbTrans > exec->transno + 1))) { 3337 #ifdef DEBUG_REGEXP_EXEC 3338 if (trans->nd == 1) 3339 printf("Saving on nd transition atom %d for %c at %d\n", 3340 trans->atom->no, codepoint, exec->index); 3341 else 3342 printf("Saving on counted transition count %d for %c at %d\n", 3343 trans->count, codepoint, exec->index); 3344 #endif 3345 xmlFARegExecSave(exec); 3346 } 3347 if (trans->counter >= 0) { 3348 xmlRegCounterPtr counter; 3349 3350 /* make sure we don't go over the counter maximum value */ 3351 if ((exec->counts == NULL) || 3352 (exec->comp == NULL) || 3353 (exec->comp->counters == NULL)) { 3354 exec->status = -1; 3355 goto error; 3356 } 3357 counter = &exec->comp->counters[trans->counter]; 3358 if (exec->counts[trans->counter] >= counter->max) 3359 continue; /* for loop on transitions */ 3360 #ifdef DEBUG_REGEXP_EXEC 3361 printf("Increasing count %d\n", trans->counter); 3362 #endif 3363 exec->counts[trans->counter]++; 3364 } 3365 if ((trans->count >= 0) && 3366 (trans->count < REGEXP_ALL_COUNTER)) { 3367 if (exec->counts == NULL) { 3368 exec->status = -1; 3369 goto error; 3370 } 3371 #ifdef DEBUG_REGEXP_EXEC 3372 printf("resetting count %d on transition\n", 3373 trans->count); 3374 #endif 3375 exec->counts[trans->count] = 0; 3376 } 3377 #ifdef DEBUG_REGEXP_EXEC 3378 printf("entering state %d\n", trans->to); 3379 #endif 3380 exec->state = comp->states[trans->to]; 3381 exec->transno = 0; 3382 if (trans->atom != NULL) { 3383 exec->index += len; 3384 } 3385 goto progress; 3386 } else if (ret < 0) { 3387 exec->status = -4; 3388 break; 3389 } 3390 } 3391 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 3392 rollback: 3393 /* 3394 * Failed to find a way out 3395 */ 3396 exec->determinist = 0; 3397 #ifdef DEBUG_REGEXP_EXEC 3398 printf("rollback from state %d on %d:%c\n", exec->state->no, 3399 codepoint,codepoint); 3400 #endif 3401 xmlFARegExecRollBack(exec); 3402 } 3403 progress: 3404 continue; 3405 } 3406 error: 3407 if (exec->rollbacks != NULL) { 3408 if (exec->counts != NULL) { 3409 int i; 3410 3411 for (i = 0;i < exec->maxRollbacks;i++) 3412 if (exec->rollbacks[i].counts != NULL) 3413 xmlFree(exec->rollbacks[i].counts); 3414 } 3415 xmlFree(exec->rollbacks); 3416 } 3417 if (exec->counts != NULL) 3418 xmlFree(exec->counts); 3419 if (exec->status == 0) 3420 return(1); 3421 if (exec->status == -1) { 3422 if (exec->nbPush > MAX_PUSH) 3423 return(-1); 3424 return(0); 3425 } 3426 return(exec->status); 3427 } 3428 3429 /************************************************************************ 3430 * * 3431 * Progressive interface to the verifier one atom at a time * 3432 * * 3433 ************************************************************************/ 3434 #ifdef DEBUG_ERR 3435 static void testerr(xmlRegExecCtxtPtr exec); 3436 #endif 3437 3438 /** 3439 * xmlRegNewExecCtxt: 3440 * @comp: a precompiled regular expression 3441 * @callback: a callback function used for handling progresses in the 3442 * automata matching phase 3443 * @data: the context data associated to the callback in this context 3444 * 3445 * Build a context used for progressive evaluation of a regexp. 3446 * 3447 * Returns the new context 3448 */ 3449 xmlRegExecCtxtPtr 3450 xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) { 3451 xmlRegExecCtxtPtr exec; 3452 3453 if (comp == NULL) 3454 return(NULL); 3455 if ((comp->compact == NULL) && (comp->states == NULL)) 3456 return(NULL); 3457 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt)); 3458 if (exec == NULL) { 3459 xmlRegexpErrMemory(NULL, "creating execution context"); 3460 return(NULL); 3461 } 3462 memset(exec, 0, sizeof(xmlRegExecCtxt)); 3463 exec->inputString = NULL; 3464 exec->index = 0; 3465 exec->determinist = 1; 3466 exec->maxRollbacks = 0; 3467 exec->nbRollbacks = 0; 3468 exec->rollbacks = NULL; 3469 exec->status = 0; 3470 exec->comp = comp; 3471 if (comp->compact == NULL) 3472 exec->state = comp->states[0]; 3473 exec->transno = 0; 3474 exec->transcount = 0; 3475 exec->callback = callback; 3476 exec->data = data; 3477 if (comp->nbCounters > 0) { 3478 /* 3479 * For error handling, exec->counts is allocated twice the size 3480 * the second half is used to store the data in case of rollback 3481 */ 3482 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int) 3483 * 2); 3484 if (exec->counts == NULL) { 3485 xmlRegexpErrMemory(NULL, "creating execution context"); 3486 xmlFree(exec); 3487 return(NULL); 3488 } 3489 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2); 3490 exec->errCounts = &exec->counts[comp->nbCounters]; 3491 } else { 3492 exec->counts = NULL; 3493 exec->errCounts = NULL; 3494 } 3495 exec->inputStackMax = 0; 3496 exec->inputStackNr = 0; 3497 exec->inputStack = NULL; 3498 exec->errStateNo = -1; 3499 exec->errString = NULL; 3500 exec->nbPush = 0; 3501 return(exec); 3502 } 3503 3504 /** 3505 * xmlRegFreeExecCtxt: 3506 * @exec: a regular expression evaulation context 3507 * 3508 * Free the structures associated to a regular expression evaulation context. 3509 */ 3510 void 3511 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) { 3512 if (exec == NULL) 3513 return; 3514 3515 if (exec->rollbacks != NULL) { 3516 if (exec->counts != NULL) { 3517 int i; 3518 3519 for (i = 0;i < exec->maxRollbacks;i++) 3520 if (exec->rollbacks[i].counts != NULL) 3521 xmlFree(exec->rollbacks[i].counts); 3522 } 3523 xmlFree(exec->rollbacks); 3524 } 3525 if (exec->counts != NULL) 3526 xmlFree(exec->counts); 3527 if (exec->inputStack != NULL) { 3528 int i; 3529 3530 for (i = 0;i < exec->inputStackNr;i++) { 3531 if (exec->inputStack[i].value != NULL) 3532 xmlFree(exec->inputStack[i].value); 3533 } 3534 xmlFree(exec->inputStack); 3535 } 3536 if (exec->errString != NULL) 3537 xmlFree(exec->errString); 3538 xmlFree(exec); 3539 } 3540 3541 static void 3542 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, 3543 void *data) { 3544 #ifdef DEBUG_PUSH 3545 printf("saving value: %d:%s\n", exec->inputStackNr, value); 3546 #endif 3547 if (exec->inputStackMax == 0) { 3548 exec->inputStackMax = 4; 3549 exec->inputStack = (xmlRegInputTokenPtr) 3550 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken)); 3551 if (exec->inputStack == NULL) { 3552 xmlRegexpErrMemory(NULL, "pushing input string"); 3553 exec->inputStackMax = 0; 3554 return; 3555 } 3556 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) { 3557 xmlRegInputTokenPtr tmp; 3558 3559 exec->inputStackMax *= 2; 3560 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack, 3561 exec->inputStackMax * sizeof(xmlRegInputToken)); 3562 if (tmp == NULL) { 3563 xmlRegexpErrMemory(NULL, "pushing input string"); 3564 exec->inputStackMax /= 2; 3565 return; 3566 } 3567 exec->inputStack = tmp; 3568 } 3569 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value); 3570 exec->inputStack[exec->inputStackNr].data = data; 3571 exec->inputStackNr++; 3572 exec->inputStack[exec->inputStackNr].value = NULL; 3573 exec->inputStack[exec->inputStackNr].data = NULL; 3574 } 3575 3576 /** 3577 * xmlRegStrEqualWildcard: 3578 * @expStr: the string to be evaluated 3579 * @valStr: the validation string 3580 * 3581 * Checks if both strings are equal or have the same content. "*" 3582 * can be used as a wildcard in @valStr; "|" is used as a seperator of 3583 * substrings in both @expStr and @valStr. 3584 * 3585 * Returns 1 if the comparison is satisfied and the number of substrings 3586 * is equal, 0 otherwise. 3587 */ 3588 3589 static int 3590 xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) { 3591 if (expStr == valStr) return(1); 3592 if (expStr == NULL) return(0); 3593 if (valStr == NULL) return(0); 3594 do { 3595 /* 3596 * Eval if we have a wildcard for the current item. 3597 */ 3598 if (*expStr != *valStr) { 3599 /* if one of them starts with a wildcard make valStr be it */ 3600 if (*valStr == '*') { 3601 const xmlChar *tmp; 3602 3603 tmp = valStr; 3604 valStr = expStr; 3605 expStr = tmp; 3606 } 3607 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) { 3608 do { 3609 if (*valStr == XML_REG_STRING_SEPARATOR) 3610 break; 3611 valStr++; 3612 } while (*valStr != 0); 3613 continue; 3614 } else 3615 return(0); 3616 } 3617 expStr++; 3618 valStr++; 3619 } while (*valStr != 0); 3620 if (*expStr != 0) 3621 return (0); 3622 else 3623 return (1); 3624 } 3625 3626 /** 3627 * xmlRegCompactPushString: 3628 * @exec: a regexp execution context 3629 * @comp: the precompiled exec with a compact table 3630 * @value: a string token input 3631 * @data: data associated to the token to reuse in callbacks 3632 * 3633 * Push one input token in the execution context 3634 * 3635 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3636 * a negative value in case of error. 3637 */ 3638 static int 3639 xmlRegCompactPushString(xmlRegExecCtxtPtr exec, 3640 xmlRegexpPtr comp, 3641 const xmlChar *value, 3642 void *data) { 3643 int state = exec->index; 3644 int i, target; 3645 3646 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL)) 3647 return(-1); 3648 3649 if (value == NULL) { 3650 /* 3651 * are we at a final state ? 3652 */ 3653 if (comp->compact[state * (comp->nbstrings + 1)] == 3654 XML_REGEXP_FINAL_STATE) 3655 return(1); 3656 return(0); 3657 } 3658 3659 #ifdef DEBUG_PUSH 3660 printf("value pushed: %s\n", value); 3661 #endif 3662 3663 /* 3664 * Examine all outside transitions from current state 3665 */ 3666 for (i = 0;i < comp->nbstrings;i++) { 3667 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 3668 if ((target > 0) && (target <= comp->nbstates)) { 3669 target--; /* to avoid 0 */ 3670 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) { 3671 exec->index = target; 3672 if ((exec->callback != NULL) && (comp->transdata != NULL)) { 3673 exec->callback(exec->data, value, 3674 comp->transdata[state * comp->nbstrings + i], data); 3675 } 3676 #ifdef DEBUG_PUSH 3677 printf("entering state %d\n", target); 3678 #endif 3679 if (comp->compact[target * (comp->nbstrings + 1)] == 3680 XML_REGEXP_SINK_STATE) 3681 goto error; 3682 3683 if (comp->compact[target * (comp->nbstrings + 1)] == 3684 XML_REGEXP_FINAL_STATE) 3685 return(1); 3686 return(0); 3687 } 3688 } 3689 } 3690 /* 3691 * Failed to find an exit transition out from current state for the 3692 * current token 3693 */ 3694 #ifdef DEBUG_PUSH 3695 printf("failed to find a transition for %s on state %d\n", value, state); 3696 #endif 3697 error: 3698 if (exec->errString != NULL) 3699 xmlFree(exec->errString); 3700 exec->errString = xmlStrdup(value); 3701 exec->errStateNo = state; 3702 exec->status = -1; 3703 #ifdef DEBUG_ERR 3704 testerr(exec); 3705 #endif 3706 return(-1); 3707 } 3708 3709 /** 3710 * xmlRegExecPushStringInternal: 3711 * @exec: a regexp execution context or NULL to indicate the end 3712 * @value: a string token input 3713 * @data: data associated to the token to reuse in callbacks 3714 * @compound: value was assembled from 2 strings 3715 * 3716 * Push one input token in the execution context 3717 * 3718 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3719 * a negative value in case of error. 3720 */ 3721 static int 3722 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value, 3723 void *data, int compound) { 3724 xmlRegTransPtr trans; 3725 xmlRegAtomPtr atom; 3726 int ret; 3727 int final = 0; 3728 int progress = 1; 3729 3730 if (exec == NULL) 3731 return(-1); 3732 if (exec->comp == NULL) 3733 return(-1); 3734 if (exec->status != 0) 3735 return(exec->status); 3736 3737 if (exec->comp->compact != NULL) 3738 return(xmlRegCompactPushString(exec, exec->comp, value, data)); 3739 3740 if (value == NULL) { 3741 if (exec->state->type == XML_REGEXP_FINAL_STATE) 3742 return(1); 3743 final = 1; 3744 } 3745 3746 #ifdef DEBUG_PUSH 3747 printf("value pushed: %s\n", value); 3748 #endif 3749 /* 3750 * If we have an active rollback stack push the new value there 3751 * and get back to where we were left 3752 */ 3753 if ((value != NULL) && (exec->inputStackNr > 0)) { 3754 xmlFARegExecSaveInputString(exec, value, data); 3755 value = exec->inputStack[exec->index].value; 3756 data = exec->inputStack[exec->index].data; 3757 #ifdef DEBUG_PUSH 3758 printf("value loaded: %s\n", value); 3759 #endif 3760 } 3761 3762 while ((exec->status == 0) && 3763 ((value != NULL) || 3764 ((final == 1) && 3765 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3766 3767 /* 3768 * End of input on non-terminal state, rollback, however we may 3769 * still have epsilon like transition for counted transitions 3770 * on counters, in that case don't break too early. 3771 */ 3772 if ((value == NULL) && (exec->counts == NULL)) 3773 goto rollback; 3774 3775 exec->transcount = 0; 3776 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3777 trans = &exec->state->trans[exec->transno]; 3778 if (trans->to < 0) 3779 continue; 3780 atom = trans->atom; 3781 ret = 0; 3782 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 3783 int i; 3784 int count; 3785 xmlRegTransPtr t; 3786 xmlRegCounterPtr counter; 3787 3788 ret = 0; 3789 3790 #ifdef DEBUG_PUSH 3791 printf("testing all lax %d\n", trans->count); 3792 #endif 3793 /* 3794 * Check all counted transitions from the current state 3795 */ 3796 if ((value == NULL) && (final)) { 3797 ret = 1; 3798 } else if (value != NULL) { 3799 for (i = 0;i < exec->state->nbTrans;i++) { 3800 t = &exec->state->trans[i]; 3801 if ((t->counter < 0) || (t == trans)) 3802 continue; 3803 counter = &exec->comp->counters[t->counter]; 3804 count = exec->counts[t->counter]; 3805 if ((count < counter->max) && 3806 (t->atom != NULL) && 3807 (xmlStrEqual(value, t->atom->valuep))) { 3808 ret = 0; 3809 break; 3810 } 3811 if ((count >= counter->min) && 3812 (count < counter->max) && 3813 (t->atom != NULL) && 3814 (xmlStrEqual(value, t->atom->valuep))) { 3815 ret = 1; 3816 break; 3817 } 3818 } 3819 } 3820 } else if (trans->count == REGEXP_ALL_COUNTER) { 3821 int i; 3822 int count; 3823 xmlRegTransPtr t; 3824 xmlRegCounterPtr counter; 3825 3826 ret = 1; 3827 3828 #ifdef DEBUG_PUSH 3829 printf("testing all %d\n", trans->count); 3830 #endif 3831 /* 3832 * Check all counted transitions from the current state 3833 */ 3834 for (i = 0;i < exec->state->nbTrans;i++) { 3835 t = &exec->state->trans[i]; 3836 if ((t->counter < 0) || (t == trans)) 3837 continue; 3838 counter = &exec->comp->counters[t->counter]; 3839 count = exec->counts[t->counter]; 3840 if ((count < counter->min) || (count > counter->max)) { 3841 ret = 0; 3842 break; 3843 } 3844 } 3845 } else if (trans->count >= 0) { 3846 int count; 3847 xmlRegCounterPtr counter; 3848 3849 /* 3850 * A counted transition. 3851 */ 3852 3853 count = exec->counts[trans->count]; 3854 counter = &exec->comp->counters[trans->count]; 3855 #ifdef DEBUG_PUSH 3856 printf("testing count %d: val %d, min %d, max %d\n", 3857 trans->count, count, counter->min, counter->max); 3858 #endif 3859 ret = ((count >= counter->min) && (count <= counter->max)); 3860 } else if (atom == NULL) { 3861 fprintf(stderr, "epsilon transition left at runtime\n"); 3862 exec->status = -2; 3863 break; 3864 } else if (value != NULL) { 3865 ret = xmlRegStrEqualWildcard(atom->valuep, value); 3866 if (atom->neg) { 3867 ret = !ret; 3868 if (!compound) 3869 ret = 0; 3870 } 3871 if ((ret == 1) && (trans->counter >= 0)) { 3872 xmlRegCounterPtr counter; 3873 int count; 3874 3875 count = exec->counts[trans->counter]; 3876 counter = &exec->comp->counters[trans->counter]; 3877 if (count >= counter->max) 3878 ret = 0; 3879 } 3880 3881 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 3882 xmlRegStatePtr to = exec->comp->states[trans->to]; 3883 3884 /* 3885 * this is a multiple input sequence 3886 */ 3887 if (exec->state->nbTrans > exec->transno + 1) { 3888 if (exec->inputStackNr <= 0) { 3889 xmlFARegExecSaveInputString(exec, value, data); 3890 } 3891 xmlFARegExecSave(exec); 3892 } 3893 exec->transcount = 1; 3894 do { 3895 /* 3896 * Try to progress as much as possible on the input 3897 */ 3898 if (exec->transcount == atom->max) { 3899 break; 3900 } 3901 exec->index++; 3902 value = exec->inputStack[exec->index].value; 3903 data = exec->inputStack[exec->index].data; 3904 #ifdef DEBUG_PUSH 3905 printf("value loaded: %s\n", value); 3906 #endif 3907 3908 /* 3909 * End of input: stop here 3910 */ 3911 if (value == NULL) { 3912 exec->index --; 3913 break; 3914 } 3915 if (exec->transcount >= atom->min) { 3916 int transno = exec->transno; 3917 xmlRegStatePtr state = exec->state; 3918 3919 /* 3920 * The transition is acceptable save it 3921 */ 3922 exec->transno = -1; /* trick */ 3923 exec->state = to; 3924 if (exec->inputStackNr <= 0) { 3925 xmlFARegExecSaveInputString(exec, value, data); 3926 } 3927 xmlFARegExecSave(exec); 3928 exec->transno = transno; 3929 exec->state = state; 3930 } 3931 ret = xmlStrEqual(value, atom->valuep); 3932 exec->transcount++; 3933 } while (ret == 1); 3934 if (exec->transcount < atom->min) 3935 ret = 0; 3936 3937 /* 3938 * If the last check failed but one transition was found 3939 * possible, rollback 3940 */ 3941 if (ret < 0) 3942 ret = 0; 3943 if (ret == 0) { 3944 goto rollback; 3945 } 3946 } 3947 } 3948 if (ret == 1) { 3949 if ((exec->callback != NULL) && (atom != NULL) && 3950 (data != NULL)) { 3951 exec->callback(exec->data, atom->valuep, 3952 atom->data, data); 3953 } 3954 if (exec->state->nbTrans > exec->transno + 1) { 3955 if (exec->inputStackNr <= 0) { 3956 xmlFARegExecSaveInputString(exec, value, data); 3957 } 3958 xmlFARegExecSave(exec); 3959 } 3960 if (trans->counter >= 0) { 3961 #ifdef DEBUG_PUSH 3962 printf("Increasing count %d\n", trans->counter); 3963 #endif 3964 exec->counts[trans->counter]++; 3965 } 3966 if ((trans->count >= 0) && 3967 (trans->count < REGEXP_ALL_COUNTER)) { 3968 #ifdef DEBUG_REGEXP_EXEC 3969 printf("resetting count %d on transition\n", 3970 trans->count); 3971 #endif 3972 exec->counts[trans->count] = 0; 3973 } 3974 #ifdef DEBUG_PUSH 3975 printf("entering state %d\n", trans->to); 3976 #endif 3977 if ((exec->comp->states[trans->to] != NULL) && 3978 (exec->comp->states[trans->to]->type == 3979 XML_REGEXP_SINK_STATE)) { 3980 /* 3981 * entering a sink state, save the current state as error 3982 * state. 3983 */ 3984 if (exec->errString != NULL) 3985 xmlFree(exec->errString); 3986 exec->errString = xmlStrdup(value); 3987 exec->errState = exec->state; 3988 memcpy(exec->errCounts, exec->counts, 3989 exec->comp->nbCounters * sizeof(int)); 3990 } 3991 exec->state = exec->comp->states[trans->to]; 3992 exec->transno = 0; 3993 if (trans->atom != NULL) { 3994 if (exec->inputStack != NULL) { 3995 exec->index++; 3996 if (exec->index < exec->inputStackNr) { 3997 value = exec->inputStack[exec->index].value; 3998 data = exec->inputStack[exec->index].data; 3999 #ifdef DEBUG_PUSH 4000 printf("value loaded: %s\n", value); 4001 #endif 4002 } else { 4003 value = NULL; 4004 data = NULL; 4005 #ifdef DEBUG_PUSH 4006 printf("end of input\n"); 4007 #endif 4008 } 4009 } else { 4010 value = NULL; 4011 data = NULL; 4012 #ifdef DEBUG_PUSH 4013 printf("end of input\n"); 4014 #endif 4015 } 4016 } 4017 goto progress; 4018 } else if (ret < 0) { 4019 exec->status = -4; 4020 break; 4021 } 4022 } 4023 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4024 rollback: 4025 /* 4026 * if we didn't yet rollback on the current input 4027 * store the current state as the error state. 4028 */ 4029 if ((progress) && (exec->state != NULL) && 4030 (exec->state->type != XML_REGEXP_SINK_STATE)) { 4031 progress = 0; 4032 if (exec->errString != NULL) 4033 xmlFree(exec->errString); 4034 exec->errString = xmlStrdup(value); 4035 exec->errState = exec->state; 4036 memcpy(exec->errCounts, exec->counts, 4037 exec->comp->nbCounters * sizeof(int)); 4038 } 4039 4040 /* 4041 * Failed to find a way out 4042 */ 4043 exec->determinist = 0; 4044 xmlFARegExecRollBack(exec); 4045 if (exec->status == 0) { 4046 value = exec->inputStack[exec->index].value; 4047 data = exec->inputStack[exec->index].data; 4048 #ifdef DEBUG_PUSH 4049 printf("value loaded: %s\n", value); 4050 #endif 4051 } 4052 } 4053 continue; 4054 progress: 4055 progress = 1; 4056 continue; 4057 } 4058 if (exec->status == 0) { 4059 return(exec->state->type == XML_REGEXP_FINAL_STATE); 4060 } 4061 #ifdef DEBUG_ERR 4062 if (exec->status < 0) { 4063 testerr(exec); 4064 } 4065 #endif 4066 return(exec->status); 4067 } 4068 4069 /** 4070 * xmlRegExecPushString: 4071 * @exec: a regexp execution context or NULL to indicate the end 4072 * @value: a string token input 4073 * @data: data associated to the token to reuse in callbacks 4074 * 4075 * Push one input token in the execution context 4076 * 4077 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 4078 * a negative value in case of error. 4079 */ 4080 int 4081 xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, 4082 void *data) { 4083 return(xmlRegExecPushStringInternal(exec, value, data, 0)); 4084 } 4085 4086 /** 4087 * xmlRegExecPushString2: 4088 * @exec: a regexp execution context or NULL to indicate the end 4089 * @value: the first string token input 4090 * @value2: the second string token input 4091 * @data: data associated to the token to reuse in callbacks 4092 * 4093 * Push one input token in the execution context 4094 * 4095 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 4096 * a negative value in case of error. 4097 */ 4098 int 4099 xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value, 4100 const xmlChar *value2, void *data) { 4101 xmlChar buf[150]; 4102 int lenn, lenp, ret; 4103 xmlChar *str; 4104 4105 if (exec == NULL) 4106 return(-1); 4107 if (exec->comp == NULL) 4108 return(-1); 4109 if (exec->status != 0) 4110 return(exec->status); 4111 4112 if (value2 == NULL) 4113 return(xmlRegExecPushString(exec, value, data)); 4114 4115 lenn = strlen((char *) value2); 4116 lenp = strlen((char *) value); 4117 4118 if (150 < lenn + lenp + 2) { 4119 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 4120 if (str == NULL) { 4121 exec->status = -1; 4122 return(-1); 4123 } 4124 } else { 4125 str = buf; 4126 } 4127 memcpy(&str[0], value, lenp); 4128 str[lenp] = XML_REG_STRING_SEPARATOR; 4129 memcpy(&str[lenp + 1], value2, lenn); 4130 str[lenn + lenp + 1] = 0; 4131 4132 if (exec->comp->compact != NULL) 4133 ret = xmlRegCompactPushString(exec, exec->comp, str, data); 4134 else 4135 ret = xmlRegExecPushStringInternal(exec, str, data, 1); 4136 4137 if (str != buf) 4138 xmlFree(str); 4139 return(ret); 4140 } 4141 4142 /** 4143 * xmlRegExecGetValues: 4144 * @exec: a regexp execution context 4145 * @err: error extraction or normal one 4146 * @nbval: pointer to the number of accepted values IN/OUT 4147 * @nbneg: return number of negative transitions 4148 * @values: pointer to the array of acceptable values 4149 * @terminal: return value if this was a terminal state 4150 * 4151 * Extract informations from the regexp execution, internal routine to 4152 * implement xmlRegExecNextValues() and xmlRegExecErrInfo() 4153 * 4154 * Returns: 0 in case of success or -1 in case of error. 4155 */ 4156 static int 4157 xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err, 4158 int *nbval, int *nbneg, 4159 xmlChar **values, int *terminal) { 4160 int maxval; 4161 int nb = 0; 4162 4163 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) || 4164 (values == NULL) || (*nbval <= 0)) 4165 return(-1); 4166 4167 maxval = *nbval; 4168 *nbval = 0; 4169 *nbneg = 0; 4170 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) { 4171 xmlRegexpPtr comp; 4172 int target, i, state; 4173 4174 comp = exec->comp; 4175 4176 if (err) { 4177 if (exec->errStateNo == -1) return(-1); 4178 state = exec->errStateNo; 4179 } else { 4180 state = exec->index; 4181 } 4182 if (terminal != NULL) { 4183 if (comp->compact[state * (comp->nbstrings + 1)] == 4184 XML_REGEXP_FINAL_STATE) 4185 *terminal = 1; 4186 else 4187 *terminal = 0; 4188 } 4189 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4190 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4191 if ((target > 0) && (target <= comp->nbstates) && 4192 (comp->compact[(target - 1) * (comp->nbstrings + 1)] != 4193 XML_REGEXP_SINK_STATE)) { 4194 values[nb++] = comp->stringMap[i]; 4195 (*nbval)++; 4196 } 4197 } 4198 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4199 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4200 if ((target > 0) && (target <= comp->nbstates) && 4201 (comp->compact[(target - 1) * (comp->nbstrings + 1)] == 4202 XML_REGEXP_SINK_STATE)) { 4203 values[nb++] = comp->stringMap[i]; 4204 (*nbneg)++; 4205 } 4206 } 4207 } else { 4208 int transno; 4209 xmlRegTransPtr trans; 4210 xmlRegAtomPtr atom; 4211 xmlRegStatePtr state; 4212 4213 if (terminal != NULL) { 4214 if (exec->state->type == XML_REGEXP_FINAL_STATE) 4215 *terminal = 1; 4216 else 4217 *terminal = 0; 4218 } 4219 4220 if (err) { 4221 if (exec->errState == NULL) return(-1); 4222 state = exec->errState; 4223 } else { 4224 if (exec->state == NULL) return(-1); 4225 state = exec->state; 4226 } 4227 for (transno = 0; 4228 (transno < state->nbTrans) && (nb < maxval); 4229 transno++) { 4230 trans = &state->trans[transno]; 4231 if (trans->to < 0) 4232 continue; 4233 atom = trans->atom; 4234 if ((atom == NULL) || (atom->valuep == NULL)) 4235 continue; 4236 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4237 /* this should not be reached but ... */ 4238 TODO; 4239 } else if (trans->count == REGEXP_ALL_COUNTER) { 4240 /* this should not be reached but ... */ 4241 TODO; 4242 } else if (trans->counter >= 0) { 4243 xmlRegCounterPtr counter = NULL; 4244 int count; 4245 4246 if (err) 4247 count = exec->errCounts[trans->counter]; 4248 else 4249 count = exec->counts[trans->counter]; 4250 if (exec->comp != NULL) 4251 counter = &exec->comp->counters[trans->counter]; 4252 if ((counter == NULL) || (count < counter->max)) { 4253 if (atom->neg) 4254 values[nb++] = (xmlChar *) atom->valuep2; 4255 else 4256 values[nb++] = (xmlChar *) atom->valuep; 4257 (*nbval)++; 4258 } 4259 } else { 4260 if ((exec->comp->states[trans->to] != NULL) && 4261 (exec->comp->states[trans->to]->type != 4262 XML_REGEXP_SINK_STATE)) { 4263 if (atom->neg) 4264 values[nb++] = (xmlChar *) atom->valuep2; 4265 else 4266 values[nb++] = (xmlChar *) atom->valuep; 4267 (*nbval)++; 4268 } 4269 } 4270 } 4271 for (transno = 0; 4272 (transno < state->nbTrans) && (nb < maxval); 4273 transno++) { 4274 trans = &state->trans[transno]; 4275 if (trans->to < 0) 4276 continue; 4277 atom = trans->atom; 4278 if ((atom == NULL) || (atom->valuep == NULL)) 4279 continue; 4280 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4281 continue; 4282 } else if (trans->count == REGEXP_ALL_COUNTER) { 4283 continue; 4284 } else if (trans->counter >= 0) { 4285 continue; 4286 } else { 4287 if ((exec->comp->states[trans->to] != NULL) && 4288 (exec->comp->states[trans->to]->type == 4289 XML_REGEXP_SINK_STATE)) { 4290 if (atom->neg) 4291 values[nb++] = (xmlChar *) atom->valuep2; 4292 else 4293 values[nb++] = (xmlChar *) atom->valuep; 4294 (*nbneg)++; 4295 } 4296 } 4297 } 4298 } 4299 return(0); 4300 } 4301 4302 /** 4303 * xmlRegExecNextValues: 4304 * @exec: a regexp execution context 4305 * @nbval: pointer to the number of accepted values IN/OUT 4306 * @nbneg: return number of negative transitions 4307 * @values: pointer to the array of acceptable values 4308 * @terminal: return value if this was a terminal state 4309 * 4310 * Extract informations from the regexp execution, 4311 * the parameter @values must point to an array of @nbval string pointers 4312 * on return nbval will contain the number of possible strings in that 4313 * state and the @values array will be updated with them. The string values 4314 * returned will be freed with the @exec context and don't need to be 4315 * deallocated. 4316 * 4317 * Returns: 0 in case of success or -1 in case of error. 4318 */ 4319 int 4320 xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg, 4321 xmlChar **values, int *terminal) { 4322 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal)); 4323 } 4324 4325 /** 4326 * xmlRegExecErrInfo: 4327 * @exec: a regexp execution context generating an error 4328 * @string: return value for the error string 4329 * @nbval: pointer to the number of accepted values IN/OUT 4330 * @nbneg: return number of negative transitions 4331 * @values: pointer to the array of acceptable values 4332 * @terminal: return value if this was a terminal state 4333 * 4334 * Extract error informations from the regexp execution, the parameter 4335 * @string will be updated with the value pushed and not accepted, 4336 * the parameter @values must point to an array of @nbval string pointers 4337 * on return nbval will contain the number of possible strings in that 4338 * state and the @values array will be updated with them. The string values 4339 * returned will be freed with the @exec context and don't need to be 4340 * deallocated. 4341 * 4342 * Returns: 0 in case of success or -1 in case of error. 4343 */ 4344 int 4345 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string, 4346 int *nbval, int *nbneg, xmlChar **values, int *terminal) { 4347 if (exec == NULL) 4348 return(-1); 4349 if (string != NULL) { 4350 if (exec->status != 0) 4351 *string = exec->errString; 4352 else 4353 *string = NULL; 4354 } 4355 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal)); 4356 } 4357 4358 #ifdef DEBUG_ERR 4359 static void testerr(xmlRegExecCtxtPtr exec) { 4360 const xmlChar *string; 4361 xmlChar *values[5]; 4362 int nb = 5; 4363 int nbneg; 4364 int terminal; 4365 xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal); 4366 } 4367 #endif 4368 4369 #if 0 4370 static int 4371 xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) { 4372 xmlRegTransPtr trans; 4373 xmlRegAtomPtr atom; 4374 int ret; 4375 int codepoint, len; 4376 4377 if (exec == NULL) 4378 return(-1); 4379 if (exec->status != 0) 4380 return(exec->status); 4381 4382 while ((exec->status == 0) && 4383 ((exec->inputString[exec->index] != 0) || 4384 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 4385 4386 /* 4387 * End of input on non-terminal state, rollback, however we may 4388 * still have epsilon like transition for counted transitions 4389 * on counters, in that case don't break too early. 4390 */ 4391 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 4392 goto rollback; 4393 4394 exec->transcount = 0; 4395 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 4396 trans = &exec->state->trans[exec->transno]; 4397 if (trans->to < 0) 4398 continue; 4399 atom = trans->atom; 4400 ret = 0; 4401 if (trans->count >= 0) { 4402 int count; 4403 xmlRegCounterPtr counter; 4404 4405 /* 4406 * A counted transition. 4407 */ 4408 4409 count = exec->counts[trans->count]; 4410 counter = &exec->comp->counters[trans->count]; 4411 #ifdef DEBUG_REGEXP_EXEC 4412 printf("testing count %d: val %d, min %d, max %d\n", 4413 trans->count, count, counter->min, counter->max); 4414 #endif 4415 ret = ((count >= counter->min) && (count <= counter->max)); 4416 } else if (atom == NULL) { 4417 fprintf(stderr, "epsilon transition left at runtime\n"); 4418 exec->status = -2; 4419 break; 4420 } else if (exec->inputString[exec->index] != 0) { 4421 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 4422 ret = xmlRegCheckCharacter(atom, codepoint); 4423 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 4424 xmlRegStatePtr to = exec->comp->states[trans->to]; 4425 4426 /* 4427 * this is a multiple input sequence 4428 */ 4429 if (exec->state->nbTrans > exec->transno + 1) { 4430 xmlFARegExecSave(exec); 4431 } 4432 exec->transcount = 1; 4433 do { 4434 /* 4435 * Try to progress as much as possible on the input 4436 */ 4437 if (exec->transcount == atom->max) { 4438 break; 4439 } 4440 exec->index += len; 4441 /* 4442 * End of input: stop here 4443 */ 4444 if (exec->inputString[exec->index] == 0) { 4445 exec->index -= len; 4446 break; 4447 } 4448 if (exec->transcount >= atom->min) { 4449 int transno = exec->transno; 4450 xmlRegStatePtr state = exec->state; 4451 4452 /* 4453 * The transition is acceptable save it 4454 */ 4455 exec->transno = -1; /* trick */ 4456 exec->state = to; 4457 xmlFARegExecSave(exec); 4458 exec->transno = transno; 4459 exec->state = state; 4460 } 4461 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 4462 len); 4463 ret = xmlRegCheckCharacter(atom, codepoint); 4464 exec->transcount++; 4465 } while (ret == 1); 4466 if (exec->transcount < atom->min) 4467 ret = 0; 4468 4469 /* 4470 * If the last check failed but one transition was found 4471 * possible, rollback 4472 */ 4473 if (ret < 0) 4474 ret = 0; 4475 if (ret == 0) { 4476 goto rollback; 4477 } 4478 } 4479 } 4480 if (ret == 1) { 4481 if (exec->state->nbTrans > exec->transno + 1) { 4482 xmlFARegExecSave(exec); 4483 } 4484 /* 4485 * restart count for expressions like this ((abc){2})* 4486 */ 4487 if (trans->count >= 0) { 4488 #ifdef DEBUG_REGEXP_EXEC 4489 printf("Reset count %d\n", trans->count); 4490 #endif 4491 exec->counts[trans->count] = 0; 4492 } 4493 if (trans->counter >= 0) { 4494 #ifdef DEBUG_REGEXP_EXEC 4495 printf("Increasing count %d\n", trans->counter); 4496 #endif 4497 exec->counts[trans->counter]++; 4498 } 4499 #ifdef DEBUG_REGEXP_EXEC 4500 printf("entering state %d\n", trans->to); 4501 #endif 4502 exec->state = exec->comp->states[trans->to]; 4503 exec->transno = 0; 4504 if (trans->atom != NULL) { 4505 exec->index += len; 4506 } 4507 goto progress; 4508 } else if (ret < 0) { 4509 exec->status = -4; 4510 break; 4511 } 4512 } 4513 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4514 rollback: 4515 /* 4516 * Failed to find a way out 4517 */ 4518 exec->determinist = 0; 4519 xmlFARegExecRollBack(exec); 4520 } 4521 progress: 4522 continue; 4523 } 4524 } 4525 #endif 4526 /************************************************************************ 4527 * * 4528 * Parser for the Schemas Datatype Regular Expressions * 4529 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs * 4530 * * 4531 ************************************************************************/ 4532 4533 /** 4534 * xmlFAIsChar: 4535 * @ctxt: a regexp parser context 4536 * 4537 * [10] Char ::= [^.\?*+()|#x5B#x5D] 4538 */ 4539 static int 4540 xmlFAIsChar(xmlRegParserCtxtPtr ctxt) { 4541 int cur; 4542 int len; 4543 4544 cur = CUR_SCHAR(ctxt->cur, len); 4545 if ((cur == '.') || (cur == '\\') || (cur == '?') || 4546 (cur == '*') || (cur == '+') || (cur == '(') || 4547 (cur == ')') || (cur == '|') || (cur == 0x5B) || 4548 (cur == 0x5D) || (cur == 0)) 4549 return(-1); 4550 return(cur); 4551 } 4552 4553 /** 4554 * xmlFAParseCharProp: 4555 * @ctxt: a regexp parser context 4556 * 4557 * [27] charProp ::= IsCategory | IsBlock 4558 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation | 4559 * Separators | Symbols | Others 4560 * [29] Letters ::= 'L' [ultmo]? 4561 * [30] Marks ::= 'M' [nce]? 4562 * [31] Numbers ::= 'N' [dlo]? 4563 * [32] Punctuation ::= 'P' [cdseifo]? 4564 * [33] Separators ::= 'Z' [slp]? 4565 * [34] Symbols ::= 'S' [mcko]? 4566 * [35] Others ::= 'C' [cfon]? 4567 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+ 4568 */ 4569 static void 4570 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { 4571 int cur; 4572 xmlRegAtomType type = (xmlRegAtomType) 0; 4573 xmlChar *blockName = NULL; 4574 4575 cur = CUR; 4576 if (cur == 'L') { 4577 NEXT; 4578 cur = CUR; 4579 if (cur == 'u') { 4580 NEXT; 4581 type = XML_REGEXP_LETTER_UPPERCASE; 4582 } else if (cur == 'l') { 4583 NEXT; 4584 type = XML_REGEXP_LETTER_LOWERCASE; 4585 } else if (cur == 't') { 4586 NEXT; 4587 type = XML_REGEXP_LETTER_TITLECASE; 4588 } else if (cur == 'm') { 4589 NEXT; 4590 type = XML_REGEXP_LETTER_MODIFIER; 4591 } else if (cur == 'o') { 4592 NEXT; 4593 type = XML_REGEXP_LETTER_OTHERS; 4594 } else { 4595 type = XML_REGEXP_LETTER; 4596 } 4597 } else if (cur == 'M') { 4598 NEXT; 4599 cur = CUR; 4600 if (cur == 'n') { 4601 NEXT; 4602 /* nonspacing */ 4603 type = XML_REGEXP_MARK_NONSPACING; 4604 } else if (cur == 'c') { 4605 NEXT; 4606 /* spacing combining */ 4607 type = XML_REGEXP_MARK_SPACECOMBINING; 4608 } else if (cur == 'e') { 4609 NEXT; 4610 /* enclosing */ 4611 type = XML_REGEXP_MARK_ENCLOSING; 4612 } else { 4613 /* all marks */ 4614 type = XML_REGEXP_MARK; 4615 } 4616 } else if (cur == 'N') { 4617 NEXT; 4618 cur = CUR; 4619 if (cur == 'd') { 4620 NEXT; 4621 /* digital */ 4622 type = XML_REGEXP_NUMBER_DECIMAL; 4623 } else if (cur == 'l') { 4624 NEXT; 4625 /* letter */ 4626 type = XML_REGEXP_NUMBER_LETTER; 4627 } else if (cur == 'o') { 4628 NEXT; 4629 /* other */ 4630 type = XML_REGEXP_NUMBER_OTHERS; 4631 } else { 4632 /* all numbers */ 4633 type = XML_REGEXP_NUMBER; 4634 } 4635 } else if (cur == 'P') { 4636 NEXT; 4637 cur = CUR; 4638 if (cur == 'c') { 4639 NEXT; 4640 /* connector */ 4641 type = XML_REGEXP_PUNCT_CONNECTOR; 4642 } else if (cur == 'd') { 4643 NEXT; 4644 /* dash */ 4645 type = XML_REGEXP_PUNCT_DASH; 4646 } else if (cur == 's') { 4647 NEXT; 4648 /* open */ 4649 type = XML_REGEXP_PUNCT_OPEN; 4650 } else if (cur == 'e') { 4651 NEXT; 4652 /* close */ 4653 type = XML_REGEXP_PUNCT_CLOSE; 4654 } else if (cur == 'i') { 4655 NEXT; 4656 /* initial quote */ 4657 type = XML_REGEXP_PUNCT_INITQUOTE; 4658 } else if (cur == 'f') { 4659 NEXT; 4660 /* final quote */ 4661 type = XML_REGEXP_PUNCT_FINQUOTE; 4662 } else if (cur == 'o') { 4663 NEXT; 4664 /* other */ 4665 type = XML_REGEXP_PUNCT_OTHERS; 4666 } else { 4667 /* all punctuation */ 4668 type = XML_REGEXP_PUNCT; 4669 } 4670 } else if (cur == 'Z') { 4671 NEXT; 4672 cur = CUR; 4673 if (cur == 's') { 4674 NEXT; 4675 /* space */ 4676 type = XML_REGEXP_SEPAR_SPACE; 4677 } else if (cur == 'l') { 4678 NEXT; 4679 /* line */ 4680 type = XML_REGEXP_SEPAR_LINE; 4681 } else if (cur == 'p') { 4682 NEXT; 4683 /* paragraph */ 4684 type = XML_REGEXP_SEPAR_PARA; 4685 } else { 4686 /* all separators */ 4687 type = XML_REGEXP_SEPAR; 4688 } 4689 } else if (cur == 'S') { 4690 NEXT; 4691 cur = CUR; 4692 if (cur == 'm') { 4693 NEXT; 4694 type = XML_REGEXP_SYMBOL_MATH; 4695 /* math */ 4696 } else if (cur == 'c') { 4697 NEXT; 4698 type = XML_REGEXP_SYMBOL_CURRENCY; 4699 /* currency */ 4700 } else if (cur == 'k') { 4701 NEXT; 4702 type = XML_REGEXP_SYMBOL_MODIFIER; 4703 /* modifiers */ 4704 } else if (cur == 'o') { 4705 NEXT; 4706 type = XML_REGEXP_SYMBOL_OTHERS; 4707 /* other */ 4708 } else { 4709 /* all symbols */ 4710 type = XML_REGEXP_SYMBOL; 4711 } 4712 } else if (cur == 'C') { 4713 NEXT; 4714 cur = CUR; 4715 if (cur == 'c') { 4716 NEXT; 4717 /* control */ 4718 type = XML_REGEXP_OTHER_CONTROL; 4719 } else if (cur == 'f') { 4720 NEXT; 4721 /* format */ 4722 type = XML_REGEXP_OTHER_FORMAT; 4723 } else if (cur == 'o') { 4724 NEXT; 4725 /* private use */ 4726 type = XML_REGEXP_OTHER_PRIVATE; 4727 } else if (cur == 'n') { 4728 NEXT; 4729 /* not assigned */ 4730 type = XML_REGEXP_OTHER_NA; 4731 } else { 4732 /* all others */ 4733 type = XML_REGEXP_OTHER; 4734 } 4735 } else if (cur == 'I') { 4736 const xmlChar *start; 4737 NEXT; 4738 cur = CUR; 4739 if (cur != 's') { 4740 ERROR("IsXXXX expected"); 4741 return; 4742 } 4743 NEXT; 4744 start = ctxt->cur; 4745 cur = CUR; 4746 if (((cur >= 'a') && (cur <= 'z')) || 4747 ((cur >= 'A') && (cur <= 'Z')) || 4748 ((cur >= '0') && (cur <= '9')) || 4749 (cur == 0x2D)) { 4750 NEXT; 4751 cur = CUR; 4752 while (((cur >= 'a') && (cur <= 'z')) || 4753 ((cur >= 'A') && (cur <= 'Z')) || 4754 ((cur >= '0') && (cur <= '9')) || 4755 (cur == 0x2D)) { 4756 NEXT; 4757 cur = CUR; 4758 } 4759 } 4760 type = XML_REGEXP_BLOCK_NAME; 4761 blockName = xmlStrndup(start, ctxt->cur - start); 4762 } else { 4763 ERROR("Unknown char property"); 4764 return; 4765 } 4766 if (ctxt->atom == NULL) { 4767 ctxt->atom = xmlRegNewAtom(ctxt, type); 4768 if (ctxt->atom != NULL) 4769 ctxt->atom->valuep = blockName; 4770 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4771 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4772 type, 0, 0, blockName); 4773 } 4774 } 4775 4776 /** 4777 * xmlFAParseCharClassEsc: 4778 * @ctxt: a regexp parser context 4779 * 4780 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc ) 4781 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] 4782 * [25] catEsc ::= '\p{' charProp '}' 4783 * [26] complEsc ::= '\P{' charProp '}' 4784 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW]) 4785 */ 4786 static void 4787 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { 4788 int cur; 4789 4790 if (CUR == '.') { 4791 if (ctxt->atom == NULL) { 4792 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR); 4793 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4794 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4795 XML_REGEXP_ANYCHAR, 0, 0, NULL); 4796 } 4797 NEXT; 4798 return; 4799 } 4800 if (CUR != '\\') { 4801 ERROR("Escaped sequence: expecting \\"); 4802 return; 4803 } 4804 NEXT; 4805 cur = CUR; 4806 if (cur == 'p') { 4807 NEXT; 4808 if (CUR != '{') { 4809 ERROR("Expecting '{'"); 4810 return; 4811 } 4812 NEXT; 4813 xmlFAParseCharProp(ctxt); 4814 if (CUR != '}') { 4815 ERROR("Expecting '}'"); 4816 return; 4817 } 4818 NEXT; 4819 } else if (cur == 'P') { 4820 NEXT; 4821 if (CUR != '{') { 4822 ERROR("Expecting '{'"); 4823 return; 4824 } 4825 NEXT; 4826 xmlFAParseCharProp(ctxt); 4827 ctxt->atom->neg = 1; 4828 if (CUR != '}') { 4829 ERROR("Expecting '}'"); 4830 return; 4831 } 4832 NEXT; 4833 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') || 4834 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') || 4835 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') || 4836 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) || 4837 (cur == 0x5E)) { 4838 if (ctxt->atom == NULL) { 4839 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 4840 if (ctxt->atom != NULL) { 4841 switch (cur) { 4842 case 'n': 4843 ctxt->atom->codepoint = '\n'; 4844 break; 4845 case 'r': 4846 ctxt->atom->codepoint = '\r'; 4847 break; 4848 case 't': 4849 ctxt->atom->codepoint = '\t'; 4850 break; 4851 default: 4852 ctxt->atom->codepoint = cur; 4853 } 4854 } 4855 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4856 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4857 XML_REGEXP_CHARVAL, cur, cur, NULL); 4858 } 4859 NEXT; 4860 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') || 4861 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') || 4862 (cur == 'w') || (cur == 'W')) { 4863 xmlRegAtomType type = XML_REGEXP_ANYSPACE; 4864 4865 switch (cur) { 4866 case 's': 4867 type = XML_REGEXP_ANYSPACE; 4868 break; 4869 case 'S': 4870 type = XML_REGEXP_NOTSPACE; 4871 break; 4872 case 'i': 4873 type = XML_REGEXP_INITNAME; 4874 break; 4875 case 'I': 4876 type = XML_REGEXP_NOTINITNAME; 4877 break; 4878 case 'c': 4879 type = XML_REGEXP_NAMECHAR; 4880 break; 4881 case 'C': 4882 type = XML_REGEXP_NOTNAMECHAR; 4883 break; 4884 case 'd': 4885 type = XML_REGEXP_DECIMAL; 4886 break; 4887 case 'D': 4888 type = XML_REGEXP_NOTDECIMAL; 4889 break; 4890 case 'w': 4891 type = XML_REGEXP_REALCHAR; 4892 break; 4893 case 'W': 4894 type = XML_REGEXP_NOTREALCHAR; 4895 break; 4896 } 4897 NEXT; 4898 if (ctxt->atom == NULL) { 4899 ctxt->atom = xmlRegNewAtom(ctxt, type); 4900 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4901 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4902 type, 0, 0, NULL); 4903 } 4904 } else { 4905 ERROR("Wrong escape sequence, misuse of character '\\'"); 4906 } 4907 } 4908 4909 /** 4910 * xmlFAParseCharRange: 4911 * @ctxt: a regexp parser context 4912 * 4913 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash 4914 * [18] seRange ::= charOrEsc '-' charOrEsc 4915 * [20] charOrEsc ::= XmlChar | SingleCharEsc 4916 * [21] XmlChar ::= [^\#x2D#x5B#x5D] 4917 * [22] XmlCharIncDash ::= [^\#x5B#x5D] 4918 */ 4919 static void 4920 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { 4921 int cur, len; 4922 int start = -1; 4923 int end = -1; 4924 4925 if (CUR == '\0') { 4926 ERROR("Expecting ']'"); 4927 return; 4928 } 4929 4930 cur = CUR; 4931 if (cur == '\\') { 4932 NEXT; 4933 cur = CUR; 4934 switch (cur) { 4935 case 'n': start = 0xA; break; 4936 case 'r': start = 0xD; break; 4937 case 't': start = 0x9; break; 4938 case '\\': case '|': case '.': case '-': case '^': case '?': 4939 case '*': case '+': case '{': case '}': case '(': case ')': 4940 case '[': case ']': 4941 start = cur; break; 4942 default: 4943 ERROR("Invalid escape value"); 4944 return; 4945 } 4946 end = start; 4947 len = 1; 4948 } else if ((cur != 0x5B) && (cur != 0x5D)) { 4949 end = start = CUR_SCHAR(ctxt->cur, len); 4950 } else { 4951 ERROR("Expecting a char range"); 4952 return; 4953 } 4954 /* 4955 * Since we are "inside" a range, we can assume ctxt->cur is past 4956 * the start of ctxt->string, and PREV should be safe 4957 */ 4958 if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) { 4959 NEXTL(len); 4960 return; 4961 } 4962 NEXTL(len); 4963 cur = CUR; 4964 if ((cur != '-') || (NXT(1) == ']')) { 4965 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4966 XML_REGEXP_CHARVAL, start, end, NULL); 4967 return; 4968 } 4969 NEXT; 4970 cur = CUR; 4971 if (cur == '\\') { 4972 NEXT; 4973 cur = CUR; 4974 switch (cur) { 4975 case 'n': end = 0xA; break; 4976 case 'r': end = 0xD; break; 4977 case 't': end = 0x9; break; 4978 case '\\': case '|': case '.': case '-': case '^': case '?': 4979 case '*': case '+': case '{': case '}': case '(': case ')': 4980 case '[': case ']': 4981 end = cur; break; 4982 default: 4983 ERROR("Invalid escape value"); 4984 return; 4985 } 4986 len = 1; 4987 } else if ((cur != 0x5B) && (cur != 0x5D)) { 4988 end = CUR_SCHAR(ctxt->cur, len); 4989 } else { 4990 ERROR("Expecting the end of a char range"); 4991 return; 4992 } 4993 NEXTL(len); 4994 /* TODO check that the values are acceptable character ranges for XML */ 4995 if (end < start) { 4996 ERROR("End of range is before start of range"); 4997 } else { 4998 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4999 XML_REGEXP_CHARVAL, start, end, NULL); 5000 } 5001 return; 5002 } 5003 5004 /** 5005 * xmlFAParsePosCharGroup: 5006 * @ctxt: a regexp parser context 5007 * 5008 * [14] posCharGroup ::= ( charRange | charClassEsc )+ 5009 */ 5010 static void 5011 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) { 5012 do { 5013 if (CUR == '\\') { 5014 xmlFAParseCharClassEsc(ctxt); 5015 } else { 5016 xmlFAParseCharRange(ctxt); 5017 } 5018 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') && 5019 (CUR != 0) && (ctxt->error == 0)); 5020 } 5021 5022 /** 5023 * xmlFAParseCharGroup: 5024 * @ctxt: a regexp parser context 5025 * 5026 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub 5027 * [15] negCharGroup ::= '^' posCharGroup 5028 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr 5029 * [12] charClassExpr ::= '[' charGroup ']' 5030 */ 5031 static void 5032 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) { 5033 int n = ctxt->neg; 5034 while ((CUR != ']') && (ctxt->error == 0)) { 5035 if (CUR == '^') { 5036 int neg = ctxt->neg; 5037 5038 NEXT; 5039 ctxt->neg = !ctxt->neg; 5040 xmlFAParsePosCharGroup(ctxt); 5041 ctxt->neg = neg; 5042 } else if ((CUR == '-') && (NXT(1) == '[')) { 5043 int neg = ctxt->neg; 5044 ctxt->neg = 2; 5045 NEXT; /* eat the '-' */ 5046 NEXT; /* eat the '[' */ 5047 xmlFAParseCharGroup(ctxt); 5048 if (CUR == ']') { 5049 NEXT; 5050 } else { 5051 ERROR("charClassExpr: ']' expected"); 5052 break; 5053 } 5054 ctxt->neg = neg; 5055 break; 5056 } else if (CUR != ']') { 5057 xmlFAParsePosCharGroup(ctxt); 5058 } 5059 } 5060 ctxt->neg = n; 5061 } 5062 5063 /** 5064 * xmlFAParseCharClass: 5065 * @ctxt: a regexp parser context 5066 * 5067 * [11] charClass ::= charClassEsc | charClassExpr 5068 * [12] charClassExpr ::= '[' charGroup ']' 5069 */ 5070 static void 5071 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) { 5072 if (CUR == '[') { 5073 NEXT; 5074 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES); 5075 if (ctxt->atom == NULL) 5076 return; 5077 xmlFAParseCharGroup(ctxt); 5078 if (CUR == ']') { 5079 NEXT; 5080 } else { 5081 ERROR("xmlFAParseCharClass: ']' expected"); 5082 } 5083 } else { 5084 xmlFAParseCharClassEsc(ctxt); 5085 } 5086 } 5087 5088 /** 5089 * xmlFAParseQuantExact: 5090 * @ctxt: a regexp parser context 5091 * 5092 * [8] QuantExact ::= [0-9]+ 5093 * 5094 * Returns 0 if success or -1 in case of error 5095 */ 5096 static int 5097 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) { 5098 int ret = 0; 5099 int ok = 0; 5100 5101 while ((CUR >= '0') && (CUR <= '9')) { 5102 ret = ret * 10 + (CUR - '0'); 5103 ok = 1; 5104 NEXT; 5105 } 5106 if (ok != 1) { 5107 return(-1); 5108 } 5109 return(ret); 5110 } 5111 5112 /** 5113 * xmlFAParseQuantifier: 5114 * @ctxt: a regexp parser context 5115 * 5116 * [4] quantifier ::= [?*+] | ( '{' quantity '}' ) 5117 * [5] quantity ::= quantRange | quantMin | QuantExact 5118 * [6] quantRange ::= QuantExact ',' QuantExact 5119 * [7] quantMin ::= QuantExact ',' 5120 * [8] QuantExact ::= [0-9]+ 5121 */ 5122 static int 5123 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) { 5124 int cur; 5125 5126 cur = CUR; 5127 if ((cur == '?') || (cur == '*') || (cur == '+')) { 5128 if (ctxt->atom != NULL) { 5129 if (cur == '?') 5130 ctxt->atom->quant = XML_REGEXP_QUANT_OPT; 5131 else if (cur == '*') 5132 ctxt->atom->quant = XML_REGEXP_QUANT_MULT; 5133 else if (cur == '+') 5134 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS; 5135 } 5136 NEXT; 5137 return(1); 5138 } 5139 if (cur == '{') { 5140 int min = 0, max = 0; 5141 5142 NEXT; 5143 cur = xmlFAParseQuantExact(ctxt); 5144 if (cur >= 0) 5145 min = cur; 5146 if (CUR == ',') { 5147 NEXT; 5148 if (CUR == '}') 5149 max = INT_MAX; 5150 else { 5151 cur = xmlFAParseQuantExact(ctxt); 5152 if (cur >= 0) 5153 max = cur; 5154 else { 5155 ERROR("Improper quantifier"); 5156 } 5157 } 5158 } 5159 if (CUR == '}') { 5160 NEXT; 5161 } else { 5162 ERROR("Unterminated quantifier"); 5163 } 5164 if (max == 0) 5165 max = min; 5166 if (ctxt->atom != NULL) { 5167 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE; 5168 ctxt->atom->min = min; 5169 ctxt->atom->max = max; 5170 } 5171 return(1); 5172 } 5173 return(0); 5174 } 5175 5176 /** 5177 * xmlFAParseAtom: 5178 * @ctxt: a regexp parser context 5179 * 5180 * [9] atom ::= Char | charClass | ( '(' regExp ')' ) 5181 */ 5182 static int 5183 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) { 5184 int codepoint, len; 5185 5186 codepoint = xmlFAIsChar(ctxt); 5187 if (codepoint > 0) { 5188 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 5189 if (ctxt->atom == NULL) 5190 return(-1); 5191 codepoint = CUR_SCHAR(ctxt->cur, len); 5192 ctxt->atom->codepoint = codepoint; 5193 NEXTL(len); 5194 return(1); 5195 } else if (CUR == '|') { 5196 return(0); 5197 } else if (CUR == 0) { 5198 return(0); 5199 } else if (CUR == ')') { 5200 return(0); 5201 } else if (CUR == '(') { 5202 xmlRegStatePtr start, oldend, start0; 5203 5204 NEXT; 5205 /* 5206 * this extra Epsilon transition is needed if we count with 0 allowed 5207 * unfortunately this can't be known at that point 5208 */ 5209 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5210 start0 = ctxt->state; 5211 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5212 start = ctxt->state; 5213 oldend = ctxt->end; 5214 ctxt->end = NULL; 5215 ctxt->atom = NULL; 5216 xmlFAParseRegExp(ctxt, 0); 5217 if (CUR == ')') { 5218 NEXT; 5219 } else { 5220 ERROR("xmlFAParseAtom: expecting ')'"); 5221 } 5222 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG); 5223 if (ctxt->atom == NULL) 5224 return(-1); 5225 ctxt->atom->start = start; 5226 ctxt->atom->start0 = start0; 5227 ctxt->atom->stop = ctxt->state; 5228 ctxt->end = oldend; 5229 return(1); 5230 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) { 5231 xmlFAParseCharClass(ctxt); 5232 return(1); 5233 } 5234 return(0); 5235 } 5236 5237 /** 5238 * xmlFAParsePiece: 5239 * @ctxt: a regexp parser context 5240 * 5241 * [3] piece ::= atom quantifier? 5242 */ 5243 static int 5244 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) { 5245 int ret; 5246 5247 ctxt->atom = NULL; 5248 ret = xmlFAParseAtom(ctxt); 5249 if (ret == 0) 5250 return(0); 5251 if (ctxt->atom == NULL) { 5252 ERROR("internal: no atom generated"); 5253 } 5254 xmlFAParseQuantifier(ctxt); 5255 return(1); 5256 } 5257 5258 /** 5259 * xmlFAParseBranch: 5260 * @ctxt: a regexp parser context 5261 * @to: optional target to the end of the branch 5262 * 5263 * @to is used to optimize by removing duplicate path in automata 5264 * in expressions like (a|b)(c|d) 5265 * 5266 * [2] branch ::= piece* 5267 */ 5268 static int 5269 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) { 5270 xmlRegStatePtr previous; 5271 int ret; 5272 5273 previous = ctxt->state; 5274 ret = xmlFAParsePiece(ctxt); 5275 if (ret != 0) { 5276 if (xmlFAGenerateTransitions(ctxt, previous, 5277 (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) 5278 return(-1); 5279 previous = ctxt->state; 5280 ctxt->atom = NULL; 5281 } 5282 while ((ret != 0) && (ctxt->error == 0)) { 5283 ret = xmlFAParsePiece(ctxt); 5284 if (ret != 0) { 5285 if (xmlFAGenerateTransitions(ctxt, previous, 5286 (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) 5287 return(-1); 5288 previous = ctxt->state; 5289 ctxt->atom = NULL; 5290 } 5291 } 5292 return(0); 5293 } 5294 5295 /** 5296 * xmlFAParseRegExp: 5297 * @ctxt: a regexp parser context 5298 * @top: is this the top-level expression ? 5299 * 5300 * [1] regExp ::= branch ( '|' branch )* 5301 */ 5302 static void 5303 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { 5304 xmlRegStatePtr start, end; 5305 5306 /* if not top start should have been generated by an epsilon trans */ 5307 start = ctxt->state; 5308 ctxt->end = NULL; 5309 xmlFAParseBranch(ctxt, NULL); 5310 if (top) { 5311 #ifdef DEBUG_REGEXP_GRAPH 5312 printf("State %d is final\n", ctxt->state->no); 5313 #endif 5314 ctxt->state->type = XML_REGEXP_FINAL_STATE; 5315 } 5316 if (CUR != '|') { 5317 ctxt->end = ctxt->state; 5318 return; 5319 } 5320 end = ctxt->state; 5321 while ((CUR == '|') && (ctxt->error == 0)) { 5322 NEXT; 5323 ctxt->state = start; 5324 ctxt->end = NULL; 5325 xmlFAParseBranch(ctxt, end); 5326 } 5327 if (!top) { 5328 ctxt->state = end; 5329 ctxt->end = end; 5330 } 5331 } 5332 5333 /************************************************************************ 5334 * * 5335 * The basic API * 5336 * * 5337 ************************************************************************/ 5338 5339 /** 5340 * xmlRegexpPrint: 5341 * @output: the file for the output debug 5342 * @regexp: the compiled regexp 5343 * 5344 * Print the content of the compiled regular expression 5345 */ 5346 void 5347 xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) { 5348 int i; 5349 5350 if (output == NULL) 5351 return; 5352 fprintf(output, " regexp: "); 5353 if (regexp == NULL) { 5354 fprintf(output, "NULL\n"); 5355 return; 5356 } 5357 fprintf(output, "'%s' ", regexp->string); 5358 fprintf(output, "\n"); 5359 fprintf(output, "%d atoms:\n", regexp->nbAtoms); 5360 for (i = 0;i < regexp->nbAtoms; i++) { 5361 fprintf(output, " %02d ", i); 5362 xmlRegPrintAtom(output, regexp->atoms[i]); 5363 } 5364 fprintf(output, "%d states:", regexp->nbStates); 5365 fprintf(output, "\n"); 5366 for (i = 0;i < regexp->nbStates; i++) { 5367 xmlRegPrintState(output, regexp->states[i]); 5368 } 5369 fprintf(output, "%d counters:\n", regexp->nbCounters); 5370 for (i = 0;i < regexp->nbCounters; i++) { 5371 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min, 5372 regexp->counters[i].max); 5373 } 5374 } 5375 5376 /** 5377 * xmlRegexpCompile: 5378 * @regexp: a regular expression string 5379 * 5380 * Parses a regular expression conforming to XML Schemas Part 2 Datatype 5381 * Appendix F and builds an automata suitable for testing strings against 5382 * that regular expression 5383 * 5384 * Returns the compiled expression or NULL in case of error 5385 */ 5386 xmlRegexpPtr 5387 xmlRegexpCompile(const xmlChar *regexp) { 5388 xmlRegexpPtr ret; 5389 xmlRegParserCtxtPtr ctxt; 5390 5391 ctxt = xmlRegNewParserCtxt(regexp); 5392 if (ctxt == NULL) 5393 return(NULL); 5394 5395 /* initialize the parser */ 5396 ctxt->end = NULL; 5397 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5398 xmlRegStatePush(ctxt, ctxt->start); 5399 5400 /* parse the expression building an automata */ 5401 xmlFAParseRegExp(ctxt, 1); 5402 if (CUR != 0) { 5403 ERROR("xmlFAParseRegExp: extra characters"); 5404 } 5405 if (ctxt->error != 0) { 5406 xmlRegFreeParserCtxt(ctxt); 5407 return(NULL); 5408 } 5409 ctxt->end = ctxt->state; 5410 ctxt->start->type = XML_REGEXP_START_STATE; 5411 ctxt->end->type = XML_REGEXP_FINAL_STATE; 5412 5413 /* remove the Epsilon except for counted transitions */ 5414 xmlFAEliminateEpsilonTransitions(ctxt); 5415 5416 5417 if (ctxt->error != 0) { 5418 xmlRegFreeParserCtxt(ctxt); 5419 return(NULL); 5420 } 5421 ret = xmlRegEpxFromParse(ctxt); 5422 xmlRegFreeParserCtxt(ctxt); 5423 return(ret); 5424 } 5425 5426 /** 5427 * xmlRegexpExec: 5428 * @comp: the compiled regular expression 5429 * @content: the value to check against the regular expression 5430 * 5431 * Check if the regular expression generates the value 5432 * 5433 * Returns 1 if it matches, 0 if not and a negative value in case of error 5434 */ 5435 int 5436 xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) { 5437 if ((comp == NULL) || (content == NULL)) 5438 return(-1); 5439 return(xmlFARegExec(comp, content)); 5440 } 5441 5442 /** 5443 * xmlRegexpIsDeterminist: 5444 * @comp: the compiled regular expression 5445 * 5446 * Check if the regular expression is determinist 5447 * 5448 * Returns 1 if it yes, 0 if not and a negative value in case of error 5449 */ 5450 int 5451 xmlRegexpIsDeterminist(xmlRegexpPtr comp) { 5452 xmlAutomataPtr am; 5453 int ret; 5454 5455 if (comp == NULL) 5456 return(-1); 5457 if (comp->determinist != -1) 5458 return(comp->determinist); 5459 5460 am = xmlNewAutomata(); 5461 if (am->states != NULL) { 5462 int i; 5463 5464 for (i = 0;i < am->nbStates;i++) 5465 xmlRegFreeState(am->states[i]); 5466 xmlFree(am->states); 5467 } 5468 am->nbAtoms = comp->nbAtoms; 5469 am->atoms = comp->atoms; 5470 am->nbStates = comp->nbStates; 5471 am->states = comp->states; 5472 am->determinist = -1; 5473 ret = xmlFAComputesDeterminism(am); 5474 am->atoms = NULL; 5475 am->states = NULL; 5476 xmlFreeAutomata(am); 5477 return(ret); 5478 } 5479 5480 /** 5481 * xmlRegFreeRegexp: 5482 * @regexp: the regexp 5483 * 5484 * Free a regexp 5485 */ 5486 void 5487 xmlRegFreeRegexp(xmlRegexpPtr regexp) { 5488 int i; 5489 if (regexp == NULL) 5490 return; 5491 5492 if (regexp->string != NULL) 5493 xmlFree(regexp->string); 5494 if (regexp->states != NULL) { 5495 for (i = 0;i < regexp->nbStates;i++) 5496 xmlRegFreeState(regexp->states[i]); 5497 xmlFree(regexp->states); 5498 } 5499 if (regexp->atoms != NULL) { 5500 for (i = 0;i < regexp->nbAtoms;i++) 5501 xmlRegFreeAtom(regexp->atoms[i]); 5502 xmlFree(regexp->atoms); 5503 } 5504 if (regexp->counters != NULL) 5505 xmlFree(regexp->counters); 5506 if (regexp->compact != NULL) 5507 xmlFree(regexp->compact); 5508 if (regexp->transdata != NULL) 5509 xmlFree(regexp->transdata); 5510 if (regexp->stringMap != NULL) { 5511 for (i = 0; i < regexp->nbstrings;i++) 5512 xmlFree(regexp->stringMap[i]); 5513 xmlFree(regexp->stringMap); 5514 } 5515 5516 xmlFree(regexp); 5517 } 5518 5519 #ifdef LIBXML_AUTOMATA_ENABLED 5520 /************************************************************************ 5521 * * 5522 * The Automata interface * 5523 * * 5524 ************************************************************************/ 5525 5526 /** 5527 * xmlNewAutomata: 5528 * 5529 * Create a new automata 5530 * 5531 * Returns the new object or NULL in case of failure 5532 */ 5533 xmlAutomataPtr 5534 xmlNewAutomata(void) { 5535 xmlAutomataPtr ctxt; 5536 5537 ctxt = xmlRegNewParserCtxt(NULL); 5538 if (ctxt == NULL) 5539 return(NULL); 5540 5541 /* initialize the parser */ 5542 ctxt->end = NULL; 5543 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5544 if (ctxt->start == NULL) { 5545 xmlFreeAutomata(ctxt); 5546 return(NULL); 5547 } 5548 ctxt->start->type = XML_REGEXP_START_STATE; 5549 if (xmlRegStatePush(ctxt, ctxt->start) < 0) { 5550 xmlRegFreeState(ctxt->start); 5551 xmlFreeAutomata(ctxt); 5552 return(NULL); 5553 } 5554 5555 return(ctxt); 5556 } 5557 5558 /** 5559 * xmlFreeAutomata: 5560 * @am: an automata 5561 * 5562 * Free an automata 5563 */ 5564 void 5565 xmlFreeAutomata(xmlAutomataPtr am) { 5566 if (am == NULL) 5567 return; 5568 xmlRegFreeParserCtxt(am); 5569 } 5570 5571 /** 5572 * xmlAutomataGetInitState: 5573 * @am: an automata 5574 * 5575 * Initial state lookup 5576 * 5577 * Returns the initial state of the automata 5578 */ 5579 xmlAutomataStatePtr 5580 xmlAutomataGetInitState(xmlAutomataPtr am) { 5581 if (am == NULL) 5582 return(NULL); 5583 return(am->start); 5584 } 5585 5586 /** 5587 * xmlAutomataSetFinalState: 5588 * @am: an automata 5589 * @state: a state in this automata 5590 * 5591 * Makes that state a final state 5592 * 5593 * Returns 0 or -1 in case of error 5594 */ 5595 int 5596 xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) { 5597 if ((am == NULL) || (state == NULL)) 5598 return(-1); 5599 state->type = XML_REGEXP_FINAL_STATE; 5600 return(0); 5601 } 5602 5603 /** 5604 * xmlAutomataNewTransition: 5605 * @am: an automata 5606 * @from: the starting point of the transition 5607 * @to: the target point of the transition or NULL 5608 * @token: the input string associated to that transition 5609 * @data: data passed to the callback function if the transition is activated 5610 * 5611 * If @to is NULL, this creates first a new target state in the automata 5612 * and then adds a transition from the @from state to the target state 5613 * activated by the value of @token 5614 * 5615 * Returns the target state or NULL in case of error 5616 */ 5617 xmlAutomataStatePtr 5618 xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from, 5619 xmlAutomataStatePtr to, const xmlChar *token, 5620 void *data) { 5621 xmlRegAtomPtr atom; 5622 5623 if ((am == NULL) || (from == NULL) || (token == NULL)) 5624 return(NULL); 5625 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5626 if (atom == NULL) 5627 return(NULL); 5628 atom->data = data; 5629 if (atom == NULL) 5630 return(NULL); 5631 atom->valuep = xmlStrdup(token); 5632 5633 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5634 xmlRegFreeAtom(atom); 5635 return(NULL); 5636 } 5637 if (to == NULL) 5638 return(am->state); 5639 return(to); 5640 } 5641 5642 /** 5643 * xmlAutomataNewTransition2: 5644 * @am: an automata 5645 * @from: the starting point of the transition 5646 * @to: the target point of the transition or NULL 5647 * @token: the first input string associated to that transition 5648 * @token2: the second input string associated to that transition 5649 * @data: data passed to the callback function if the transition is activated 5650 * 5651 * If @to is NULL, this creates first a new target state in the automata 5652 * and then adds a transition from the @from state to the target state 5653 * activated by the value of @token 5654 * 5655 * Returns the target state or NULL in case of error 5656 */ 5657 xmlAutomataStatePtr 5658 xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5659 xmlAutomataStatePtr to, const xmlChar *token, 5660 const xmlChar *token2, void *data) { 5661 xmlRegAtomPtr atom; 5662 5663 if ((am == NULL) || (from == NULL) || (token == NULL)) 5664 return(NULL); 5665 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5666 if (atom == NULL) 5667 return(NULL); 5668 atom->data = data; 5669 if ((token2 == NULL) || (*token2 == 0)) { 5670 atom->valuep = xmlStrdup(token); 5671 } else { 5672 int lenn, lenp; 5673 xmlChar *str; 5674 5675 lenn = strlen((char *) token2); 5676 lenp = strlen((char *) token); 5677 5678 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5679 if (str == NULL) { 5680 xmlRegFreeAtom(atom); 5681 return(NULL); 5682 } 5683 memcpy(&str[0], token, lenp); 5684 str[lenp] = '|'; 5685 memcpy(&str[lenp + 1], token2, lenn); 5686 str[lenn + lenp + 1] = 0; 5687 5688 atom->valuep = str; 5689 } 5690 5691 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5692 xmlRegFreeAtom(atom); 5693 return(NULL); 5694 } 5695 if (to == NULL) 5696 return(am->state); 5697 return(to); 5698 } 5699 5700 /** 5701 * xmlAutomataNewNegTrans: 5702 * @am: an automata 5703 * @from: the starting point of the transition 5704 * @to: the target point of the transition or NULL 5705 * @token: the first input string associated to that transition 5706 * @token2: the second input string associated to that transition 5707 * @data: data passed to the callback function if the transition is activated 5708 * 5709 * If @to is NULL, this creates first a new target state in the automata 5710 * and then adds a transition from the @from state to the target state 5711 * activated by any value except (@token,@token2) 5712 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow 5713 # the semantic of XSD ##other 5714 * 5715 * Returns the target state or NULL in case of error 5716 */ 5717 xmlAutomataStatePtr 5718 xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5719 xmlAutomataStatePtr to, const xmlChar *token, 5720 const xmlChar *token2, void *data) { 5721 xmlRegAtomPtr atom; 5722 xmlChar err_msg[200]; 5723 5724 if ((am == NULL) || (from == NULL) || (token == NULL)) 5725 return(NULL); 5726 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5727 if (atom == NULL) 5728 return(NULL); 5729 atom->data = data; 5730 atom->neg = 1; 5731 if ((token2 == NULL) || (*token2 == 0)) { 5732 atom->valuep = xmlStrdup(token); 5733 } else { 5734 int lenn, lenp; 5735 xmlChar *str; 5736 5737 lenn = strlen((char *) token2); 5738 lenp = strlen((char *) token); 5739 5740 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5741 if (str == NULL) { 5742 xmlRegFreeAtom(atom); 5743 return(NULL); 5744 } 5745 memcpy(&str[0], token, lenp); 5746 str[lenp] = '|'; 5747 memcpy(&str[lenp + 1], token2, lenn); 5748 str[lenn + lenp + 1] = 0; 5749 5750 atom->valuep = str; 5751 } 5752 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep); 5753 err_msg[199] = 0; 5754 atom->valuep2 = xmlStrdup(err_msg); 5755 5756 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5757 xmlRegFreeAtom(atom); 5758 return(NULL); 5759 } 5760 am->negs++; 5761 if (to == NULL) 5762 return(am->state); 5763 return(to); 5764 } 5765 5766 /** 5767 * xmlAutomataNewCountTrans2: 5768 * @am: an automata 5769 * @from: the starting point of the transition 5770 * @to: the target point of the transition or NULL 5771 * @token: the input string associated to that transition 5772 * @token2: the second input string associated to that transition 5773 * @min: the minimum successive occurences of token 5774 * @max: the maximum successive occurences of token 5775 * @data: data associated to the transition 5776 * 5777 * If @to is NULL, this creates first a new target state in the automata 5778 * and then adds a transition from the @from state to the target state 5779 * activated by a succession of input of value @token and @token2 and 5780 * whose number is between @min and @max 5781 * 5782 * Returns the target state or NULL in case of error 5783 */ 5784 xmlAutomataStatePtr 5785 xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5786 xmlAutomataStatePtr to, const xmlChar *token, 5787 const xmlChar *token2, 5788 int min, int max, void *data) { 5789 xmlRegAtomPtr atom; 5790 int counter; 5791 5792 if ((am == NULL) || (from == NULL) || (token == NULL)) 5793 return(NULL); 5794 if (min < 0) 5795 return(NULL); 5796 if ((max < min) || (max < 1)) 5797 return(NULL); 5798 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5799 if (atom == NULL) 5800 return(NULL); 5801 if ((token2 == NULL) || (*token2 == 0)) { 5802 atom->valuep = xmlStrdup(token); 5803 } else { 5804 int lenn, lenp; 5805 xmlChar *str; 5806 5807 lenn = strlen((char *) token2); 5808 lenp = strlen((char *) token); 5809 5810 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5811 if (str == NULL) { 5812 xmlRegFreeAtom(atom); 5813 return(NULL); 5814 } 5815 memcpy(&str[0], token, lenp); 5816 str[lenp] = '|'; 5817 memcpy(&str[lenp + 1], token2, lenn); 5818 str[lenn + lenp + 1] = 0; 5819 5820 atom->valuep = str; 5821 } 5822 atom->data = data; 5823 if (min == 0) 5824 atom->min = 1; 5825 else 5826 atom->min = min; 5827 atom->max = max; 5828 5829 /* 5830 * associate a counter to the transition. 5831 */ 5832 counter = xmlRegGetCounter(am); 5833 am->counters[counter].min = min; 5834 am->counters[counter].max = max; 5835 5836 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5837 if (to == NULL) { 5838 to = xmlRegNewState(am); 5839 xmlRegStatePush(am, to); 5840 } 5841 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5842 xmlRegAtomPush(am, atom); 5843 am->state = to; 5844 5845 if (to == NULL) 5846 to = am->state; 5847 if (to == NULL) 5848 return(NULL); 5849 if (min == 0) 5850 xmlFAGenerateEpsilonTransition(am, from, to); 5851 return(to); 5852 } 5853 5854 /** 5855 * xmlAutomataNewCountTrans: 5856 * @am: an automata 5857 * @from: the starting point of the transition 5858 * @to: the target point of the transition or NULL 5859 * @token: the input string associated to that transition 5860 * @min: the minimum successive occurences of token 5861 * @max: the maximum successive occurences of token 5862 * @data: data associated to the transition 5863 * 5864 * If @to is NULL, this creates first a new target state in the automata 5865 * and then adds a transition from the @from state to the target state 5866 * activated by a succession of input of value @token and whose number 5867 * is between @min and @max 5868 * 5869 * Returns the target state or NULL in case of error 5870 */ 5871 xmlAutomataStatePtr 5872 xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5873 xmlAutomataStatePtr to, const xmlChar *token, 5874 int min, int max, void *data) { 5875 xmlRegAtomPtr atom; 5876 int counter; 5877 5878 if ((am == NULL) || (from == NULL) || (token == NULL)) 5879 return(NULL); 5880 if (min < 0) 5881 return(NULL); 5882 if ((max < min) || (max < 1)) 5883 return(NULL); 5884 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5885 if (atom == NULL) 5886 return(NULL); 5887 atom->valuep = xmlStrdup(token); 5888 atom->data = data; 5889 if (min == 0) 5890 atom->min = 1; 5891 else 5892 atom->min = min; 5893 atom->max = max; 5894 5895 /* 5896 * associate a counter to the transition. 5897 */ 5898 counter = xmlRegGetCounter(am); 5899 am->counters[counter].min = min; 5900 am->counters[counter].max = max; 5901 5902 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5903 if (to == NULL) { 5904 to = xmlRegNewState(am); 5905 xmlRegStatePush(am, to); 5906 } 5907 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5908 xmlRegAtomPush(am, atom); 5909 am->state = to; 5910 5911 if (to == NULL) 5912 to = am->state; 5913 if (to == NULL) 5914 return(NULL); 5915 if (min == 0) 5916 xmlFAGenerateEpsilonTransition(am, from, to); 5917 return(to); 5918 } 5919 5920 /** 5921 * xmlAutomataNewOnceTrans2: 5922 * @am: an automata 5923 * @from: the starting point of the transition 5924 * @to: the target point of the transition or NULL 5925 * @token: the input string associated to that transition 5926 * @token2: the second input string associated to that transition 5927 * @min: the minimum successive occurences of token 5928 * @max: the maximum successive occurences of token 5929 * @data: data associated to the transition 5930 * 5931 * If @to is NULL, this creates first a new target state in the automata 5932 * and then adds a transition from the @from state to the target state 5933 * activated by a succession of input of value @token and @token2 and whose 5934 * number is between @min and @max, moreover that transition can only be 5935 * crossed once. 5936 * 5937 * Returns the target state or NULL in case of error 5938 */ 5939 xmlAutomataStatePtr 5940 xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5941 xmlAutomataStatePtr to, const xmlChar *token, 5942 const xmlChar *token2, 5943 int min, int max, void *data) { 5944 xmlRegAtomPtr atom; 5945 int counter; 5946 5947 if ((am == NULL) || (from == NULL) || (token == NULL)) 5948 return(NULL); 5949 if (min < 1) 5950 return(NULL); 5951 if ((max < min) || (max < 1)) 5952 return(NULL); 5953 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5954 if (atom == NULL) 5955 return(NULL); 5956 if ((token2 == NULL) || (*token2 == 0)) { 5957 atom->valuep = xmlStrdup(token); 5958 } else { 5959 int lenn, lenp; 5960 xmlChar *str; 5961 5962 lenn = strlen((char *) token2); 5963 lenp = strlen((char *) token); 5964 5965 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5966 if (str == NULL) { 5967 xmlRegFreeAtom(atom); 5968 return(NULL); 5969 } 5970 memcpy(&str[0], token, lenp); 5971 str[lenp] = '|'; 5972 memcpy(&str[lenp + 1], token2, lenn); 5973 str[lenn + lenp + 1] = 0; 5974 5975 atom->valuep = str; 5976 } 5977 atom->data = data; 5978 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 5979 atom->min = min; 5980 atom->max = max; 5981 /* 5982 * associate a counter to the transition. 5983 */ 5984 counter = xmlRegGetCounter(am); 5985 am->counters[counter].min = 1; 5986 am->counters[counter].max = 1; 5987 5988 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5989 if (to == NULL) { 5990 to = xmlRegNewState(am); 5991 xmlRegStatePush(am, to); 5992 } 5993 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5994 xmlRegAtomPush(am, atom); 5995 am->state = to; 5996 return(to); 5997 } 5998 5999 6000 6001 /** 6002 * xmlAutomataNewOnceTrans: 6003 * @am: an automata 6004 * @from: the starting point of the transition 6005 * @to: the target point of the transition or NULL 6006 * @token: the input string associated to that transition 6007 * @min: the minimum successive occurences of token 6008 * @max: the maximum successive occurences of token 6009 * @data: data associated to the transition 6010 * 6011 * If @to is NULL, this creates first a new target state in the automata 6012 * and then adds a transition from the @from state to the target state 6013 * activated by a succession of input of value @token and whose number 6014 * is between @min and @max, moreover that transition can only be crossed 6015 * once. 6016 * 6017 * Returns the target state or NULL in case of error 6018 */ 6019 xmlAutomataStatePtr 6020 xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6021 xmlAutomataStatePtr to, const xmlChar *token, 6022 int min, int max, void *data) { 6023 xmlRegAtomPtr atom; 6024 int counter; 6025 6026 if ((am == NULL) || (from == NULL) || (token == NULL)) 6027 return(NULL); 6028 if (min < 1) 6029 return(NULL); 6030 if ((max < min) || (max < 1)) 6031 return(NULL); 6032 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 6033 if (atom == NULL) 6034 return(NULL); 6035 atom->valuep = xmlStrdup(token); 6036 atom->data = data; 6037 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 6038 atom->min = min; 6039 atom->max = max; 6040 /* 6041 * associate a counter to the transition. 6042 */ 6043 counter = xmlRegGetCounter(am); 6044 am->counters[counter].min = 1; 6045 am->counters[counter].max = 1; 6046 6047 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6048 if (to == NULL) { 6049 to = xmlRegNewState(am); 6050 xmlRegStatePush(am, to); 6051 } 6052 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6053 xmlRegAtomPush(am, atom); 6054 am->state = to; 6055 return(to); 6056 } 6057 6058 /** 6059 * xmlAutomataNewState: 6060 * @am: an automata 6061 * 6062 * Create a new disconnected state in the automata 6063 * 6064 * Returns the new state or NULL in case of error 6065 */ 6066 xmlAutomataStatePtr 6067 xmlAutomataNewState(xmlAutomataPtr am) { 6068 xmlAutomataStatePtr to; 6069 6070 if (am == NULL) 6071 return(NULL); 6072 to = xmlRegNewState(am); 6073 xmlRegStatePush(am, to); 6074 return(to); 6075 } 6076 6077 /** 6078 * xmlAutomataNewEpsilon: 6079 * @am: an automata 6080 * @from: the starting point of the transition 6081 * @to: the target point of the transition or NULL 6082 * 6083 * If @to is NULL, this creates first a new target state in the automata 6084 * and then adds an epsilon transition from the @from state to the 6085 * target state 6086 * 6087 * Returns the target state or NULL in case of error 6088 */ 6089 xmlAutomataStatePtr 6090 xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from, 6091 xmlAutomataStatePtr to) { 6092 if ((am == NULL) || (from == NULL)) 6093 return(NULL); 6094 xmlFAGenerateEpsilonTransition(am, from, to); 6095 if (to == NULL) 6096 return(am->state); 6097 return(to); 6098 } 6099 6100 /** 6101 * xmlAutomataNewAllTrans: 6102 * @am: an automata 6103 * @from: the starting point of the transition 6104 * @to: the target point of the transition or NULL 6105 * @lax: allow to transition if not all all transitions have been activated 6106 * 6107 * If @to is NULL, this creates first a new target state in the automata 6108 * and then adds a an ALL transition from the @from state to the 6109 * target state. That transition is an epsilon transition allowed only when 6110 * all transitions from the @from node have been activated. 6111 * 6112 * Returns the target state or NULL in case of error 6113 */ 6114 xmlAutomataStatePtr 6115 xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6116 xmlAutomataStatePtr to, int lax) { 6117 if ((am == NULL) || (from == NULL)) 6118 return(NULL); 6119 xmlFAGenerateAllTransition(am, from, to, lax); 6120 if (to == NULL) 6121 return(am->state); 6122 return(to); 6123 } 6124 6125 /** 6126 * xmlAutomataNewCounter: 6127 * @am: an automata 6128 * @min: the minimal value on the counter 6129 * @max: the maximal value on the counter 6130 * 6131 * Create a new counter 6132 * 6133 * Returns the counter number or -1 in case of error 6134 */ 6135 int 6136 xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) { 6137 int ret; 6138 6139 if (am == NULL) 6140 return(-1); 6141 6142 ret = xmlRegGetCounter(am); 6143 if (ret < 0) 6144 return(-1); 6145 am->counters[ret].min = min; 6146 am->counters[ret].max = max; 6147 return(ret); 6148 } 6149 6150 /** 6151 * xmlAutomataNewCountedTrans: 6152 * @am: an automata 6153 * @from: the starting point of the transition 6154 * @to: the target point of the transition or NULL 6155 * @counter: the counter associated to that transition 6156 * 6157 * If @to is NULL, this creates first a new target state in the automata 6158 * and then adds an epsilon transition from the @from state to the target state 6159 * which will increment the counter provided 6160 * 6161 * Returns the target state or NULL in case of error 6162 */ 6163 xmlAutomataStatePtr 6164 xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6165 xmlAutomataStatePtr to, int counter) { 6166 if ((am == NULL) || (from == NULL) || (counter < 0)) 6167 return(NULL); 6168 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter); 6169 if (to == NULL) 6170 return(am->state); 6171 return(to); 6172 } 6173 6174 /** 6175 * xmlAutomataNewCounterTrans: 6176 * @am: an automata 6177 * @from: the starting point of the transition 6178 * @to: the target point of the transition or NULL 6179 * @counter: the counter associated to that transition 6180 * 6181 * If @to is NULL, this creates first a new target state in the automata 6182 * and then adds an epsilon transition from the @from state to the target state 6183 * which will be allowed only if the counter is within the right range. 6184 * 6185 * Returns the target state or NULL in case of error 6186 */ 6187 xmlAutomataStatePtr 6188 xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6189 xmlAutomataStatePtr to, int counter) { 6190 if ((am == NULL) || (from == NULL) || (counter < 0)) 6191 return(NULL); 6192 xmlFAGenerateCountedTransition(am, from, to, counter); 6193 if (to == NULL) 6194 return(am->state); 6195 return(to); 6196 } 6197 6198 /** 6199 * xmlAutomataCompile: 6200 * @am: an automata 6201 * 6202 * Compile the automata into a Reg Exp ready for being executed. 6203 * The automata should be free after this point. 6204 * 6205 * Returns the compiled regexp or NULL in case of error 6206 */ 6207 xmlRegexpPtr 6208 xmlAutomataCompile(xmlAutomataPtr am) { 6209 xmlRegexpPtr ret; 6210 6211 if ((am == NULL) || (am->error != 0)) return(NULL); 6212 xmlFAEliminateEpsilonTransitions(am); 6213 /* xmlFAComputesDeterminism(am); */ 6214 ret = xmlRegEpxFromParse(am); 6215 6216 return(ret); 6217 } 6218 6219 /** 6220 * xmlAutomataIsDeterminist: 6221 * @am: an automata 6222 * 6223 * Checks if an automata is determinist. 6224 * 6225 * Returns 1 if true, 0 if not, and -1 in case of error 6226 */ 6227 int 6228 xmlAutomataIsDeterminist(xmlAutomataPtr am) { 6229 int ret; 6230 6231 if (am == NULL) 6232 return(-1); 6233 6234 ret = xmlFAComputesDeterminism(am); 6235 return(ret); 6236 } 6237 #endif /* LIBXML_AUTOMATA_ENABLED */ 6238 6239 #ifdef LIBXML_EXPR_ENABLED 6240 /************************************************************************ 6241 * * 6242 * Formal Expression handling code * 6243 * * 6244 ************************************************************************/ 6245 /************************************************************************ 6246 * * 6247 * Expression handling context * 6248 * * 6249 ************************************************************************/ 6250 6251 struct _xmlExpCtxt { 6252 xmlDictPtr dict; 6253 xmlExpNodePtr *table; 6254 int size; 6255 int nbElems; 6256 int nb_nodes; 6257 const char *expr; 6258 const char *cur; 6259 int nb_cons; 6260 int tabSize; 6261 }; 6262 6263 /** 6264 * xmlExpNewCtxt: 6265 * @maxNodes: the maximum number of nodes 6266 * @dict: optional dictionnary to use internally 6267 * 6268 * Creates a new context for manipulating expressions 6269 * 6270 * Returns the context or NULL in case of error 6271 */ 6272 xmlExpCtxtPtr 6273 xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) { 6274 xmlExpCtxtPtr ret; 6275 int size = 256; 6276 6277 if (maxNodes <= 4096) 6278 maxNodes = 4096; 6279 6280 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt)); 6281 if (ret == NULL) 6282 return(NULL); 6283 memset(ret, 0, sizeof(xmlExpCtxt)); 6284 ret->size = size; 6285 ret->nbElems = 0; 6286 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr)); 6287 if (ret->table == NULL) { 6288 xmlFree(ret); 6289 return(NULL); 6290 } 6291 memset(ret->table, 0, size * sizeof(xmlExpNodePtr)); 6292 if (dict == NULL) { 6293 ret->dict = xmlDictCreate(); 6294 if (ret->dict == NULL) { 6295 xmlFree(ret->table); 6296 xmlFree(ret); 6297 return(NULL); 6298 } 6299 } else { 6300 ret->dict = dict; 6301 xmlDictReference(ret->dict); 6302 } 6303 return(ret); 6304 } 6305 6306 /** 6307 * xmlExpFreeCtxt: 6308 * @ctxt: an expression context 6309 * 6310 * Free an expression context 6311 */ 6312 void 6313 xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) { 6314 if (ctxt == NULL) 6315 return; 6316 xmlDictFree(ctxt->dict); 6317 if (ctxt->table != NULL) 6318 xmlFree(ctxt->table); 6319 xmlFree(ctxt); 6320 } 6321 6322 /************************************************************************ 6323 * * 6324 * Structure associated to an expression node * 6325 * * 6326 ************************************************************************/ 6327 #define MAX_NODES 10000 6328 6329 /* #define DEBUG_DERIV */ 6330 6331 /* 6332 * TODO: 6333 * - Wildcards 6334 * - public API for creation 6335 * 6336 * Started 6337 * - regression testing 6338 * 6339 * Done 6340 * - split into module and test tool 6341 * - memleaks 6342 */ 6343 6344 typedef enum { 6345 XML_EXP_NILABLE = (1 << 0) 6346 } xmlExpNodeInfo; 6347 6348 #define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE) 6349 6350 struct _xmlExpNode { 6351 unsigned char type;/* xmlExpNodeType */ 6352 unsigned char info;/* OR of xmlExpNodeInfo */ 6353 unsigned short key; /* the hash key */ 6354 unsigned int ref; /* The number of references */ 6355 int c_max; /* the maximum length it can consume */ 6356 xmlExpNodePtr exp_left; 6357 xmlExpNodePtr next;/* the next node in the hash table or free list */ 6358 union { 6359 struct { 6360 int f_min; 6361 int f_max; 6362 } count; 6363 struct { 6364 xmlExpNodePtr f_right; 6365 } children; 6366 const xmlChar *f_str; 6367 } field; 6368 }; 6369 6370 #define exp_min field.count.f_min 6371 #define exp_max field.count.f_max 6372 /* #define exp_left field.children.f_left */ 6373 #define exp_right field.children.f_right 6374 #define exp_str field.f_str 6375 6376 static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type); 6377 static xmlExpNode forbiddenExpNode = { 6378 XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6379 }; 6380 xmlExpNodePtr forbiddenExp = &forbiddenExpNode; 6381 static xmlExpNode emptyExpNode = { 6382 XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6383 }; 6384 xmlExpNodePtr emptyExp = &emptyExpNode; 6385 6386 /************************************************************************ 6387 * * 6388 * The custom hash table for unicity and canonicalization * 6389 * of sub-expressions pointers * 6390 * * 6391 ************************************************************************/ 6392 /* 6393 * xmlExpHashNameComputeKey: 6394 * Calculate the hash key for a token 6395 */ 6396 static unsigned short 6397 xmlExpHashNameComputeKey(const xmlChar *name) { 6398 unsigned short value = 0L; 6399 char ch; 6400 6401 if (name != NULL) { 6402 value += 30 * (*name); 6403 while ((ch = *name++) != 0) { 6404 value = value ^ ((value << 5) + (value >> 3) + (unsigned short)ch); 6405 } 6406 } 6407 return (value); 6408 } 6409 6410 /* 6411 * xmlExpHashComputeKey: 6412 * Calculate the hash key for a compound expression 6413 */ 6414 static unsigned short 6415 xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left, 6416 xmlExpNodePtr right) { 6417 unsigned long value; 6418 unsigned short ret; 6419 6420 switch (type) { 6421 case XML_EXP_SEQ: 6422 value = left->key; 6423 value += right->key; 6424 value *= 3; 6425 ret = (unsigned short) value; 6426 break; 6427 case XML_EXP_OR: 6428 value = left->key; 6429 value += right->key; 6430 value *= 7; 6431 ret = (unsigned short) value; 6432 break; 6433 case XML_EXP_COUNT: 6434 value = left->key; 6435 value += right->key; 6436 ret = (unsigned short) value; 6437 break; 6438 default: 6439 ret = 0; 6440 } 6441 return(ret); 6442 } 6443 6444 6445 static xmlExpNodePtr 6446 xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) { 6447 xmlExpNodePtr ret; 6448 6449 if (ctxt->nb_nodes >= MAX_NODES) 6450 return(NULL); 6451 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode)); 6452 if (ret == NULL) 6453 return(NULL); 6454 memset(ret, 0, sizeof(xmlExpNode)); 6455 ret->type = type; 6456 ret->next = NULL; 6457 ctxt->nb_nodes++; 6458 ctxt->nb_cons++; 6459 return(ret); 6460 } 6461 6462 /** 6463 * xmlExpHashGetEntry: 6464 * @table: the hash table 6465 * 6466 * Get the unique entry from the hash table. The entry is created if 6467 * needed. @left and @right are consumed, i.e. their ref count will 6468 * be decremented by the operation. 6469 * 6470 * Returns the pointer or NULL in case of error 6471 */ 6472 static xmlExpNodePtr 6473 xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type, 6474 xmlExpNodePtr left, xmlExpNodePtr right, 6475 const xmlChar *name, int min, int max) { 6476 unsigned short kbase, key; 6477 xmlExpNodePtr entry; 6478 xmlExpNodePtr insert; 6479 6480 if (ctxt == NULL) 6481 return(NULL); 6482 6483 /* 6484 * Check for duplicate and insertion location. 6485 */ 6486 if (type == XML_EXP_ATOM) { 6487 kbase = xmlExpHashNameComputeKey(name); 6488 } else if (type == XML_EXP_COUNT) { 6489 /* COUNT reduction rule 1 */ 6490 /* a{1} -> a */ 6491 if (min == max) { 6492 if (min == 1) { 6493 return(left); 6494 } 6495 if (min == 0) { 6496 xmlExpFree(ctxt, left); 6497 return(emptyExp); 6498 } 6499 } 6500 if (min < 0) { 6501 xmlExpFree(ctxt, left); 6502 return(forbiddenExp); 6503 } 6504 if (max == -1) 6505 kbase = min + 79; 6506 else 6507 kbase = max - min; 6508 kbase += left->key; 6509 } else if (type == XML_EXP_OR) { 6510 /* Forbid reduction rules */ 6511 if (left->type == XML_EXP_FORBID) { 6512 xmlExpFree(ctxt, left); 6513 return(right); 6514 } 6515 if (right->type == XML_EXP_FORBID) { 6516 xmlExpFree(ctxt, right); 6517 return(left); 6518 } 6519 6520 /* OR reduction rule 1 */ 6521 /* a | a reduced to a */ 6522 if (left == right) { 6523 left->ref--; 6524 return(left); 6525 } 6526 /* OR canonicalization rule 1 */ 6527 /* linearize (a | b) | c into a | (b | c) */ 6528 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) { 6529 xmlExpNodePtr tmp = left; 6530 left = right; 6531 right = tmp; 6532 } 6533 /* OR reduction rule 2 */ 6534 /* a | (a | b) and b | (a | b) are reduced to a | b */ 6535 if (right->type == XML_EXP_OR) { 6536 if ((left == right->exp_left) || 6537 (left == right->exp_right)) { 6538 xmlExpFree(ctxt, left); 6539 return(right); 6540 } 6541 } 6542 /* OR canonicalization rule 2 */ 6543 /* linearize (a | b) | c into a | (b | c) */ 6544 if (left->type == XML_EXP_OR) { 6545 xmlExpNodePtr tmp; 6546 6547 /* OR canonicalization rule 2 */ 6548 if ((left->exp_right->type != XML_EXP_OR) && 6549 (left->exp_right->key < left->exp_left->key)) { 6550 tmp = left->exp_right; 6551 left->exp_right = left->exp_left; 6552 left->exp_left = tmp; 6553 } 6554 left->exp_right->ref++; 6555 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right, 6556 NULL, 0, 0); 6557 left->exp_left->ref++; 6558 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp, 6559 NULL, 0, 0); 6560 6561 xmlExpFree(ctxt, left); 6562 return(tmp); 6563 } 6564 if (right->type == XML_EXP_OR) { 6565 /* Ordering in the tree */ 6566 /* C | (A | B) -> A | (B | C) */ 6567 if (left->key > right->exp_right->key) { 6568 xmlExpNodePtr tmp; 6569 right->exp_right->ref++; 6570 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right, 6571 left, NULL, 0, 0); 6572 right->exp_left->ref++; 6573 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6574 tmp, NULL, 0, 0); 6575 xmlExpFree(ctxt, right); 6576 return(tmp); 6577 } 6578 /* Ordering in the tree */ 6579 /* B | (A | C) -> A | (B | C) */ 6580 if (left->key > right->exp_left->key) { 6581 xmlExpNodePtr tmp; 6582 right->exp_right->ref++; 6583 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, 6584 right->exp_right, NULL, 0, 0); 6585 right->exp_left->ref++; 6586 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6587 tmp, NULL, 0, 0); 6588 xmlExpFree(ctxt, right); 6589 return(tmp); 6590 } 6591 } 6592 /* we know both types are != XML_EXP_OR here */ 6593 else if (left->key > right->key) { 6594 xmlExpNodePtr tmp = left; 6595 left = right; 6596 right = tmp; 6597 } 6598 kbase = xmlExpHashComputeKey(type, left, right); 6599 } else if (type == XML_EXP_SEQ) { 6600 /* Forbid reduction rules */ 6601 if (left->type == XML_EXP_FORBID) { 6602 xmlExpFree(ctxt, right); 6603 return(left); 6604 } 6605 if (right->type == XML_EXP_FORBID) { 6606 xmlExpFree(ctxt, left); 6607 return(right); 6608 } 6609 /* Empty reduction rules */ 6610 if (right->type == XML_EXP_EMPTY) { 6611 return(left); 6612 } 6613 if (left->type == XML_EXP_EMPTY) { 6614 return(right); 6615 } 6616 kbase = xmlExpHashComputeKey(type, left, right); 6617 } else 6618 return(NULL); 6619 6620 key = kbase % ctxt->size; 6621 if (ctxt->table[key] != NULL) { 6622 for (insert = ctxt->table[key]; insert != NULL; 6623 insert = insert->next) { 6624 if ((insert->key == kbase) && 6625 (insert->type == type)) { 6626 if (type == XML_EXP_ATOM) { 6627 if (name == insert->exp_str) { 6628 insert->ref++; 6629 return(insert); 6630 } 6631 } else if (type == XML_EXP_COUNT) { 6632 if ((insert->exp_min == min) && (insert->exp_max == max) && 6633 (insert->exp_left == left)) { 6634 insert->ref++; 6635 left->ref--; 6636 return(insert); 6637 } 6638 } else if ((insert->exp_left == left) && 6639 (insert->exp_right == right)) { 6640 insert->ref++; 6641 left->ref--; 6642 right->ref--; 6643 return(insert); 6644 } 6645 } 6646 } 6647 } 6648 6649 entry = xmlExpNewNode(ctxt, type); 6650 if (entry == NULL) 6651 return(NULL); 6652 entry->key = kbase; 6653 if (type == XML_EXP_ATOM) { 6654 entry->exp_str = name; 6655 entry->c_max = 1; 6656 } else if (type == XML_EXP_COUNT) { 6657 entry->exp_min = min; 6658 entry->exp_max = max; 6659 entry->exp_left = left; 6660 if ((min == 0) || (IS_NILLABLE(left))) 6661 entry->info |= XML_EXP_NILABLE; 6662 if (max < 0) 6663 entry->c_max = -1; 6664 else 6665 entry->c_max = max * entry->exp_left->c_max; 6666 } else { 6667 entry->exp_left = left; 6668 entry->exp_right = right; 6669 if (type == XML_EXP_OR) { 6670 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right))) 6671 entry->info |= XML_EXP_NILABLE; 6672 if ((entry->exp_left->c_max == -1) || 6673 (entry->exp_right->c_max == -1)) 6674 entry->c_max = -1; 6675 else if (entry->exp_left->c_max > entry->exp_right->c_max) 6676 entry->c_max = entry->exp_left->c_max; 6677 else 6678 entry->c_max = entry->exp_right->c_max; 6679 } else { 6680 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right))) 6681 entry->info |= XML_EXP_NILABLE; 6682 if ((entry->exp_left->c_max == -1) || 6683 (entry->exp_right->c_max == -1)) 6684 entry->c_max = -1; 6685 else 6686 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max; 6687 } 6688 } 6689 entry->ref = 1; 6690 if (ctxt->table[key] != NULL) 6691 entry->next = ctxt->table[key]; 6692 6693 ctxt->table[key] = entry; 6694 ctxt->nbElems++; 6695 6696 return(entry); 6697 } 6698 6699 /** 6700 * xmlExpFree: 6701 * @ctxt: the expression context 6702 * @exp: the expression 6703 * 6704 * Dereference the expression 6705 */ 6706 void 6707 xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) { 6708 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp)) 6709 return; 6710 exp->ref--; 6711 if (exp->ref == 0) { 6712 unsigned short key; 6713 6714 /* Unlink it first from the hash table */ 6715 key = exp->key % ctxt->size; 6716 if (ctxt->table[key] == exp) { 6717 ctxt->table[key] = exp->next; 6718 } else { 6719 xmlExpNodePtr tmp; 6720 6721 tmp = ctxt->table[key]; 6722 while (tmp != NULL) { 6723 if (tmp->next == exp) { 6724 tmp->next = exp->next; 6725 break; 6726 } 6727 tmp = tmp->next; 6728 } 6729 } 6730 6731 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) { 6732 xmlExpFree(ctxt, exp->exp_left); 6733 xmlExpFree(ctxt, exp->exp_right); 6734 } else if (exp->type == XML_EXP_COUNT) { 6735 xmlExpFree(ctxt, exp->exp_left); 6736 } 6737 xmlFree(exp); 6738 ctxt->nb_nodes--; 6739 } 6740 } 6741 6742 /** 6743 * xmlExpRef: 6744 * @exp: the expression 6745 * 6746 * Increase the reference count of the expression 6747 */ 6748 void 6749 xmlExpRef(xmlExpNodePtr exp) { 6750 if (exp != NULL) 6751 exp->ref++; 6752 } 6753 6754 /** 6755 * xmlExpNewAtom: 6756 * @ctxt: the expression context 6757 * @name: the atom name 6758 * @len: the atom name lenght in byte (or -1); 6759 * 6760 * Get the atom associated to this name from that context 6761 * 6762 * Returns the node or NULL in case of error 6763 */ 6764 xmlExpNodePtr 6765 xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) { 6766 if ((ctxt == NULL) || (name == NULL)) 6767 return(NULL); 6768 name = xmlDictLookup(ctxt->dict, name, len); 6769 if (name == NULL) 6770 return(NULL); 6771 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0)); 6772 } 6773 6774 /** 6775 * xmlExpNewOr: 6776 * @ctxt: the expression context 6777 * @left: left expression 6778 * @right: right expression 6779 * 6780 * Get the atom associated to the choice @left | @right 6781 * Note that @left and @right are consumed in the operation, to keep 6782 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6783 * this is true even in case of failure (unless ctxt == NULL). 6784 * 6785 * Returns the node or NULL in case of error 6786 */ 6787 xmlExpNodePtr 6788 xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6789 if (ctxt == NULL) 6790 return(NULL); 6791 if ((left == NULL) || (right == NULL)) { 6792 xmlExpFree(ctxt, left); 6793 xmlExpFree(ctxt, right); 6794 return(NULL); 6795 } 6796 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0)); 6797 } 6798 6799 /** 6800 * xmlExpNewSeq: 6801 * @ctxt: the expression context 6802 * @left: left expression 6803 * @right: right expression 6804 * 6805 * Get the atom associated to the sequence @left , @right 6806 * Note that @left and @right are consumed in the operation, to keep 6807 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6808 * this is true even in case of failure (unless ctxt == NULL). 6809 * 6810 * Returns the node or NULL in case of error 6811 */ 6812 xmlExpNodePtr 6813 xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6814 if (ctxt == NULL) 6815 return(NULL); 6816 if ((left == NULL) || (right == NULL)) { 6817 xmlExpFree(ctxt, left); 6818 xmlExpFree(ctxt, right); 6819 return(NULL); 6820 } 6821 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0)); 6822 } 6823 6824 /** 6825 * xmlExpNewRange: 6826 * @ctxt: the expression context 6827 * @subset: the expression to be repeated 6828 * @min: the lower bound for the repetition 6829 * @max: the upper bound for the repetition, -1 means infinite 6830 * 6831 * Get the atom associated to the range (@subset){@min, @max} 6832 * Note that @subset is consumed in the operation, to keep 6833 * an handle on it use xmlExpRef() and use xmlExpFree() to release it, 6834 * this is true even in case of failure (unless ctxt == NULL). 6835 * 6836 * Returns the node or NULL in case of error 6837 */ 6838 xmlExpNodePtr 6839 xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) { 6840 if (ctxt == NULL) 6841 return(NULL); 6842 if ((subset == NULL) || (min < 0) || (max < -1) || 6843 ((max >= 0) && (min > max))) { 6844 xmlExpFree(ctxt, subset); 6845 return(NULL); 6846 } 6847 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset, 6848 NULL, NULL, min, max)); 6849 } 6850 6851 /************************************************************************ 6852 * * 6853 * Public API for operations on expressions * 6854 * * 6855 ************************************************************************/ 6856 6857 static int 6858 xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6859 const xmlChar**list, int len, int nb) { 6860 int tmp, tmp2; 6861 tail: 6862 switch (exp->type) { 6863 case XML_EXP_EMPTY: 6864 return(0); 6865 case XML_EXP_ATOM: 6866 for (tmp = 0;tmp < nb;tmp++) 6867 if (list[tmp] == exp->exp_str) 6868 return(0); 6869 if (nb >= len) 6870 return(-2); 6871 list[nb++] = exp->exp_str; 6872 return(1); 6873 case XML_EXP_COUNT: 6874 exp = exp->exp_left; 6875 goto tail; 6876 case XML_EXP_SEQ: 6877 case XML_EXP_OR: 6878 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb); 6879 if (tmp < 0) 6880 return(tmp); 6881 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len, 6882 nb + tmp); 6883 if (tmp2 < 0) 6884 return(tmp2); 6885 return(tmp + tmp2); 6886 } 6887 return(-1); 6888 } 6889 6890 /** 6891 * xmlExpGetLanguage: 6892 * @ctxt: the expression context 6893 * @exp: the expression 6894 * @langList: where to store the tokens 6895 * @len: the allocated lenght of @list 6896 * 6897 * Find all the strings used in @exp and store them in @list 6898 * 6899 * Returns the number of unique strings found, -1 in case of errors and 6900 * -2 if there is more than @len strings 6901 */ 6902 int 6903 xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6904 const xmlChar**langList, int len) { 6905 if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0)) 6906 return(-1); 6907 return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0)); 6908 } 6909 6910 static int 6911 xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6912 const xmlChar**list, int len, int nb) { 6913 int tmp, tmp2; 6914 tail: 6915 switch (exp->type) { 6916 case XML_EXP_FORBID: 6917 return(0); 6918 case XML_EXP_EMPTY: 6919 return(0); 6920 case XML_EXP_ATOM: 6921 for (tmp = 0;tmp < nb;tmp++) 6922 if (list[tmp] == exp->exp_str) 6923 return(0); 6924 if (nb >= len) 6925 return(-2); 6926 list[nb++] = exp->exp_str; 6927 return(1); 6928 case XML_EXP_COUNT: 6929 exp = exp->exp_left; 6930 goto tail; 6931 case XML_EXP_SEQ: 6932 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 6933 if (tmp < 0) 6934 return(tmp); 6935 if (IS_NILLABLE(exp->exp_left)) { 6936 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 6937 nb + tmp); 6938 if (tmp2 < 0) 6939 return(tmp2); 6940 tmp += tmp2; 6941 } 6942 return(tmp); 6943 case XML_EXP_OR: 6944 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 6945 if (tmp < 0) 6946 return(tmp); 6947 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 6948 nb + tmp); 6949 if (tmp2 < 0) 6950 return(tmp2); 6951 return(tmp + tmp2); 6952 } 6953 return(-1); 6954 } 6955 6956 /** 6957 * xmlExpGetStart: 6958 * @ctxt: the expression context 6959 * @exp: the expression 6960 * @tokList: where to store the tokens 6961 * @len: the allocated lenght of @list 6962 * 6963 * Find all the strings that appears at the start of the languages 6964 * accepted by @exp and store them in @list. E.g. for (a, b) | c 6965 * it will return the list [a, c] 6966 * 6967 * Returns the number of unique strings found, -1 in case of errors and 6968 * -2 if there is more than @len strings 6969 */ 6970 int 6971 xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6972 const xmlChar**tokList, int len) { 6973 if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0)) 6974 return(-1); 6975 return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0)); 6976 } 6977 6978 /** 6979 * xmlExpIsNillable: 6980 * @exp: the expression 6981 * 6982 * Finds if the expression is nillable, i.e. if it accepts the empty sequqnce 6983 * 6984 * Returns 1 if nillable, 0 if not and -1 in case of error 6985 */ 6986 int 6987 xmlExpIsNillable(xmlExpNodePtr exp) { 6988 if (exp == NULL) 6989 return(-1); 6990 return(IS_NILLABLE(exp) != 0); 6991 } 6992 6993 static xmlExpNodePtr 6994 xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str) 6995 { 6996 xmlExpNodePtr ret; 6997 6998 switch (exp->type) { 6999 case XML_EXP_EMPTY: 7000 return(forbiddenExp); 7001 case XML_EXP_FORBID: 7002 return(forbiddenExp); 7003 case XML_EXP_ATOM: 7004 if (exp->exp_str == str) { 7005 #ifdef DEBUG_DERIV 7006 printf("deriv atom: equal => Empty\n"); 7007 #endif 7008 ret = emptyExp; 7009 } else { 7010 #ifdef DEBUG_DERIV 7011 printf("deriv atom: mismatch => forbid\n"); 7012 #endif 7013 /* TODO wildcards here */ 7014 ret = forbiddenExp; 7015 } 7016 return(ret); 7017 case XML_EXP_OR: { 7018 xmlExpNodePtr tmp; 7019 7020 #ifdef DEBUG_DERIV 7021 printf("deriv or: => or(derivs)\n"); 7022 #endif 7023 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7024 if (tmp == NULL) { 7025 return(NULL); 7026 } 7027 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 7028 if (ret == NULL) { 7029 xmlExpFree(ctxt, tmp); 7030 return(NULL); 7031 } 7032 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, 7033 NULL, 0, 0); 7034 return(ret); 7035 } 7036 case XML_EXP_SEQ: 7037 #ifdef DEBUG_DERIV 7038 printf("deriv seq: starting with left\n"); 7039 #endif 7040 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7041 if (ret == NULL) { 7042 return(NULL); 7043 } else if (ret == forbiddenExp) { 7044 if (IS_NILLABLE(exp->exp_left)) { 7045 #ifdef DEBUG_DERIV 7046 printf("deriv seq: left failed but nillable\n"); 7047 #endif 7048 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 7049 } 7050 } else { 7051 #ifdef DEBUG_DERIV 7052 printf("deriv seq: left match => sequence\n"); 7053 #endif 7054 exp->exp_right->ref++; 7055 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right, 7056 NULL, 0, 0); 7057 } 7058 return(ret); 7059 case XML_EXP_COUNT: { 7060 int min, max; 7061 xmlExpNodePtr tmp; 7062 7063 if (exp->exp_max == 0) 7064 return(forbiddenExp); 7065 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7066 if (ret == NULL) 7067 return(NULL); 7068 if (ret == forbiddenExp) { 7069 #ifdef DEBUG_DERIV 7070 printf("deriv count: pattern mismatch => forbid\n"); 7071 #endif 7072 return(ret); 7073 } 7074 if (exp->exp_max == 1) 7075 return(ret); 7076 if (exp->exp_max < 0) /* unbounded */ 7077 max = -1; 7078 else 7079 max = exp->exp_max - 1; 7080 if (exp->exp_min > 0) 7081 min = exp->exp_min - 1; 7082 else 7083 min = 0; 7084 exp->exp_left->ref++; 7085 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL, 7086 NULL, min, max); 7087 if (ret == emptyExp) { 7088 #ifdef DEBUG_DERIV 7089 printf("deriv count: match to empty => new count\n"); 7090 #endif 7091 return(tmp); 7092 } 7093 #ifdef DEBUG_DERIV 7094 printf("deriv count: match => sequence with new count\n"); 7095 #endif 7096 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp, 7097 NULL, 0, 0)); 7098 } 7099 } 7100 return(NULL); 7101 } 7102 7103 /** 7104 * xmlExpStringDerive: 7105 * @ctxt: the expression context 7106 * @exp: the expression 7107 * @str: the string 7108 * @len: the string len in bytes if available 7109 * 7110 * Do one step of Brzozowski derivation of the expression @exp with 7111 * respect to the input string 7112 * 7113 * Returns the resulting expression or NULL in case of internal error 7114 */ 7115 xmlExpNodePtr 7116 xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7117 const xmlChar *str, int len) { 7118 const xmlChar *input; 7119 7120 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) { 7121 return(NULL); 7122 } 7123 /* 7124 * check the string is in the dictionnary, if yes use an interned 7125 * copy, otherwise we know it's not an acceptable input 7126 */ 7127 input = xmlDictExists(ctxt->dict, str, len); 7128 if (input == NULL) { 7129 return(forbiddenExp); 7130 } 7131 return(xmlExpStringDeriveInt(ctxt, exp, input)); 7132 } 7133 7134 static int 7135 xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) { 7136 int ret = 1; 7137 7138 if (sub->c_max == -1) { 7139 if (exp->c_max != -1) 7140 ret = 0; 7141 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) { 7142 ret = 0; 7143 } 7144 #if 0 7145 if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp))) 7146 ret = 0; 7147 #endif 7148 return(ret); 7149 } 7150 7151 static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7152 xmlExpNodePtr sub); 7153 /** 7154 * xmlExpDivide: 7155 * @ctxt: the expressions context 7156 * @exp: the englobing expression 7157 * @sub: the subexpression 7158 * @mult: the multiple expression 7159 * @remain: the remain from the derivation of the multiple 7160 * 7161 * Check if exp is a multiple of sub, i.e. if there is a finite number n 7162 * so that sub{n} subsume exp 7163 * 7164 * Returns the multiple value if successful, 0 if it is not a multiple 7165 * and -1 in case of internel error. 7166 */ 7167 7168 static int 7169 xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub, 7170 xmlExpNodePtr *mult, xmlExpNodePtr *remain) { 7171 int i; 7172 xmlExpNodePtr tmp, tmp2; 7173 7174 if (mult != NULL) *mult = NULL; 7175 if (remain != NULL) *remain = NULL; 7176 if (exp->c_max == -1) return(0); 7177 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0); 7178 7179 for (i = 1;i <= exp->c_max;i++) { 7180 sub->ref++; 7181 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7182 sub, NULL, NULL, i, i); 7183 if (tmp == NULL) { 7184 return(-1); 7185 } 7186 if (!xmlExpCheckCard(tmp, exp)) { 7187 xmlExpFree(ctxt, tmp); 7188 continue; 7189 } 7190 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp); 7191 if (tmp2 == NULL) { 7192 xmlExpFree(ctxt, tmp); 7193 return(-1); 7194 } 7195 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) { 7196 if (remain != NULL) 7197 *remain = tmp2; 7198 else 7199 xmlExpFree(ctxt, tmp2); 7200 if (mult != NULL) 7201 *mult = tmp; 7202 else 7203 xmlExpFree(ctxt, tmp); 7204 #ifdef DEBUG_DERIV 7205 printf("Divide succeeded %d\n", i); 7206 #endif 7207 return(i); 7208 } 7209 xmlExpFree(ctxt, tmp); 7210 xmlExpFree(ctxt, tmp2); 7211 } 7212 #ifdef DEBUG_DERIV 7213 printf("Divide failed\n"); 7214 #endif 7215 return(0); 7216 } 7217 7218 /** 7219 * xmlExpExpDeriveInt: 7220 * @ctxt: the expressions context 7221 * @exp: the englobing expression 7222 * @sub: the subexpression 7223 * 7224 * Try to do a step of Brzozowski derivation but at a higher level 7225 * the input being a subexpression. 7226 * 7227 * Returns the resulting expression or NULL in case of internal error 7228 */ 7229 static xmlExpNodePtr 7230 xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7231 xmlExpNodePtr ret, tmp, tmp2, tmp3; 7232 const xmlChar **tab; 7233 int len, i; 7234 7235 /* 7236 * In case of equality and if the expression can only consume a finite 7237 * amount, then the derivation is empty 7238 */ 7239 if ((exp == sub) && (exp->c_max >= 0)) { 7240 #ifdef DEBUG_DERIV 7241 printf("Equal(exp, sub) and finite -> Empty\n"); 7242 #endif 7243 return(emptyExp); 7244 } 7245 /* 7246 * decompose sub sequence first 7247 */ 7248 if (sub->type == XML_EXP_EMPTY) { 7249 #ifdef DEBUG_DERIV 7250 printf("Empty(sub) -> Empty\n"); 7251 #endif 7252 exp->ref++; 7253 return(exp); 7254 } 7255 if (sub->type == XML_EXP_SEQ) { 7256 #ifdef DEBUG_DERIV 7257 printf("Seq(sub) -> decompose\n"); 7258 #endif 7259 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7260 if (tmp == NULL) 7261 return(NULL); 7262 if (tmp == forbiddenExp) 7263 return(tmp); 7264 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right); 7265 xmlExpFree(ctxt, tmp); 7266 return(ret); 7267 } 7268 if (sub->type == XML_EXP_OR) { 7269 #ifdef DEBUG_DERIV 7270 printf("Or(sub) -> decompose\n"); 7271 #endif 7272 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7273 if (tmp == forbiddenExp) 7274 return(tmp); 7275 if (tmp == NULL) 7276 return(NULL); 7277 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right); 7278 if ((ret == NULL) || (ret == forbiddenExp)) { 7279 xmlExpFree(ctxt, tmp); 7280 return(ret); 7281 } 7282 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0)); 7283 } 7284 if (!xmlExpCheckCard(exp, sub)) { 7285 #ifdef DEBUG_DERIV 7286 printf("CheckCard(exp, sub) failed -> Forbid\n"); 7287 #endif 7288 return(forbiddenExp); 7289 } 7290 switch (exp->type) { 7291 case XML_EXP_EMPTY: 7292 if (sub == emptyExp) 7293 return(emptyExp); 7294 #ifdef DEBUG_DERIV 7295 printf("Empty(exp) -> Forbid\n"); 7296 #endif 7297 return(forbiddenExp); 7298 case XML_EXP_FORBID: 7299 #ifdef DEBUG_DERIV 7300 printf("Forbid(exp) -> Forbid\n"); 7301 #endif 7302 return(forbiddenExp); 7303 case XML_EXP_ATOM: 7304 if (sub->type == XML_EXP_ATOM) { 7305 /* TODO: handle wildcards */ 7306 if (exp->exp_str == sub->exp_str) { 7307 #ifdef DEBUG_DERIV 7308 printf("Atom match -> Empty\n"); 7309 #endif 7310 return(emptyExp); 7311 } 7312 #ifdef DEBUG_DERIV 7313 printf("Atom mismatch -> Forbid\n"); 7314 #endif 7315 return(forbiddenExp); 7316 } 7317 if ((sub->type == XML_EXP_COUNT) && 7318 (sub->exp_max == 1) && 7319 (sub->exp_left->type == XML_EXP_ATOM)) { 7320 /* TODO: handle wildcards */ 7321 if (exp->exp_str == sub->exp_left->exp_str) { 7322 #ifdef DEBUG_DERIV 7323 printf("Atom match -> Empty\n"); 7324 #endif 7325 return(emptyExp); 7326 } 7327 #ifdef DEBUG_DERIV 7328 printf("Atom mismatch -> Forbid\n"); 7329 #endif 7330 return(forbiddenExp); 7331 } 7332 #ifdef DEBUG_DERIV 7333 printf("Compex exp vs Atom -> Forbid\n"); 7334 #endif 7335 return(forbiddenExp); 7336 case XML_EXP_SEQ: 7337 /* try to get the sequence consumed only if possible */ 7338 if (xmlExpCheckCard(exp->exp_left, sub)) { 7339 /* See if the sequence can be consumed directly */ 7340 #ifdef DEBUG_DERIV 7341 printf("Seq trying left only\n"); 7342 #endif 7343 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7344 if ((ret != forbiddenExp) && (ret != NULL)) { 7345 #ifdef DEBUG_DERIV 7346 printf("Seq trying left only worked\n"); 7347 #endif 7348 /* 7349 * TODO: assumption here that we are determinist 7350 * i.e. we won't get to a nillable exp left 7351 * subset which could be matched by the right 7352 * part too. 7353 * e.g.: (a | b)+,(a | c) and 'a+,a' 7354 */ 7355 exp->exp_right->ref++; 7356 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7357 exp->exp_right, NULL, 0, 0)); 7358 } 7359 #ifdef DEBUG_DERIV 7360 } else { 7361 printf("Seq: left too short\n"); 7362 #endif 7363 } 7364 /* Try instead to decompose */ 7365 if (sub->type == XML_EXP_COUNT) { 7366 int min, max; 7367 7368 #ifdef DEBUG_DERIV 7369 printf("Seq: sub is a count\n"); 7370 #endif 7371 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7372 if (ret == NULL) 7373 return(NULL); 7374 if (ret != forbiddenExp) { 7375 #ifdef DEBUG_DERIV 7376 printf("Seq , Count match on left\n"); 7377 #endif 7378 if (sub->exp_max < 0) 7379 max = -1; 7380 else 7381 max = sub->exp_max -1; 7382 if (sub->exp_min > 0) 7383 min = sub->exp_min -1; 7384 else 7385 min = 0; 7386 exp->exp_right->ref++; 7387 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7388 exp->exp_right, NULL, 0, 0); 7389 if (tmp == NULL) 7390 return(NULL); 7391 7392 sub->exp_left->ref++; 7393 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7394 sub->exp_left, NULL, NULL, min, max); 7395 if (tmp2 == NULL) { 7396 xmlExpFree(ctxt, tmp); 7397 return(NULL); 7398 } 7399 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7400 xmlExpFree(ctxt, tmp); 7401 xmlExpFree(ctxt, tmp2); 7402 return(ret); 7403 } 7404 } 7405 /* we made no progress on structured operations */ 7406 break; 7407 case XML_EXP_OR: 7408 #ifdef DEBUG_DERIV 7409 printf("Or , trying both side\n"); 7410 #endif 7411 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7412 if (ret == NULL) 7413 return(NULL); 7414 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub); 7415 if (tmp == NULL) { 7416 xmlExpFree(ctxt, ret); 7417 return(NULL); 7418 } 7419 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0)); 7420 case XML_EXP_COUNT: { 7421 int min, max; 7422 7423 if (sub->type == XML_EXP_COUNT) { 7424 /* 7425 * Try to see if the loop is completely subsumed 7426 */ 7427 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7428 if (tmp == NULL) 7429 return(NULL); 7430 if (tmp == forbiddenExp) { 7431 int mult; 7432 7433 #ifdef DEBUG_DERIV 7434 printf("Count, Count inner don't subsume\n"); 7435 #endif 7436 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left, 7437 NULL, &tmp); 7438 if (mult <= 0) { 7439 #ifdef DEBUG_DERIV 7440 printf("Count, Count not multiple => forbidden\n"); 7441 #endif 7442 return(forbiddenExp); 7443 } 7444 if (sub->exp_max == -1) { 7445 max = -1; 7446 if (exp->exp_max == -1) { 7447 if (exp->exp_min <= sub->exp_min * mult) 7448 min = 0; 7449 else 7450 min = exp->exp_min - sub->exp_min * mult; 7451 } else { 7452 #ifdef DEBUG_DERIV 7453 printf("Count, Count finite can't subsume infinite\n"); 7454 #endif 7455 xmlExpFree(ctxt, tmp); 7456 return(forbiddenExp); 7457 } 7458 } else { 7459 if (exp->exp_max == -1) { 7460 #ifdef DEBUG_DERIV 7461 printf("Infinite loop consume mult finite loop\n"); 7462 #endif 7463 if (exp->exp_min > sub->exp_min * mult) { 7464 max = -1; 7465 min = exp->exp_min - sub->exp_min * mult; 7466 } else { 7467 max = -1; 7468 min = 0; 7469 } 7470 } else { 7471 if (exp->exp_max < sub->exp_max * mult) { 7472 #ifdef DEBUG_DERIV 7473 printf("loops max mult mismatch => forbidden\n"); 7474 #endif 7475 xmlExpFree(ctxt, tmp); 7476 return(forbiddenExp); 7477 } 7478 if (sub->exp_max * mult > exp->exp_min) 7479 min = 0; 7480 else 7481 min = exp->exp_min - sub->exp_max * mult; 7482 max = exp->exp_max - sub->exp_max * mult; 7483 } 7484 } 7485 } else if (!IS_NILLABLE(tmp)) { 7486 /* 7487 * TODO: loop here to try to grow if working on finite 7488 * blocks. 7489 */ 7490 #ifdef DEBUG_DERIV 7491 printf("Count, Count remain not nillable => forbidden\n"); 7492 #endif 7493 xmlExpFree(ctxt, tmp); 7494 return(forbiddenExp); 7495 } else if (sub->exp_max == -1) { 7496 if (exp->exp_max == -1) { 7497 if (exp->exp_min <= sub->exp_min) { 7498 #ifdef DEBUG_DERIV 7499 printf("Infinite loops Okay => COUNT(0,Inf)\n"); 7500 #endif 7501 max = -1; 7502 min = 0; 7503 } else { 7504 #ifdef DEBUG_DERIV 7505 printf("Infinite loops min => Count(X,Inf)\n"); 7506 #endif 7507 max = -1; 7508 min = exp->exp_min - sub->exp_min; 7509 } 7510 } else if (exp->exp_min > sub->exp_min) { 7511 #ifdef DEBUG_DERIV 7512 printf("loops min mismatch 1 => forbidden ???\n"); 7513 #endif 7514 xmlExpFree(ctxt, tmp); 7515 return(forbiddenExp); 7516 } else { 7517 max = -1; 7518 min = 0; 7519 } 7520 } else { 7521 if (exp->exp_max == -1) { 7522 #ifdef DEBUG_DERIV 7523 printf("Infinite loop consume finite loop\n"); 7524 #endif 7525 if (exp->exp_min > sub->exp_min) { 7526 max = -1; 7527 min = exp->exp_min - sub->exp_min; 7528 } else { 7529 max = -1; 7530 min = 0; 7531 } 7532 } else { 7533 if (exp->exp_max < sub->exp_max) { 7534 #ifdef DEBUG_DERIV 7535 printf("loops max mismatch => forbidden\n"); 7536 #endif 7537 xmlExpFree(ctxt, tmp); 7538 return(forbiddenExp); 7539 } 7540 if (sub->exp_max > exp->exp_min) 7541 min = 0; 7542 else 7543 min = exp->exp_min - sub->exp_max; 7544 max = exp->exp_max - sub->exp_max; 7545 } 7546 } 7547 #ifdef DEBUG_DERIV 7548 printf("loops match => SEQ(COUNT())\n"); 7549 #endif 7550 exp->exp_left->ref++; 7551 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7552 NULL, NULL, min, max); 7553 if (tmp2 == NULL) { 7554 return(NULL); 7555 } 7556 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7557 NULL, 0, 0); 7558 return(ret); 7559 } 7560 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7561 if (tmp == NULL) 7562 return(NULL); 7563 if (tmp == forbiddenExp) { 7564 #ifdef DEBUG_DERIV 7565 printf("loop mismatch => forbidden\n"); 7566 #endif 7567 return(forbiddenExp); 7568 } 7569 if (exp->exp_min > 0) 7570 min = exp->exp_min - 1; 7571 else 7572 min = 0; 7573 if (exp->exp_max < 0) 7574 max = -1; 7575 else 7576 max = exp->exp_max - 1; 7577 7578 #ifdef DEBUG_DERIV 7579 printf("loop match => SEQ(COUNT())\n"); 7580 #endif 7581 exp->exp_left->ref++; 7582 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7583 NULL, NULL, min, max); 7584 if (tmp2 == NULL) 7585 return(NULL); 7586 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7587 NULL, 0, 0); 7588 return(ret); 7589 } 7590 } 7591 7592 #ifdef DEBUG_DERIV 7593 printf("Fallback to derivative\n"); 7594 #endif 7595 if (IS_NILLABLE(sub)) { 7596 if (!(IS_NILLABLE(exp))) 7597 return(forbiddenExp); 7598 else 7599 ret = emptyExp; 7600 } else 7601 ret = NULL; 7602 /* 7603 * here the structured derivation made no progress so 7604 * we use the default token based derivation to force one more step 7605 */ 7606 if (ctxt->tabSize == 0) 7607 ctxt->tabSize = 40; 7608 7609 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize * 7610 sizeof(const xmlChar *)); 7611 if (tab == NULL) { 7612 return(NULL); 7613 } 7614 7615 /* 7616 * collect all the strings accepted by the subexpression on input 7617 */ 7618 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7619 while (len < 0) { 7620 const xmlChar **temp; 7621 temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 * 7622 sizeof(const xmlChar *)); 7623 if (temp == NULL) { 7624 xmlFree((xmlChar **) tab); 7625 return(NULL); 7626 } 7627 tab = temp; 7628 ctxt->tabSize *= 2; 7629 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7630 } 7631 for (i = 0;i < len;i++) { 7632 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]); 7633 if ((tmp == NULL) || (tmp == forbiddenExp)) { 7634 xmlExpFree(ctxt, ret); 7635 xmlFree((xmlChar **) tab); 7636 return(tmp); 7637 } 7638 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]); 7639 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) { 7640 xmlExpFree(ctxt, tmp); 7641 xmlExpFree(ctxt, ret); 7642 xmlFree((xmlChar **) tab); 7643 return(tmp); 7644 } 7645 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7646 xmlExpFree(ctxt, tmp); 7647 xmlExpFree(ctxt, tmp2); 7648 7649 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) { 7650 xmlExpFree(ctxt, ret); 7651 xmlFree((xmlChar **) tab); 7652 return(tmp3); 7653 } 7654 7655 if (ret == NULL) 7656 ret = tmp3; 7657 else { 7658 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0); 7659 if (ret == NULL) { 7660 xmlFree((xmlChar **) tab); 7661 return(NULL); 7662 } 7663 } 7664 } 7665 xmlFree((xmlChar **) tab); 7666 return(ret); 7667 } 7668 7669 /** 7670 * xmlExpExpDerive: 7671 * @ctxt: the expressions context 7672 * @exp: the englobing expression 7673 * @sub: the subexpression 7674 * 7675 * Evaluates the expression resulting from @exp consuming a sub expression @sub 7676 * Based on algebraic derivation and sometimes direct Brzozowski derivation 7677 * it usually tatkes less than linear time and can handle expressions generating 7678 * infinite languages. 7679 * 7680 * Returns the resulting expression or NULL in case of internal error, the 7681 * result must be freed 7682 */ 7683 xmlExpNodePtr 7684 xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7685 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7686 return(NULL); 7687 7688 /* 7689 * O(1) speedups 7690 */ 7691 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7692 #ifdef DEBUG_DERIV 7693 printf("Sub nillable and not exp : can't subsume\n"); 7694 #endif 7695 return(forbiddenExp); 7696 } 7697 if (xmlExpCheckCard(exp, sub) == 0) { 7698 #ifdef DEBUG_DERIV 7699 printf("sub generate longuer sequances than exp : can't subsume\n"); 7700 #endif 7701 return(forbiddenExp); 7702 } 7703 return(xmlExpExpDeriveInt(ctxt, exp, sub)); 7704 } 7705 7706 /** 7707 * xmlExpSubsume: 7708 * @ctxt: the expressions context 7709 * @exp: the englobing expression 7710 * @sub: the subexpression 7711 * 7712 * Check whether @exp accepts all the languages accexpted by @sub 7713 * the input being a subexpression. 7714 * 7715 * Returns 1 if true 0 if false and -1 in case of failure. 7716 */ 7717 int 7718 xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7719 xmlExpNodePtr tmp; 7720 7721 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7722 return(-1); 7723 7724 /* 7725 * TODO: speedup by checking the language of sub is a subset of the 7726 * language of exp 7727 */ 7728 /* 7729 * O(1) speedups 7730 */ 7731 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7732 #ifdef DEBUG_DERIV 7733 printf("Sub nillable and not exp : can't subsume\n"); 7734 #endif 7735 return(0); 7736 } 7737 if (xmlExpCheckCard(exp, sub) == 0) { 7738 #ifdef DEBUG_DERIV 7739 printf("sub generate longuer sequances than exp : can't subsume\n"); 7740 #endif 7741 return(0); 7742 } 7743 tmp = xmlExpExpDeriveInt(ctxt, exp, sub); 7744 #ifdef DEBUG_DERIV 7745 printf("Result derivation :\n"); 7746 PRINT_EXP(tmp); 7747 #endif 7748 if (tmp == NULL) 7749 return(-1); 7750 if (tmp == forbiddenExp) 7751 return(0); 7752 if (tmp == emptyExp) 7753 return(1); 7754 if ((tmp != NULL) && (IS_NILLABLE(tmp))) { 7755 xmlExpFree(ctxt, tmp); 7756 return(1); 7757 } 7758 xmlExpFree(ctxt, tmp); 7759 return(0); 7760 } 7761 7762 /************************************************************************ 7763 * * 7764 * Parsing expression * 7765 * * 7766 ************************************************************************/ 7767 7768 static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt); 7769 7770 #undef CUR 7771 #define CUR (*ctxt->cur) 7772 #undef NEXT 7773 #define NEXT ctxt->cur++; 7774 #undef IS_BLANK 7775 #define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t')) 7776 #define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++; 7777 7778 static int 7779 xmlExpParseNumber(xmlExpCtxtPtr ctxt) { 7780 int ret = 0; 7781 7782 SKIP_BLANKS 7783 if (CUR == '*') { 7784 NEXT 7785 return(-1); 7786 } 7787 if ((CUR < '0') || (CUR > '9')) 7788 return(-1); 7789 while ((CUR >= '0') && (CUR <= '9')) { 7790 ret = ret * 10 + (CUR - '0'); 7791 NEXT 7792 } 7793 return(ret); 7794 } 7795 7796 static xmlExpNodePtr 7797 xmlExpParseOr(xmlExpCtxtPtr ctxt) { 7798 const char *base; 7799 xmlExpNodePtr ret; 7800 const xmlChar *val; 7801 7802 SKIP_BLANKS 7803 base = ctxt->cur; 7804 if (*ctxt->cur == '(') { 7805 NEXT 7806 ret = xmlExpParseExpr(ctxt); 7807 SKIP_BLANKS 7808 if (*ctxt->cur != ')') { 7809 fprintf(stderr, "unbalanced '(' : %s\n", base); 7810 xmlExpFree(ctxt, ret); 7811 return(NULL); 7812 } 7813 NEXT; 7814 SKIP_BLANKS 7815 goto parse_quantifier; 7816 } 7817 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') && 7818 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') && 7819 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}')) 7820 NEXT; 7821 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base); 7822 if (val == NULL) 7823 return(NULL); 7824 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0); 7825 if (ret == NULL) 7826 return(NULL); 7827 SKIP_BLANKS 7828 parse_quantifier: 7829 if (CUR == '{') { 7830 int min, max; 7831 7832 NEXT 7833 min = xmlExpParseNumber(ctxt); 7834 if (min < 0) { 7835 xmlExpFree(ctxt, ret); 7836 return(NULL); 7837 } 7838 SKIP_BLANKS 7839 if (CUR == ',') { 7840 NEXT 7841 max = xmlExpParseNumber(ctxt); 7842 SKIP_BLANKS 7843 } else 7844 max = min; 7845 if (CUR != '}') { 7846 xmlExpFree(ctxt, ret); 7847 return(NULL); 7848 } 7849 NEXT 7850 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7851 min, max); 7852 SKIP_BLANKS 7853 } else if (CUR == '?') { 7854 NEXT 7855 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7856 0, 1); 7857 SKIP_BLANKS 7858 } else if (CUR == '+') { 7859 NEXT 7860 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7861 1, -1); 7862 SKIP_BLANKS 7863 } else if (CUR == '*') { 7864 NEXT 7865 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7866 0, -1); 7867 SKIP_BLANKS 7868 } 7869 return(ret); 7870 } 7871 7872 7873 static xmlExpNodePtr 7874 xmlExpParseSeq(xmlExpCtxtPtr ctxt) { 7875 xmlExpNodePtr ret, right; 7876 7877 ret = xmlExpParseOr(ctxt); 7878 SKIP_BLANKS 7879 while (CUR == '|') { 7880 NEXT 7881 right = xmlExpParseOr(ctxt); 7882 if (right == NULL) { 7883 xmlExpFree(ctxt, ret); 7884 return(NULL); 7885 } 7886 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0); 7887 if (ret == NULL) 7888 return(NULL); 7889 } 7890 return(ret); 7891 } 7892 7893 static xmlExpNodePtr 7894 xmlExpParseExpr(xmlExpCtxtPtr ctxt) { 7895 xmlExpNodePtr ret, right; 7896 7897 ret = xmlExpParseSeq(ctxt); 7898 SKIP_BLANKS 7899 while (CUR == ',') { 7900 NEXT 7901 right = xmlExpParseSeq(ctxt); 7902 if (right == NULL) { 7903 xmlExpFree(ctxt, ret); 7904 return(NULL); 7905 } 7906 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0); 7907 if (ret == NULL) 7908 return(NULL); 7909 } 7910 return(ret); 7911 } 7912 7913 /** 7914 * xmlExpParse: 7915 * @ctxt: the expressions context 7916 * @expr: the 0 terminated string 7917 * 7918 * Minimal parser for regexps, it understand the following constructs 7919 * - string terminals 7920 * - choice operator | 7921 * - sequence operator , 7922 * - subexpressions (...) 7923 * - usual cardinality operators + * and ? 7924 * - finite sequences { min, max } 7925 * - infinite sequences { min, * } 7926 * There is minimal checkings made especially no checking on strings values 7927 * 7928 * Returns a new expression or NULL in case of failure 7929 */ 7930 xmlExpNodePtr 7931 xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) { 7932 xmlExpNodePtr ret; 7933 7934 ctxt->expr = expr; 7935 ctxt->cur = expr; 7936 7937 ret = xmlExpParseExpr(ctxt); 7938 SKIP_BLANKS 7939 if (*ctxt->cur != 0) { 7940 xmlExpFree(ctxt, ret); 7941 return(NULL); 7942 } 7943 return(ret); 7944 } 7945 7946 static void 7947 xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) { 7948 xmlExpNodePtr c; 7949 7950 if (expr == NULL) return; 7951 if (glob) xmlBufferWriteChar(buf, "("); 7952 switch (expr->type) { 7953 case XML_EXP_EMPTY: 7954 xmlBufferWriteChar(buf, "empty"); 7955 break; 7956 case XML_EXP_FORBID: 7957 xmlBufferWriteChar(buf, "forbidden"); 7958 break; 7959 case XML_EXP_ATOM: 7960 xmlBufferWriteCHAR(buf, expr->exp_str); 7961 break; 7962 case XML_EXP_SEQ: 7963 c = expr->exp_left; 7964 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 7965 xmlExpDumpInt(buf, c, 1); 7966 else 7967 xmlExpDumpInt(buf, c, 0); 7968 xmlBufferWriteChar(buf, " , "); 7969 c = expr->exp_right; 7970 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 7971 xmlExpDumpInt(buf, c, 1); 7972 else 7973 xmlExpDumpInt(buf, c, 0); 7974 break; 7975 case XML_EXP_OR: 7976 c = expr->exp_left; 7977 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 7978 xmlExpDumpInt(buf, c, 1); 7979 else 7980 xmlExpDumpInt(buf, c, 0); 7981 xmlBufferWriteChar(buf, " | "); 7982 c = expr->exp_right; 7983 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 7984 xmlExpDumpInt(buf, c, 1); 7985 else 7986 xmlExpDumpInt(buf, c, 0); 7987 break; 7988 case XML_EXP_COUNT: { 7989 char rep[40]; 7990 7991 c = expr->exp_left; 7992 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 7993 xmlExpDumpInt(buf, c, 1); 7994 else 7995 xmlExpDumpInt(buf, c, 0); 7996 if ((expr->exp_min == 0) && (expr->exp_max == 1)) { 7997 rep[0] = '?'; 7998 rep[1] = 0; 7999 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) { 8000 rep[0] = '*'; 8001 rep[1] = 0; 8002 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) { 8003 rep[0] = '+'; 8004 rep[1] = 0; 8005 } else if (expr->exp_max == expr->exp_min) { 8006 snprintf(rep, 39, "{%d}", expr->exp_min); 8007 } else if (expr->exp_max < 0) { 8008 snprintf(rep, 39, "{%d,inf}", expr->exp_min); 8009 } else { 8010 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max); 8011 } 8012 rep[39] = 0; 8013 xmlBufferWriteChar(buf, rep); 8014 break; 8015 } 8016 default: 8017 fprintf(stderr, "Error in tree\n"); 8018 } 8019 if (glob) 8020 xmlBufferWriteChar(buf, ")"); 8021 } 8022 /** 8023 * xmlExpDump: 8024 * @buf: a buffer to receive the output 8025 * @expr: the compiled expression 8026 * 8027 * Serialize the expression as compiled to the buffer 8028 */ 8029 void 8030 xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) { 8031 if ((buf == NULL) || (expr == NULL)) 8032 return; 8033 xmlExpDumpInt(buf, expr, 0); 8034 } 8035 8036 /** 8037 * xmlExpMaxToken: 8038 * @expr: a compiled expression 8039 * 8040 * Indicate the maximum number of input a expression can accept 8041 * 8042 * Returns the maximum length or -1 in case of error 8043 */ 8044 int 8045 xmlExpMaxToken(xmlExpNodePtr expr) { 8046 if (expr == NULL) 8047 return(-1); 8048 return(expr->c_max); 8049 } 8050 8051 /** 8052 * xmlExpCtxtNbNodes: 8053 * @ctxt: an expression context 8054 * 8055 * Debugging facility provides the number of allocated nodes at a that point 8056 * 8057 * Returns the number of nodes in use or -1 in case of error 8058 */ 8059 int 8060 xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) { 8061 if (ctxt == NULL) 8062 return(-1); 8063 return(ctxt->nb_nodes); 8064 } 8065 8066 /** 8067 * xmlExpCtxtNbCons: 8068 * @ctxt: an expression context 8069 * 8070 * Debugging facility provides the number of allocated nodes over lifetime 8071 * 8072 * Returns the number of nodes ever allocated or -1 in case of error 8073 */ 8074 int 8075 xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) { 8076 if (ctxt == NULL) 8077 return(-1); 8078 return(ctxt->nb_cons); 8079 } 8080 8081 #endif /* LIBXML_EXPR_ENABLED */ 8082 #define bottom_xmlregexp 8083 #include "elfgcchack.h" 8084 #endif /* LIBXML_REGEXP_ENABLED */ 8085