1 /* 2 ****************************************************************************** 3 * Copyright (C) 2005-2007, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ****************************************************************************** 6 */ 7 8 #include <stdio.h> 9 #include <string.h> 10 #include <stdlib.h> 11 #include <time.h> 12 13 #include "wbnf.h" 14 15 // Most of this code is meant to test the test code. It's a self test. 16 // Normally this isn't run. 17 #define TEST_WBNF_TEST 0 18 19 /////////////////////////////////////////////////////////// 20 // 21 // Constants and the most basic helper classes 22 // 23 24 static const char DIGIT_CHAR[] = "0123456789"; 25 static const char WHITE_SPACE[] = {'\t', ' ', '\r', '\n', 0}; 26 static const char ALPHABET[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; 27 static const char SPECIAL[] = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; 28 29 static inline UBool isInList(const char c /*in*/, const char list[] /*in*/){ 30 const char * p = list; 31 for (;*p != 0 && *p != c; p++); 32 return *p?TRUE:FALSE; 33 } 34 static inline UBool isDigit(char c) {return isInList(c, DIGIT_CHAR);} 35 static inline UBool isWhiteSpace(char c) {return isInList(c, WHITE_SPACE);} 36 static inline UBool isAlphabet(char c) {return isInList(c, ALPHABET);} 37 static inline UBool isSpecialAsciiChar(char c) {return isInList(c,SPECIAL);} 38 39 40 41 /////////////////////////////////////////////////////////// 42 // 43 // Helper classes 44 // 45 46 class Buffer_byte{ 47 // Utility class, can be treated as an auto expanded array. no boundary check. 48 49 typedef char byte; 50 byte * start; 51 byte * current; 52 int buffer_size; // size unit is byte 53 public: 54 inline int content_size(){return current - start;} // size unit is byte 55 56 private: 57 inline void expand(int add_size = 100){ // size unit is byte 58 int new_size = buffer_size + add_size; 59 60 int cs_snap = content_size(); 61 start = (byte *) realloc(start, new_size); // may change the value of start 62 current = start + cs_snap; 63 64 memset(current, 0, add_size); 65 buffer_size = new_size; 66 } 67 68 inline void expand_to(int size){ 69 int r = size - buffer_size; 70 if (r > 0) { 71 expand(r); // simply expand, no block alignment 72 } 73 } 74 Buffer_byte(const Buffer_byte &); 75 Buffer_byte & operator = (const Buffer_byte &); 76 public: 77 Buffer_byte():start(NULL),current(start),buffer_size(0){ 78 expand(); 79 } 80 ~Buffer_byte(){ 81 free(start); 82 } 83 84 inline void reset(){ 85 start != NULL ? memset(start, 0, buffer_size) : 0; 86 current = start; 87 } 88 89 // Using memory copy method to append a C array to buffer, 90 inline void append(const void * c, int size){ // size unit is byte 91 expand_to(content_size() + size) ; 92 memcpy(current, c, size); 93 current = current + size; 94 } 95 96 byte * buffer(){ 97 return start; 98 } 99 }; 100 101 /* 102 The class(es) try to work as bulid-in array, so it overloads these two operators 103 operator type *(); 104 type & operator[]; 105 The first is used to auto type convert, the latter is used to select member. 106 107 A small trick is the class does not overload the address-of operator. This 108 behavior is different from bulid-in array, but it give us the opportunity 109 to get the address of the class itself. 110 */ 111 //template<typename type> 112 // class BUFFER{ 113 // typedef BUFFER name; 114 #define BUFFER(type, name)\ 115 class name {\ 116 private:\ 117 Buffer_byte buf;\ 118 public:\ 119 name & reset() {buf.reset(); return *this;}\ 120 name & append(type c) {buf.append(&c, sizeof(type)); return *this;}\ 121 name & append_array(const type * p, int size) {buf.append(p, sizeof(type)*size); return *this;}\ 122 type & operator [] (int i) { return ((type *) buf.buffer())[i];}\ 123 operator type *(){return (type *) buf.buffer();} \ 124 int content_size(){return buf.content_size() / sizeof(type);}\ 125 } 126 127 128 class Pick{ 129 /* The Pick is the basic language generator element*/ 130 public: 131 // generate a string accroding the syntax 132 // Return a null-terminated c-string. The buffer is owned by callee. 133 virtual const char* next() = 0; 134 virtual ~Pick(){}; 135 }; 136 137 //typedef BUFFER<char> Buffer_char; 138 //typedef BUFFER<int> Buffer_int; 139 //typedef BUFFER<Pick *> Buffer_pPick; 140 BUFFER(char, Buffer_char); 141 BUFFER(int, Buffer_int); 142 BUFFER(Pick *, Buffer_pPick); 143 144 class SymbolTable{ 145 /* Helper class. 146 * It's a mapping table between 'variable name' and its 'active Pick object' 147 */ 148 private: 149 Buffer_char name_buffer; // var names storage space 150 151 Buffer_int names; // points to name (offset in name_buffer) 152 Buffer_pPick refs; // points to Pick 153 154 int get_index(const char *const var_name){ 155 int len = names.content_size(); 156 for (int i=0; i< len; i++){ 157 if (strcmp(var_name, name_buffer + names[i]) == 0){ 158 return i; 159 } 160 } 161 return -1; 162 } 163 164 public: 165 enum RESULT {EMPTY, NO_VAR, NO_REF, HAS_REF}; 166 167 RESULT find(const char *const var_name /*[in] c-string*/, Pick * * ref = NULL /*[out] Pick* */){ 168 if (!var_name) return EMPTY; // NULL name 169 170 int i = get_index(var_name); 171 if (i == -1){ 172 return NO_VAR; // new name 173 } 174 if (!refs[i]){ // exist name, no ref 175 return NO_REF; 176 } else { 177 if (ref) { 178 *ref = refs[i]; 179 } 180 return HAS_REF; // exist name, has ref 181 } 182 } 183 184 void put(const char *const var_name, Pick *const var_ref = NULL){ 185 int i = get_index(var_name); 186 switch(find(var_name)){ 187 case EMPTY: // NULL name 188 break; 189 case NO_VAR: // new name 190 int offset; 191 offset = name_buffer.content_size(); 192 name_buffer.append_array(var_name, strlen(var_name) + 1); 193 names.append(offset); 194 refs.append(var_ref); 195 break; 196 case NO_REF: // exist name, no ref 197 refs[i] = var_ref; // link definition with variable 198 break; 199 case HAS_REF: // exist name, has ref 200 if (var_ref){ 201 refs[i] = var_ref; 202 } 203 break; 204 default: 205 ; // ASSERT(FALSE); 206 } 207 return; 208 } 209 210 UBool is_complete(){ 211 int n = names.content_size(); 212 for (int i=0; i<n; ++i){ 213 if (refs[i] == NULL){ 214 return FALSE; 215 } 216 } 217 return TRUE; 218 } 219 220 void reset(){ 221 names.reset(); 222 name_buffer.reset(); 223 224 // release memory here 225 int s = refs.content_size(); 226 for (int i=0; i < s; i++){ 227 delete refs[i]; // TOFIX: point alias/recursion problem 228 } 229 refs.reset(); 230 } 231 232 ~SymbolTable(){ 233 reset(); 234 } 235 }; 236 237 238 /* 239 // Document of class Escaper 240 // 241 // ATTENTION: 242 // From http://icu-project.org/userguide/Collate_Customization.html. 243 // We get the precedence of escape/quote operations 244 // 245 // (highest) 1. backslash \ 246 // 2. two single quotes '' 247 // 3. quoting ' ' 248 // 249 // ICU Collation should accept following as the same string. 250 // 251 // 1) 'ab'c _ 252 // 2) a\bc \ 253 // 3) a'b'\c |- They are equal. 254 // 4) abc _/ 255 // 256 // From "two single quotes", we have following deductions 257 // D1. empty quoting is illgal. (obviously) 258 // D2. no contact operation between two quotings 259 // '.''.' is not .. it is .'. 260 // D3. "two single quotes" cannot contact two quoting simultaneously 261 // '..''''.' is not ..'. it is ..''. 262 // NOTICE: 263 // "two single quotes" can contact before one quoting 264 // '''.' is '. 265 // "two single quotes" can literally contact after one quoting 266 // But, from syntax, it's one quoting including a "two single quotes" 267 // '.''' is .' 268 // D4. "two single quotes" cannot solely be included in quoting 269 // '''' is not ' it is '' 270 // NOTICE: These are legal 271 // '.''.' is .'. 272 // '.''' is .' 273 // 274 // dicision 275 // /\ 276 // /__\ 277 // output buffer input buffer 278 // 279 // To make our dicision (within an atom operation) without caring input and output buffer, 280 // following calling pattern (within an atom operation) shall be avoided 281 // 282 // P1 open_quoting() then close_quoting() (direct violation) D1 283 // P2 close_quoting() then open_quoting() (direct violation) D2 284 // P3 empty open_quoting() (indirect violation) D1, D4 285 // P4 empty close_quoting() (indirect violation) D2, D3 286 // P5 open_quoting() then two single quotes (indirect violation) D4 287 // P6 close_quoting() then two single quotes (indirect violation) D3 288 // 289 // two single quotes escaping will not open_ or close_ quoting() 290 // The choice will not lose some quoing forms. 291 // 292 // For open_quoting(), 293 // we may get this form quoting ''' P5 294 // It may raise a bug ''''x 295 // If we expect 296 // '''.' let the next char open the quoting 297 // '.''.' the quoting is already opened by preceding char 298 // 299 // For close_quoting() 300 // we will get this form quoting '.''' P6 301 // It may raise a bug '.''''.' 302 // If we expect 303 // '.'''\. let the next char close the quoting 304 // '.''''.' the expectation is wrong! using '.'\''.' instead 305 // 306 // It's a hard work to re-adjust generation opportunity for various escaping form. 307 // We just simply ignore it. 308 */ 309 class Escaper{ 310 public: 311 enum CHOICE {YES, NO, RAND}; 312 enum ESCAPE_FORM {BSLASH_ONLY, QUOTE_ONLY, QUOTE_AND_BSLAH, RAND_ESC}; 313 private: 314 class Bool{ // A wrapper class for CHOICE, to auto adapter UBool class 315 private: 316 const CHOICE tag; 317 public: 318 Bool(CHOICE flag=RAND):tag(flag){} 319 operator UBool() { // conversion operator 320 return tag == RAND ? rand()%2 : tag == YES; 321 //if (tag == RAND){ 322 // return rand()%2 == 1; 323 //} else { 324 // return tag == YES ? TRUE : FALSE; 325 //} 326 } 327 }; 328 public: 329 Escaper(CHOICE escapeLiteral = RAND, 330 CHOICE twoQuotesEscape = RAND, 331 ESCAPE_FORM escapeForm = RAND_ESC): 332 escape_form(escapeForm), 333 escape_literal(escapeLiteral), 334 two_quotes_escape(twoQuotesEscape), 335 is_quoting(FALSE){} 336 private: 337 Buffer_char str; 338 ESCAPE_FORM escape_form; 339 Bool escape_literal; 340 Bool two_quotes_escape; 341 UBool quote_escape; 342 UBool bslash_escape; 343 UBool is_quoting; 344 345 void set_options(){ 346 ESCAPE_FORM t = escape_form == RAND_ESC ? (ESCAPE_FORM) (rand()%3) : escape_form; 347 switch (t){ 348 case BSLASH_ONLY : 349 bslash_escape = TRUE; quote_escape = FALSE; break; 350 case QUOTE_ONLY: 351 bslash_escape = FALSE;quote_escape = TRUE; break; 352 case QUOTE_AND_BSLAH: 353 bslash_escape = TRUE; quote_escape = TRUE; break; 354 default: 355 ;// error 356 } 357 } 358 359 void reset(){ 360 str.reset(); 361 is_quoting = FALSE; 362 } 363 364 inline void open_quoting(){ 365 if(is_quoting){ 366 // do nothing 367 } else { 368 str.append('\''); 369 is_quoting = TRUE; 370 } 371 } 372 inline void close_quoting(){ 373 if(is_quoting){ 374 str.append('\''); 375 is_quoting = FALSE; 376 } else { 377 // do nothing 378 } 379 } 380 381 // str [in] null-terminated c-string 382 void append(const char * strToAppend){ 383 for(;*strToAppend != 0; strToAppend++){ 384 append(*strToAppend); 385 } 386 } 387 388 inline void append(const char c){ 389 set_options(); 390 391 if (c == '\\'){ 392 quote_escape ? open_quoting() : close_quoting(); 393 //bslash_escape always true here 394 str.append('\\'); 395 str.append('\\'); 396 } else if (c == '\''){ 397 if (two_quotes_escape){ // quoted using two single quotes 398 // See documents in anonymous.design 399 str.append('\''); 400 str.append('\''); 401 } else{ 402 quote_escape ? open_quoting() : close_quoting(); 403 //bslash_escape always true here 404 str.append('\\'); 405 str.append('\''); 406 } 407 } else if (isSpecialAsciiChar(c) || isWhiteSpace(c)){ 408 quote_escape ? open_quoting() : close_quoting(); 409 if (bslash_escape) str.append('\\'); 410 str.append(c); 411 } else { //if (isAlphabet(c) || isDigit(c) || TRUE){ // treat others as literal 412 if (escape_literal){ 413 quote_escape ? open_quoting() : close_quoting(); 414 if (bslash_escape) str.append('\\'); 415 str.append(c); 416 } else { 417 close_quoting(); 418 str.append(c); 419 } 420 } 421 } 422 423 public: 424 // Return a null-terminate c-string. The buffer is owned by callee. 425 char * operator()(const char * literal /*c-string*/){ 426 str.reset(); 427 for(;*literal != 0; literal++){ 428 append(*literal); 429 } 430 close_quoting(); // P4 exception, to close whole quoting 431 return str; 432 } 433 }; 434 435 class WeightedRand{ 436 // Return a random number in [0, size) 437 // Every number has different chance (aka weight) to be selected. 438 private: 439 Buffer_int weights; 440 double total; 441 WeightedRand(const WeightedRand &); 442 WeightedRand & operator = (const WeightedRand &); 443 public: 444 WeightedRand(Buffer_int * weight_list = NULL, int size = 0){ 445 if ( weight_list == NULL){ 446 for (int i=0; i<size; ++i) weights.append(DEFAULT_WEIGHT); 447 } else { 448 int s = weight_list->content_size(); 449 if (s < size){ 450 weights.append_array( (*weight_list),s); 451 for (int i=s; i<size; ++i) weights.append(DEFAULT_WEIGHT); 452 } else { // s >= size 453 weights.append_array( (*weight_list),size); 454 } 455 } 456 total = 0; 457 int c = weights.content_size(); 458 for (int i=0; i<c; ++i){ 459 total += weights[i]; 460 } 461 } 462 463 void append(int weight){ 464 weights.append(weight); 465 total += weight; 466 } 467 468 // Give a random number with the consideration of weight. 469 // Every random number is associated with a weight. 470 // It identifies the chance to be selected, 471 // larger weight has more chance to be selected. 472 // 473 // 474 // ______________________ every slot has equal chance 475 // 476 // [____][_][___][______] each item has different slots, hence different chance 477 // 478 // 479 // The algorithms to generate the number is illustrated by preceding figure. 480 // First, a slot is selected by rand(). Then we translate the slot to corresponding item. 481 // 482 int next(){ 483 // get a random in [0,1] 484 double reference_mark = (double)rand() / (double)RAND_MAX; 485 486 // get the slot's index, 0 <= mark <= total; 487 double mark = total * reference_mark; 488 489 // translate the slot to corresponding item 490 int i=0; 491 for (;;){ 492 mark -= weights[i]; // 0 <= mark <= total 493 if (mark <= 0) 494 break; 495 i++; 496 } 497 return i; 498 } 499 }; 500 501 /////////////////////////////////////////////////////////// 502 // 503 // The parser result nodes 504 // 505 506 class Literal : public Pick { 507 public: 508 virtual const char* next(){ 509 return str; 510 } 511 Literal(const char * s /*c-string*/){ 512 str.append_array(s, strlen(s) + 1); 513 } 514 private: 515 Buffer_char str; //null-terminated c-string 516 }; 517 518 class Variable : public Pick { 519 public: 520 Variable(SymbolTable * symbols, const char * varName, Pick * varRef = NULL){ 521 this->var_name.append_array(varName, strlen(varName) + 1); 522 if ((symbol_table = symbols)){ 523 symbol_table->put(varName, varRef); 524 } 525 } 526 527 operator const char *(){ 528 return var_name; 529 } 530 531 virtual const char* next(){ 532 if (symbol_table){ 533 Pick * var_ref = NULL; 534 symbol_table->find(var_name, &var_ref); 535 if (var_ref) { 536 return var_ref->next(); 537 } 538 } 539 return ""; // dumb string 540 } 541 private: 542 Buffer_char var_name; 543 SymbolTable * symbol_table; 544 }; 545 546 class Quote : public Pick{ 547 public: 548 Quote(Pick & base):item(base),e(Escaper::NO, Escaper::NO, Escaper::BSLASH_ONLY){ 549 } 550 virtual const char* next(){ 551 return e(item.next()); 552 } 553 private: 554 Pick & item; 555 Buffer_char str; 556 Escaper e; 557 }; 558 559 560 class Morph : public Pick{ 561 /* 562 The difference between morph and an arbitrary random string is that 563 a morph changes slowly. When we build collation rules, for example, 564 it is a much better test if the strings we use are all in the same 565 'neighborhood'; they share many common characters. 566 */ 567 public: 568 Morph(Pick & base):item(base){} 569 570 virtual const char* next(){ 571 current.reset(); 572 const char * s = item.next(); 573 current.append_array(s, strlen(s) + 1); 574 if (last.content_size() == 0) { 575 str.reset(); 576 last.reset(); 577 str.append_array(current, current.content_size()); 578 last.append_array(current, current.content_size()); 579 } else { 580 morph(); 581 } 582 return str; 583 } 584 private: 585 Pick & item; 586 Buffer_char str; 587 Buffer_char last; 588 Buffer_char current; 589 590 char * p_last; 591 char * p_curr; 592 593 void copy_curr(){ 594 if (*p_curr) { 595 str.append(*p_curr); 596 p_curr++; 597 } 598 } 599 600 void copy_last(){ 601 if (*p_last) { 602 str.append(*p_last); 603 p_last++; 604 } 605 } 606 607 // copy 0, 1, or 2 character(s) to str 608 void copy(){ 609 static WeightedRand wr(& Buffer_int().append(DEFAULT_WEIGHT * 10), 5); 610 611 switch (wr.next()){ 612 case 0: // copy last -- has 10 times chance than others 613 copy_last(); 614 break; 615 case 1: // copy both 616 copy_curr(); 617 copy_last(); 618 break; 619 case 2: // copy both 620 copy_last(); 621 copy_curr(); 622 break; 623 case 3: 624 copy_curr(); 625 break; 626 case 4: // copy nothing 627 break; 628 default: 629 // ASSERT(FALSE); 630 ; 631 } 632 } 633 634 void morph(void){ 635 int min = strlen(last); 636 int max = strlen(current); 637 if (min > max){ 638 int temp = min; 639 min = max; 640 max = temp; 641 } 642 643 int len = min + rand()%(max - min + 1); // min + [0, diff] 644 p_curr = current; 645 p_last = last; 646 str.reset(); 647 648 for (; str.content_size()<len && *p_curr && *p_last;){ 649 copy(); // copy 0, 1, or 2 character(s) to str 650 } 651 652 if (str.content_size() == len) { 653 str.append(0); 654 final(); 655 return; 656 } 657 658 if (str.content_size() > len) { // if the last copy copied two characters 659 str[len]=0; 660 final(); 661 return; 662 } 663 664 // str.content_size() < len 665 if (*p_last) { 666 for (; str.content_size() < len; copy_last()); 667 } else if (*p_curr){ 668 for (; str.content_size() < len; copy_curr()); 669 } 670 671 int last_len = last.content_size(); 672 for (;str.content_size() < len;){ 673 str.append(last[rand()%last_len]); 674 } 675 str.append(0); 676 final(); 677 } 678 679 void final(){ 680 last.reset(); 681 last.append_array(current, current.content_size()); 682 } 683 }; 684 685 class Sequence : public Pick { 686 public: 687 virtual const char* next(){ 688 str.reset(); 689 int s = items.content_size(); 690 for(int i=0; i < s; i++){ 691 const char * t = items[i]->next(); 692 str.append_array(t, strlen(t)); 693 } 694 str.append(0); // terminal null 695 return str; 696 } 697 698 void append (Pick * node){ 699 items.append(node); 700 } 701 702 virtual ~Sequence(){ 703 int s = items.content_size(); 704 for(int i=0; i < s; i++){ 705 //How can assure the item is got from heap? 706 //Let's assume it. 707 delete items[i]; // TOFIX: point alias/recursion problem 708 items[i] = NULL; 709 } 710 } 711 private: 712 Buffer_pPick items; 713 Buffer_char str; //null-terminated c-string 714 }; 715 716 class Repeat : public Pick { 717 private: 718 Pick * item; 719 Buffer_char str; 720 WeightedRand wr; 721 int min; 722 int max; 723 int select_a_count(){ 724 return min + wr.next(); 725 } 726 public: 727 virtual const char* next(){ 728 str.reset(); 729 int c = select_a_count(); 730 for(int i=0; i< c; i++){ 731 const char * t = item->next(); 732 str.append_array(t, strlen(t)); 733 } 734 str.append(0); 735 return str; 736 } 737 738 Repeat(Pick * base, int minCount =0, int maxCount = 1, Buffer_int * weights = NULL): 739 wr(weights, maxCount-minCount +1) { 740 this->item = base; 741 this->min = minCount; 742 this->max = maxCount; 743 } 744 virtual ~Repeat(){ 745 delete item; // TOFIX: point alias/recursion problem 746 item = NULL; 747 } 748 }; 749 750 751 class Alternation : public Pick { 752 public: 753 virtual const char* next(){ 754 str.reset(); 755 int i = wr.next(); 756 const char * t = items[i]->next(); 757 str.append_array(t, strlen(t) + 1); 758 return str; 759 } 760 virtual ~Alternation(){ 761 int s = items.content_size(); 762 for(int i=0; i < s; i++){ 763 delete items[i]; // TOFIX: point alias/recursion problem 764 items[i] = NULL; 765 } 766 } 767 768 Alternation & append (Pick * node, int weight = DEFAULT_WEIGHT){ 769 items.append(node); 770 wr.append(weight); 771 return *this; 772 } 773 private: 774 Buffer_pPick items; 775 Buffer_char str; // null-terminated c-string 776 WeightedRand wr; 777 }; 778 779 /////////////////////////////////////////////////////////// 780 // 781 // The parser 782 // 783 784 enum TokenType {STRING, VAR, NUMBER, STREAM_END, ERROR, QUESTION, STAR, PLUS, LBRACE, RBRACE, LPAR, RPAR, SEMI, EQ, COMMA, BAR, AT, WAVE, PERCENT}; 785 786 class Scanner{ 787 friend int DumpScanner(Scanner & s, UBool dumb); 788 private: 789 const char * source; 790 const char * working; 791 const char * history; // for debug 792 enum StateType {START, IN_NUM, IN_VAR_FIRST, IN_VAR, IN_QUOTE, IN_QUOTE_BSLASH, IN_BSLASH, IN_STRING, DONE}; 793 StateType state; 794 void terminated(TokenType t){ 795 working--; // return the peeked character 796 tokenType = t; 797 token.append(0); // close buffer 798 state = DONE; 799 } 800 public: 801 // the buffer of "source" is owned by caller 802 Scanner(const char *src/*[in] c-string*/ = NULL):source(src){ 803 working = src; 804 history = working; 805 state = DONE; 806 tokenType = ERROR; 807 } 808 809 //void setSource(const char *const src /*[in] c-string*/){ 810 // *(&const_cast<const char *>(source)) = src; 811 //} 812 813 Buffer_char token; 814 TokenType tokenType; 815 816 TokenType getNextToken(){ 817 token.reset(); 818 state = START; 819 history = working; // for debug 820 while (state != DONE){ 821 char c = *working++; 822 if (c == 0 && state != START){//avoid buffer overflow. for IN_QUOE, IN_ESCAPE 823 terminated(ERROR); 824 break; // while 825 } 826 switch(state){ 827 case START: 828 tokenType = ERROR; 829 switch(c){ 830 case '?' : tokenType = QUESTION; break; 831 case '*' : tokenType = STAR; break; 832 case '+' : tokenType = PLUS; break; 833 case '{' : tokenType = LBRACE; break; 834 case '}' : tokenType = RBRACE; break; 835 case '(' : tokenType = LPAR; break; 836 case ')' : tokenType = RPAR; break; 837 case ';' : tokenType = SEMI; break; 838 case '=' : tokenType = EQ; break; 839 case ',' : tokenType = COMMA; break; 840 case '|' : tokenType = BAR; break; 841 case '@' : tokenType = AT; break; 842 case '~' : tokenType = WAVE; break; 843 case '%' : tokenType = PERCENT; break; 844 case 0 : tokenType = STREAM_END; working-- /*avoid buffer overflow*/; break; 845 } 846 if (tokenType != ERROR){ 847 token.append(c); 848 token.append(0); 849 state = DONE; 850 break; // START 851 } 852 switch(c){ 853 case '$' : state = IN_VAR_FIRST; token.append(c); break; 854 case '\'' : state = IN_QUOTE; break; 855 case '\\' : state = IN_BSLASH; break; 856 default: 857 if (isWhiteSpace(c)){ // state = START; //do nothing 858 } else if (isDigit(c)){ state = IN_NUM; token.append(c); 859 } else if (isAlphabet(c)){ state = IN_STRING; token.append(c); 860 } else {terminated(ERROR);} 861 } 862 break;//START 863 case IN_NUM: 864 if (isDigit(c)){ 865 token.append(c); 866 } else { 867 terminated(NUMBER); 868 } 869 break;//IN_NUM 870 case IN_VAR_FIRST: 871 if (isAlphabet(c)){ 872 token.append(c); 873 state = IN_VAR; 874 } else { 875 terminated(ERROR); 876 } 877 break; // IN_VAR_FISRT 878 case IN_VAR: 879 if (isAlphabet(c) || isDigit(c)){ 880 token.append(c); 881 } else { 882 terminated(VAR); 883 } 884 break;//IN_VAR 885 case IN_STRING: 886 // About the scanner's behavior for STRING, AT, and ESCAPE: 887 // All of them can be contacted with each other. 888 // This means the scanner will eat up as much as possible strings 889 // (STRING, AT, and ESCAPE) at one time, with no regard of their 890 // combining sequence. 891 // 892 if (c == '\''){ 893 state = IN_QUOTE; // the first time we see single quote 894 } else if (c =='\\'){ // back slash character 895 state = IN_BSLASH; 896 } else if (isAlphabet(c) || isDigit(c)){ 897 token.append(c); 898 } else{ 899 terminated(STRING); 900 } 901 break;//IN_STRING 902 case IN_QUOTE: 903 if (c == '\''){ // the second time we see single quote 904 state = IN_STRING; // see document in IN_STRING 905 } else if ( c== '\\') { // backslah escape in quote 906 state = IN_QUOTE_BSLASH; 907 } else { 908 token.append(c); // eat up everything, includes back slash 909 } 910 break;//IN_QUOTE 911 case IN_QUOTE_BSLASH: 912 case IN_BSLASH: 913 switch (c){ 914 case 'n' : token.append('\n'); break; 915 case 'r' : token.append('\r'); break; 916 case 't' : token.append('\t'); break; 917 case '\'' : token.append('\''); break; 918 case '\\' : token.append('\\'); break; 919 default: token.append(c); // unknown escaping, treat it as literal 920 } 921 if (state == IN_BSLASH){ 922 state = IN_STRING; // see document in IN_STRING 923 } else { // state == IN_QUOTE_BSLASH 924 state = IN_QUOTE; 925 } 926 break;//IN_BSLASH 927 case DONE: /* should never happen */ 928 default: 929 working--; 930 tokenType = ERROR; 931 state = DONE; 932 break; 933 }//switch(state) 934 }//while (state != DONE) 935 936 return tokenType; 937 } 938 };//class Scanner 939 940 class Parser{ 941 friend UBool TestParser(); 942 friend class TestParserT; 943 friend class LanguageGenerator_impl; 944 private: 945 Scanner s; 946 TokenType & token; 947 int min_max; // for the evil infinite 948 949 UBool match(TokenType expected){ 950 if (token == expected) { 951 token = s.getNextToken(); 952 return TRUE; 953 } else { 954 //s.dumpCurrentPoint(); 955 return FALSE; 956 } 957 } 958 959 UBool weight(int & value){ 960 if (token == NUMBER){ 961 int temp = atoi(s.token); 962 match(NUMBER); 963 if (match(PERCENT)){ 964 value = temp; 965 return TRUE; 966 } 967 } 968 return FALSE; 969 } 970 971 UBool repeat (Pick* &node /*in,out*/){ 972 if (node == NULL) return FALSE; 973 974 int count = -2; 975 int min = -2; 976 int max = -2; 977 UBool question = FALSE; 978 switch (token){ 979 case QUESTION: 980 match(QUESTION); 981 min = 0; 982 max = 1; 983 count = 2; 984 question = TRUE; 985 break; 986 case STAR: 987 match(STAR); 988 min = 0; 989 max = -1; 990 count = -1; 991 break; 992 case PLUS: 993 match(PLUS); 994 min = 1; 995 max = -1; 996 count = -1; 997 break; 998 case LBRACE: 999 match(LBRACE); 1000 if (token != NUMBER){ 1001 return FALSE; 1002 }else { 1003 min = atoi(s.token); 1004 match(NUMBER); 1005 if (token == RBRACE){ 1006 match(RBRACE); 1007 max = min; 1008 count = 1; 1009 } else if (token == COMMA) { 1010 match(COMMA); 1011 if (token == RBRACE){ 1012 match(RBRACE); 1013 max = -1; 1014 count = -1; 1015 } else if (token == NUMBER) { 1016 max = atoi(s.token); 1017 match(NUMBER); 1018 count = max - min + 1; 1019 if (!match(RBRACE)) { 1020 return FALSE; 1021 } 1022 } else { 1023 return FALSE; 1024 } 1025 } else { 1026 return FALSE; 1027 } 1028 } 1029 break; 1030 default: 1031 return FALSE; 1032 } 1033 1034 if (count == -2 || min == -2 || max == -2){ 1035 //ASSERT(FALSE); 1036 return FALSE; 1037 } 1038 1039 // eat up following weights 1040 Buffer_int weights; 1041 int w; 1042 while (weight(w)){ 1043 weights.append(w); 1044 } 1045 1046 // for the evil infinite 1047 min_max = min_max > min ? min_max : min; 1048 min_max = min_max > max ? min_max : max; 1049 if (min_max > PSEUDO_INFINIT){ 1050 return FALSE; // PSEUDO_INFINIT is less than the real maximum 1051 } 1052 if (max == -1){ // the evil infinite 1053 max = PSEUDO_INFINIT; 1054 } 1055 // for the strange question mark 1056 if (question && weights.content_size() > 0){ 1057 Buffer_int w2; 1058 w2.append(DEFAULT_WEIGHT - weights[0]).append(weights[0]); 1059 node = new Repeat(node,min,max,&w2); 1060 return TRUE; 1061 } 1062 node = new Repeat(node,min,max,&weights); 1063 return TRUE; 1064 } 1065 1066 UBool core(Pick* &node /*out*/){ 1067 if (node != NULL) return FALSE; //assert node == NULL 1068 1069 switch(token){ 1070 case LPAR: 1071 match(LPAR); 1072 if(defination(node) && match(RPAR)){ 1073 return TRUE; 1074 } 1075 return FALSE; 1076 case VAR: 1077 node = new Variable(&symbols, s.token); 1078 match(VAR); 1079 return TRUE; 1080 case STRING: 1081 node = new Literal(s.token); 1082 match(STRING); 1083 return TRUE; 1084 default: 1085 return FALSE; 1086 } 1087 } 1088 UBool modified(Pick* &node /*out*/){ 1089 if (node != NULL) return FALSE; //assert node == NULL 1090 1091 if (!core(node)) { 1092 return FALSE; 1093 } 1094 1095 for (;;){ 1096 switch(token){ 1097 case WAVE: 1098 match(WAVE); 1099 node = new Morph(*node); 1100 break; 1101 case AT: 1102 match(AT); 1103 node = new Quote(*node); 1104 break; 1105 case QUESTION: 1106 case STAR: 1107 case PLUS: 1108 case LBRACE: 1109 if (!repeat(node)) return FALSE; 1110 break; 1111 case SEMI: // rule definiation closed 1112 case RPAR: // within parenthesis (core closed) 1113 case BAR: // in alternation 1114 case NUMBER: // in alternation, with weight 1115 case LPAR: // in sequence 1116 case VAR: // in sequence 1117 case STRING: // in sequence 1118 return TRUE; 1119 default: 1120 return FALSE; 1121 } 1122 } 1123 } 1124 1125 1126 UBool sequence_list(Pick* &node /*in,out*/){ 1127 if (node == NULL) return FALSE; // assert node != NULL 1128 1129 Sequence* seq = new Sequence(); 1130 Pick * n = node; 1131 1132 while (token == VAR || token == STRING || token == LPAR){ 1133 seq->append(n); 1134 n = NULL; 1135 if (modified(n)){ 1136 // go on 1137 } else { 1138 goto FAIL; 1139 } 1140 } 1141 1142 if (token == SEMI || token == RPAR || token == BAR){ 1143 seq->append(n); 1144 node = seq; 1145 return TRUE; 1146 } 1147 FAIL: 1148 delete seq; 1149 return FALSE; 1150 1151 } 1152 1153 UBool sequence(Pick* &node /*out*/){ 1154 if (node != NULL) return FALSE; //assert node == NULL 1155 1156 if (!modified(node)) { 1157 return FALSE; 1158 } 1159 1160 if (token == VAR || token == STRING || token == LPAR){ 1161 return sequence_list(node); 1162 } else { 1163 return TRUE; // just a modified 1164 } 1165 } 1166 1167 UBool alternation_list(Pick* &node /*in,out*/){ 1168 if (node == NULL) return FALSE; // assert node != NULL 1169 1170 Alternation * alt = new Alternation(); 1171 Pick * n = node; 1172 int w = DEFAULT_WEIGHT; 1173 1174 while (token == NUMBER || token == BAR){ 1175 if(token == NUMBER) { 1176 if (weight(w)){ 1177 if (token == BAR){ 1178 // the middle item, go on 1179 } else { 1180 // the last item or encounter error 1181 break; //while 1182 } 1183 } else { 1184 goto FAIL; 1185 } 1186 } // else token == BAR 1187 match(BAR); 1188 alt->append(n,w); 1189 1190 n = NULL; 1191 w = DEFAULT_WEIGHT; 1192 if (sequence(n)){ 1193 // go on 1194 } else { 1195 goto FAIL; 1196 } 1197 } 1198 1199 if (token == SEMI || token == RPAR) { 1200 alt->append(n,w); 1201 node = alt; 1202 return TRUE; 1203 } 1204 FAIL: 1205 delete alt; 1206 return FALSE; 1207 } 1208 1209 UBool alternation(Pick* &node /*out*/){ 1210 if (node != NULL) return FALSE; //assert node == NULL 1211 1212 // 'sequence' has higher precedence than 'alternation' 1213 if (!sequence(node)){ 1214 return FALSE; 1215 } 1216 1217 if (token == BAR || token == NUMBER){ // find a real alternation1, create it. 1218 return alternation_list(node); 1219 } else { 1220 return TRUE; // just a sequence_old 1221 } 1222 } 1223 1224 1225 UBool defination(Pick* &node /*out*/){ 1226 if (node != NULL) return FALSE; //assert node == NULL 1227 return alternation(node); 1228 } 1229 1230 UBool rule(){ 1231 if (token == VAR){ 1232 Buffer_char name; 1233 name.append_array(s.token, strlen(s.token) + 1); 1234 match(VAR); 1235 1236 if (match(EQ)){ 1237 Pick * t = NULL; 1238 if(defination(t)){ 1239 symbols.put(name, t); 1240 return match(SEMI); 1241 } 1242 } 1243 } 1244 return FALSE; 1245 } 1246 public: 1247 UBool rules(){ 1248 symbols.reset(); 1249 token = s.getNextToken(); 1250 while (rule()){ 1251 } 1252 if (token == STREAM_END){ 1253 return TRUE; 1254 } else { 1255 //s.dumpCurrentPoint(); 1256 return FALSE; 1257 } 1258 } 1259 1260 public: 1261 SymbolTable symbols; 1262 1263 Parser(const char *const source):s(source), token(s.tokenType){ 1264 min_max = -2; 1265 } 1266 UBool parse(){ 1267 return rules(); 1268 } 1269 1270 }; // class Parser 1271 1272 1273 /////////////////////////////////////////////////////////// 1274 // 1275 // 1276 // 1277 1278 int DumpScanner(Scanner & s, UBool dump = TRUE){ 1279 int len = strlen(s.source); 1280 int error_start_offset = s.history - s.source; 1281 if (dump){ 1282 printf("\n=================== DumpScanner ================\n"); 1283 fwrite(s.source, len, 1, stdout); 1284 printf("\n-----parsed-------------------------------------\n"); 1285 fwrite(s.source, s.history - s.source, 1, stdout); 1286 printf("\n-----current------------------------------------\n"); 1287 fwrite(s.history, s.working - s.history, 1, stdout); 1288 printf("\n-----unparsed-----------------------------------\n"); 1289 fwrite(s.working, (s.source + len - s.working), 1, stdout); 1290 printf("\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"); 1291 } 1292 return error_start_offset; 1293 } 1294 1295 class LanguageGenerator_impl{ 1296 public: 1297 LanguageGenerator_impl(const char *const bnf_definition, const char *const top_node) 1298 :par(bnf_definition), top_node_name(top_node){ 1299 srand((unsigned)time( NULL )); 1300 } 1301 1302 LanguageGenerator::PARSE_RESULT parseBNF(UBool debug = TRUE){ 1303 if (par.parse()){ 1304 if (par.symbols.find(top_node_name, &top_node_ref) == SymbolTable::HAS_REF) { 1305 if (par.symbols.is_complete()) { 1306 return LanguageGenerator::OK; 1307 } else { 1308 if (debug) printf("The bnf definition is incomplete.\n"); 1309 return LanguageGenerator::INCOMPLETE; 1310 } 1311 } else { 1312 if (debug) printf("No top node is found.\n"); 1313 return LanguageGenerator::NO_TOP_NODE; 1314 } 1315 } else { 1316 if(debug) { 1317 printf("The bnf definition is wrong\n"); 1318 DumpScanner(par.s, TRUE); 1319 } 1320 return LanguageGenerator::BNF_DEF_WRONG; 1321 } 1322 } 1323 const char * next(){ 1324 return top_node_ref->next(); 1325 } 1326 1327 private: 1328 Parser par; 1329 const char *const top_node_name; 1330 Pick * top_node_ref; 1331 }; 1332 1333 LanguageGenerator::LanguageGenerator():lang_gen(NULL){ 1334 } 1335 1336 LanguageGenerator::~LanguageGenerator(){ 1337 delete lang_gen; 1338 } 1339 1340 LanguageGenerator::PARSE_RESULT LanguageGenerator::parseBNF(const char *const bnf_definition /*in*/, const char *const top_node/*in*/, UBool debug){ 1341 if (lang_gen){ 1342 delete lang_gen; 1343 } 1344 lang_gen = new LanguageGenerator_impl(bnf_definition, top_node); 1345 PARSE_RESULT r = lang_gen->parseBNF(debug); 1346 if (r != OK){ 1347 delete lang_gen; 1348 lang_gen = NULL; 1349 return r; 1350 } else { 1351 return r; 1352 } 1353 } 1354 const char *LanguageGenerator::next(){ // Return a null-terminated c-string. The buffer is owned by callee. 1355 if (lang_gen){ 1356 return lang_gen->next(); 1357 }else { 1358 return ""; 1359 } 1360 } 1361 1362 /////////////////////////////////////////////////////////// 1363 // 1364 // The test code for WBNF 1365 // 1366 1367 #define CALL(fun) \ 1368 if (fun()){ \ 1369 printf("Pass: " #fun "\n");\ 1370 } else { \ 1371 printf("FAILED: !!! " #fun " !!!\n"); \ 1372 } 1373 1374 #define DUMP_R(fun, var, times) \ 1375 {printf("\n========= " #fun " =============\n"); \ 1376 for (int i=0; i<times; i++) { \ 1377 const char * t = var.next();\ 1378 fwrite(t,strlen(t),1,stdout); \ 1379 printf("\n"); \ 1380 } \ 1381 printf("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n");} 1382 1383 1384 1385 #if TEST_WBNF_TEST 1386 static UBool TestQuote(){ 1387 const char *const str = "This ' A !,z| qq [] .new\tline"; 1388 //const char *const str_r = "This \\' A '!,'z'|' qq '[]' '.'new\tline"; 1389 //// 1390 //// :( we must quote our string to following C syntax 1391 //// cannot type the literal here, it makes our code rather human unreadable 1392 //// very very unconformable! 1393 //// 1394 ///* 1395 //*/ 1396 1397 //const char *const s1 = "ab'c"; 1398 //const char (* s1_r1) [] = { "ab''c", // ab''c 1399 // "ab\\'c", // ab\'c 1400 // };// 1401 ///* 1402 // . '.' \. 1403 // .. \.\. '.'\. '.'\. '..' // '.''.' wrong 1404 //*/ 1405 1406 //const char *const s2 = "a..'.b"; // a..'.b 1407 //const char (*s2_r) [] = { "a'..''.'b" // a'..''.'b 1408 // ,"a'..\\'.'b" // a'..\'.'b 1409 // ,"a'..'\\''.'b" // a'..'\''.'b 1410 // };// 1411 1412 //const char *const s3 = "a..\\.b"; // a..\.b 1413 //const char (*s3_r) [] = { "a'..\\\\.'b" // a'..\\.'b 1414 // ,"a'..'\\\\'.'b" // a'..'\\'.'b 1415 // };// 1416 1417 // // no catact operation, no choice, must be compact 1418 1419 srand((unsigned)time( NULL )); 1420 1421 //Escaper l(Escaper::NO, Escaper::NO, Escaper::RAND_ESC); 1422 Pick *p = new Literal(str); 1423 Quote q(*p); 1424 1425 DUMP_R(TestQuote, (*p), 1); 1426 DUMP_R(TestQuote, q, 20); 1427 return FALSE; 1428 } 1429 static UBool TestLiteral(){ 1430 const char * s = "test string99."; 1431 Literal n(s); 1432 const char * r = n.next(); 1433 return strcmp(s,r) == 0; 1434 } 1435 1436 static UBool TestSequence(){ 1437 Sequence seq; 1438 seq.append(new Literal("abc ")); 1439 seq.append(new Literal(", s")); 1440 1441 return strcmp(seq.next(), "abc , s") == 0; 1442 } 1443 static UBool TestAlternation(){ 1444 srand((unsigned)time( NULL )); 1445 Alternation alt; 1446 alt.append(new Literal("aaa_10%"),10); 1447 alt.append(new Literal("bbb_0%"),0); 1448 alt.append(new Literal("ccc_10%"),10); 1449 alt.append(new Literal("ddddddd_50%"),50); 1450 1451 DUMP_R(TestAlternation, alt, 50); 1452 1453 return FALSE; 1454 } 1455 1456 static UBool TestBuffer(){ 1457 Buffer_int t; 1458 t.append(1).append(0).append(5); 1459 int s = t.content_size(); 1460 for (int i=0; i<s; ++i){ 1461 printf("%d\n", t[i]); 1462 } 1463 return FALSE; 1464 } 1465 1466 static UBool TestWeightedRand(){ 1467 srand((unsigned)time( NULL )); 1468 Buffer_int t; 1469 t.append(1).append(0).append(5); 1470 WeightedRand wr(&Buffer_int().append(10).append(0).append(50),4); 1471 // WeightedRand wr(&t,3); 1472 for (int i=0; i< 50; ++i){ 1473 printf("%d\n", wr.next()); 1474 } 1475 return FALSE; 1476 } 1477 1478 static UBool TestRepeat(){ 1479 srand((unsigned)time( NULL )); 1480 Repeat rep(new Literal("aaa1-5 "), 1, 5); 1481 DUMP_R(TestRepeat, rep, 50); 1482 1483 Repeat r2(new Literal("b{1,3}1%0%5% "), 1, 3, &Buffer_int().append(1).append(0).append(5)); 1484 DUMP_R(TestRepeat, r2, 50); 1485 1486 Repeat r3(new Literal("aaa5-5 "), 5, 5); 1487 DUMP_R(TestRepeat, r3, 50); 1488 1489 return FALSE; 1490 } 1491 1492 static UBool TestVariable(){ 1493 SymbolTable tab; 1494 Pick * value = new Literal("string1"); 1495 Variable var1(&tab, "x", value); 1496 1497 Variable var2(&tab, "y"); 1498 // tab.put(var2, value); // TOFIX: point alias/recursion problem 1499 Pick * value2 = new Literal("string2"); 1500 tab.put(var2, value2); 1501 1502 Pick * value3 = new Literal("string3"); 1503 Variable var3(&tab, "z"); 1504 tab.put("z", value3); 1505 1506 UBool pass; 1507 pass = strcmp(var1.next(), value->next()) == 0; 1508 pass = pass && strcmp(var2.next(), value2->next()) == 0; 1509 pass = pass && strcmp(var3.next(), value3->next()) == 0; 1510 return pass; 1511 } 1512 1513 static UBool TestSymbolTable(){ 1514 Literal * n1 = new Literal("string1"); 1515 Literal * n2 = new Literal("string2"); 1516 SymbolTable t; 1517 t.put("abc", n1); 1518 t.put("$aaa", n2); 1519 // t.put("alias", n1); // TOFIX: point alias/recursion problem 1520 t.put("bbb"); 1521 1522 UBool pass; 1523 pass = t.find(NULL) == SymbolTable::EMPTY; 1524 pass = pass && t.find("ccc") == SymbolTable::NO_VAR; 1525 pass = pass && t.find("bbb") == SymbolTable::NO_REF; 1526 pass = pass && t.find("abc") == SymbolTable::HAS_REF; 1527 pass = pass && t.find("$aaa") == SymbolTable::HAS_REF; 1528 1529 t.reset(); 1530 pass = pass && t.find("abc") == SymbolTable::NO_VAR; 1531 return pass; 1532 } 1533 1534 1535 static UBool TestScanner(void){ 1536 //const char str1[] = "$root = $command{0,5} $reset $mostRules{1,20};"; 1537 //const char str1_r[][20] = {"$root", "=", "$command", "{", "0", ",", "5", "}", 1538 // "$reset", "$mostRules", "{", "1", ",", "20", "}", ";"}; 1539 1540 const char str2[] = "$p2 =(\\\\ $s $string $s)? 25%;"; 1541 const char str2_r[][20] = {"$p2", "=", "(", "\\", "$s", "$string", "$s", ")", "?", "25", "%", ";"}; 1542 1543 const char *str = str2; 1544 const char (*str_r)[20] = str2_r; 1545 int tokenNum = sizeof(str2_r)/sizeof(char[20]); 1546 1547 Scanner t(str); 1548 UBool pass = TRUE; 1549 t.getNextToken(); 1550 int i = 0; 1551 while (pass){ 1552 if (t.tokenType == STREAM_END){ 1553 pass = pass? i == tokenNum : FALSE; 1554 break;//while 1555 } else if (t.tokenType == ERROR){ 1556 pass = FALSE; 1557 break;//while 1558 } else { 1559 pass = strcmp( &(t.token[0]), str_r[i++]) == 0; 1560 t.getNextToken(); 1561 } 1562 } 1563 1564 //const char ts[] = "$commandList = '['" 1565 //" ( alternate ' ' $alternateOptions" 1566 //" | backwards ' 2'" 1567 //" | normalization ' ' $onoff " 1568 //" | caseLevel ' ' $onoff " 1569 //" | hiraganaQ ' ' $onoff" 1570 //" | caseFirst ' ' $caseFirstOptions" 1571 //" | strength ' ' $strengthOptions" 1572 //" ) ']';" ; 1573 1574 //Scanner t2(ts); 1575 //pass = TRUE; 1576 //do { 1577 // t2.getNextToken(); 1578 // if (t2.tokenType == ERROR){ 1579 // DumpScanner(t2); 1580 // return FALSE; 1581 // } 1582 //}while (t.tokenType != STREAM_END); 1583 1584 return pass; 1585 } 1586 1587 class TestParserT { 1588 public: 1589 UBool operator () (const char *const str, const int exp_error_offset = -1, const UBool dump = TRUE){ 1590 Parser par(str); 1591 if (par.rules()){ 1592 if ( exp_error_offset == -1){ 1593 return TRUE; 1594 }else { 1595 DumpScanner(par.s,dump); 1596 return FALSE; 1597 } 1598 }else { 1599 return DumpScanner(par.s, dump) == exp_error_offset; 1600 } 1601 } 1602 }; 1603 1604 UBool TestParser(){ 1605 TestParserT test; 1606 1607 UBool pass = TRUE; 1608 pass = pass && test ("$s = ' ' ? 50%;"); 1609 pass = pass && test("$x = ($var {1,2}) 3%;"); // legal 1610 pass = pass && test("$x = $var {1,2} 3% | b 4%;"); // legal 1611 pass = pass && test("$x = $var {1,2} 3%;"); // legal 1612 pass = pass && test("$m = $c ? 2% 4% | $r 5% | $n 25%;"); // legal 1613 pass = pass && test("$a = b ? 2% | c 5%;"); // legal 1614 pass = pass && test("$x = A B 5% C 10% | D;", 8, FALSE); // illegal 5% 1615 pass = pass && test("$x = aa 45% | bb 5% cc;", 19, FALSE);// illegal cc 1616 pass = pass && test("$x = (b 5%) (c 6%);"); // legal 1617 pass = pass && test("$x = (b 5%) c 6%;", 13, FALSE); // illegal 6% 1618 pass = pass && test("$x = b 5% (c 6%);", 9, FALSE); // illegal (c 6%) 1619 pass = pass && test("$x = b 5% c 6%;", 9, FALSE); // illegal c 6% 1620 pass = pass && test("$x = b 5%;"); // legal 1621 pass = pass && test("$x = aa 45% | bb 5% cc;", 19, FALSE);// illegal cc 1622 pass = pass && test("$x = a | b | c 4% | d 5%;"); // legal 1623 pass = pass && test("$s = ' ' ? 50% abc;"); // legal 1624 pass = pass && test("$s = a | c d | e f;"); // legal 1625 pass = pass && test( "$z = q 0% | p 1% | r 100%;"); // legal How to check parsed tree?? 1626 1627 pass = pass && test("$s = ' ' ? 50%;"); 1628 pass = pass && test("$relationList = '<' | '<<' | ';' | '<<<' | ',' | '=';"); 1629 pass = pass && test("$p1 = ($string $s '|' $s)? 25%;"); 1630 pass = pass && test("$p2 = (\\\\ $s $string $s)? 25%;"); 1631 pass = pass && test("$rel2 = $p1 $string $s $p2;"); 1632 pass = pass && test("$relation = $relationList $s ($rel1 | $rel2) $crlf;"); 1633 pass = pass && test("$command = $commandList $crlf;"); 1634 pass = pass && test("$reset = '&' $s ($beforeList $s)? 10% ($positionList 100% | $string 10%) $crlf;"); 1635 pass = pass && test("$mostRules = $command 1% | $reset 5% | $relation 25%;"); 1636 pass = pass && test("$root = $command{0,5} $reset $mostRules{1,20};"); 1637 1638 const char collationBNF[] = 1639 "$s = ' '? 50%;" 1640 "$crlf = '\r\n';" 1641 1642 "$alternateOptions = non'-'ignorable | shifted;" 1643 "$onoff = on | off;" 1644 "$caseFirstOptions = off | upper | lower;" 1645 "$strengthOptions = '1' | '2' | '3' | '4' | 'I';" 1646 "$commandList = '['" 1647 " ( alternate ' ' $alternateOptions" 1648 " | backwards ' 2'" 1649 " | normalization ' ' $onoff " 1650 " | caseLevel ' ' $onoff " 1651 " | hiraganaQ ' ' $onoff" 1652 " | caseFirst ' ' $caseFirstOptions" 1653 " | strength ' ' $strengthOptions" 1654 " ) ']';" 1655 "$command = $commandList $crlf;" 1656 1657 "$ignorableTypes = (tertiary | secondary | primary) ' ' ignorable;" 1658 "$allTypes = variable | regular | implicit | trailing | $ignorableTypes;" 1659 "$positionList = '[' (first | last) ' ' $allTypes ']';" 1660 1661 "$beforeList = '[before ' ('1' | '2' | '3') ']';" 1662 1663 "$relationList = (" 1664 " '<'" 1665 " | '<<'" 1666 " | ';'" 1667 " | '<<<'" 1668 " | ','" 1669 " | '='" 1670 ");" 1671 "$string = $magic;" 1672 "$rel1 = '[variable top]' $s;" 1673 "$p1 = ($string $s '|' $s)? 25%;" 1674 "$p2 = (\\\\ $s $string $s)? 25%;" 1675 "$rel2 = $p1 $string $s $p2;" 1676 "$relation = $relationList $s ($rel1 | $rel2) $crlf;" 1677 1678 "$reset = '&' $s ($beforeList $s)? 10% ($positionList 1% | $string 10%) $crlf;" 1679 "$mostRules = $command 1% | $reset 5% | $relation 25%;" 1680 "$root = $command{0,5} $reset $mostRules{1,20};" 1681 ; 1682 1683 pass = pass && test(collationBNF); 1684 1685 1686 return pass; 1687 } 1688 1689 static UBool TestMorph(){ 1690 srand((unsigned)time( NULL )); 1691 1692 Alternation * alt = new Alternation(); 1693 1694 (*alt) 1695 .append(new Literal("a")).append(new Literal("b")).append(new Literal("c")) 1696 .append(new Literal("d")).append(new Literal("e")).append(new Literal("f")) 1697 .append(new Literal("g")).append(new Literal("h")).append(new Literal("i")) 1698 .append(new Literal("j")).append(new Literal("k")).append(new Literal("l")) 1699 .append(new Literal("m")).append(new Literal("n")).append(new Literal("o")) 1700 ; 1701 1702 Repeat * rep = new Repeat( alt ,5,5 ); 1703 Morph m( *rep); 1704 1705 // DUMP_R(TestMorph,(*rep),20); 1706 DUMP_R(TestMorph,m,100); 1707 1708 return FALSE; 1709 } 1710 1711 #endif 1712 1713 static UBool TestLanguageGenerator(){ 1714 //LanguageGenerator g; 1715 //const char *const s = "$s = p 0% | q 1%;"; 1716 //g.parseBNF(s, "$s"); 1717 UBool pass; 1718 //= strcmp("q", g.next()) == 0; 1719 1720 const char *const def = 1721 //"$a = $b;" 1722 //"$b = $c;" 1723 //"$c = $t;" 1724 //"$t = abc $z{1,2};" 1725 //"$k = a | b | c | d | e | f | g ;" 1726 //"$z = q 0% | p 1% | r 1%;" 1727 "$x = a ? 0%;" 1728 ; // end of string 1729 // const char * s = "abczz"; 1730 // 1731 // 1732 LanguageGenerator g; 1733 pass = g.parseBNF(def, "$x",TRUE); 1734 //// LanguageGenerator g(collationBNF, "$root", "$magic", new MagicNode()); 1735 // 1736 if (pass != LanguageGenerator::OK) return FALSE; 1737 1738 DUMP_R(TestLanguageGenerator, g, 20); 1739 return pass; 1740 1741 ////UBool pass = strcmp(s,r) == 0; 1742 1743 //if (pass){ 1744 // printf("TestRandomLanguageGenerator passed.\n"); 1745 //} else { 1746 // printf("TestRandomLanguageGenerator FAILED!!!\n"); 1747 //} 1748 //return pass; 1749 } 1750 1751 void TestWbnf(void){ 1752 srand((unsigned)time( NULL )); 1753 1754 //CALL(TestLiteral); 1755 //CALL(TestSequence); 1756 //CALL(TestSymbolTable); 1757 //CALL(TestVariable); 1758 1759 //TestRepeat(); 1760 //TestAlternation(); 1761 //TestMorph(); 1762 1763 //TestQuote(); 1764 //TestBuffer(); 1765 //TestWeightedRand(); 1766 1767 //CALL(TestScanner); 1768 //CALL(TestParser); 1769 CALL(TestLanguageGenerator); 1770 } 1771 1772