1 2 /* Parser implementation */ 3 4 /* For a description, see the comments at end of this file */ 5 6 /* XXX To do: error recovery */ 7 8 #include "Python.h" 9 #include "pgenheaders.h" 10 #include "token.h" 11 #include "grammar.h" 12 #include "node.h" 13 #include "parser.h" 14 #include "errcode.h" 15 16 17 #ifdef Py_DEBUG 18 extern int Py_DebugFlag; 19 #define D(x) if (!Py_DebugFlag); else x 20 #else 21 #define D(x) 22 #endif 23 24 25 /* STACK DATA TYPE */ 26 27 static void s_reset(stack *); 28 29 static void 30 s_reset(stack *s) 31 { 32 s->s_top = &s->s_base[MAXSTACK]; 33 } 34 35 #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK]) 36 37 static int 38 s_push(stack *s, dfa *d, node *parent) 39 { 40 stackentry *top; 41 if (s->s_top == s->s_base) { 42 fprintf(stderr, "s_push: parser stack overflow\n"); 43 return E_NOMEM; 44 } 45 top = --s->s_top; 46 top->s_dfa = d; 47 top->s_parent = parent; 48 top->s_state = 0; 49 return 0; 50 } 51 52 #ifdef Py_DEBUG 53 54 static void 55 s_pop(stack *s) 56 { 57 if (s_empty(s)) 58 Py_FatalError("s_pop: parser stack underflow -- FATAL"); 59 s->s_top++; 60 } 61 62 #else /* !Py_DEBUG */ 63 64 #define s_pop(s) (s)->s_top++ 65 66 #endif 67 68 69 /* PARSER CREATION */ 70 71 parser_state * 72 PyParser_New(grammar *g, int start) 73 { 74 parser_state *ps; 75 76 if (!g->g_accel) 77 PyGrammar_AddAccelerators(g); 78 ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state)); 79 if (ps == NULL) 80 return NULL; 81 ps->p_grammar = g; 82 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD 83 ps->p_flags = 0; 84 #endif 85 ps->p_tree = PyNode_New(start); 86 if (ps->p_tree == NULL) { 87 PyMem_FREE(ps); 88 return NULL; 89 } 90 s_reset(&ps->p_stack); 91 (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree); 92 return ps; 93 } 94 95 void 96 PyParser_Delete(parser_state *ps) 97 { 98 /* NB If you want to save the parse tree, 99 you must set p_tree to NULL before calling delparser! */ 100 PyNode_Free(ps->p_tree); 101 PyMem_FREE(ps); 102 } 103 104 105 /* PARSER STACK OPERATIONS */ 106 107 static int 108 shift(stack *s, int type, char *str, int newstate, int lineno, int col_offset) 109 { 110 int err; 111 assert(!s_empty(s)); 112 err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset); 113 if (err) 114 return err; 115 s->s_top->s_state = newstate; 116 return 0; 117 } 118 119 static int 120 push(stack *s, int type, dfa *d, int newstate, int lineno, int col_offset) 121 { 122 int err; 123 node *n; 124 n = s->s_top->s_parent; 125 assert(!s_empty(s)); 126 err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset); 127 if (err) 128 return err; 129 s->s_top->s_state = newstate; 130 return s_push(s, d, CHILD(n, NCH(n)-1)); 131 } 132 133 134 /* PARSER PROPER */ 135 136 static int 137 classify(parser_state *ps, int type, const char *str) 138 { 139 grammar *g = ps->p_grammar; 140 int n = g->g_ll.ll_nlabels; 141 142 if (type == NAME) { 143 label *l = g->g_ll.ll_label; 144 int i; 145 for (i = n; i > 0; i--, l++) { 146 if (l->lb_type != NAME || l->lb_str == NULL || 147 l->lb_str[0] != str[0] || 148 strcmp(l->lb_str, str) != 0) 149 continue; 150 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD 151 #if 0 152 /* Leaving this in as an example */ 153 if (!(ps->p_flags & CO_FUTURE_WITH_STATEMENT)) { 154 if (str[0] == 'w' && strcmp(str, "with") == 0) 155 break; /* not a keyword yet */ 156 else if (str[0] == 'a' && strcmp(str, "as") == 0) 157 break; /* not a keyword yet */ 158 } 159 #endif 160 #endif 161 D(printf("It's a keyword\n")); 162 return n - i; 163 } 164 } 165 166 { 167 label *l = g->g_ll.ll_label; 168 int i; 169 for (i = n; i > 0; i--, l++) { 170 if (l->lb_type == type && l->lb_str == NULL) { 171 D(printf("It's a token we know\n")); 172 return n - i; 173 } 174 } 175 } 176 177 D(printf("Illegal token\n")); 178 return -1; 179 } 180 181 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD 182 #if 0 183 /* Leaving this in as an example */ 184 static void 185 future_hack(parser_state *ps) 186 { 187 node *n = ps->p_stack.s_top->s_parent; 188 node *ch, *cch; 189 int i; 190 191 /* from __future__ import ..., must have at least 4 children */ 192 n = CHILD(n, 0); 193 if (NCH(n) < 4) 194 return; 195 ch = CHILD(n, 0); 196 if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0) 197 return; 198 ch = CHILD(n, 1); 199 if (NCH(ch) == 1 && STR(CHILD(ch, 0)) && 200 strcmp(STR(CHILD(ch, 0)), "__future__") != 0) 201 return; 202 ch = CHILD(n, 3); 203 /* ch can be a star, a parenthesis or import_as_names */ 204 if (TYPE(ch) == STAR) 205 return; 206 if (TYPE(ch) == LPAR) 207 ch = CHILD(n, 4); 208 209 for (i = 0; i < NCH(ch); i += 2) { 210 cch = CHILD(ch, i); 211 if (NCH(cch) >= 1 && TYPE(CHILD(cch, 0)) == NAME) { 212 char *str_ch = STR(CHILD(cch, 0)); 213 if (strcmp(str_ch, FUTURE_WITH_STATEMENT) == 0) { 214 ps->p_flags |= CO_FUTURE_WITH_STATEMENT; 215 } else if (strcmp(str_ch, FUTURE_PRINT_FUNCTION) == 0) { 216 ps->p_flags |= CO_FUTURE_PRINT_FUNCTION; 217 } else if (strcmp(str_ch, FUTURE_UNICODE_LITERALS) == 0) { 218 ps->p_flags |= CO_FUTURE_UNICODE_LITERALS; 219 } 220 } 221 } 222 } 223 #endif 224 #endif /* future keyword */ 225 226 int 227 PyParser_AddToken(parser_state *ps, int type, char *str, 228 int lineno, int col_offset, int *expected_ret) 229 { 230 int ilabel; 231 int err; 232 233 D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str)); 234 235 /* Find out which label this token is */ 236 ilabel = classify(ps, type, str); 237 if (ilabel < 0) 238 return E_SYNTAX; 239 240 /* Loop until the token is shifted or an error occurred */ 241 for (;;) { 242 /* Fetch the current dfa and state */ 243 dfa *d = ps->p_stack.s_top->s_dfa; 244 state *s = &d->d_state[ps->p_stack.s_top->s_state]; 245 246 D(printf(" DFA '%s', state %d:", 247 d->d_name, ps->p_stack.s_top->s_state)); 248 249 /* Check accelerator */ 250 if (s->s_lower <= ilabel && ilabel < s->s_upper) { 251 int x = s->s_accel[ilabel - s->s_lower]; 252 if (x != -1) { 253 if (x & (1<<7)) { 254 /* Push non-terminal */ 255 int nt = (x >> 8) + NT_OFFSET; 256 int arrow = x & ((1<<7)-1); 257 dfa *d1 = PyGrammar_FindDFA( 258 ps->p_grammar, nt); 259 if ((err = push(&ps->p_stack, nt, d1, 260 arrow, lineno, col_offset)) > 0) { 261 D(printf(" MemError: push\n")); 262 return err; 263 } 264 D(printf(" Push ...\n")); 265 continue; 266 } 267 268 /* Shift the token */ 269 if ((err = shift(&ps->p_stack, type, str, 270 x, lineno, col_offset)) > 0) { 271 D(printf(" MemError: shift.\n")); 272 return err; 273 } 274 D(printf(" Shift.\n")); 275 /* Pop while we are in an accept-only state */ 276 while (s = &d->d_state 277 [ps->p_stack.s_top->s_state], 278 s->s_accept && s->s_narcs == 1) { 279 D(printf(" DFA '%s', state %d: " 280 "Direct pop.\n", 281 d->d_name, 282 ps->p_stack.s_top->s_state)); 283 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD 284 #if 0 285 if (d->d_name[0] == 'i' && 286 strcmp(d->d_name, 287 "import_stmt") == 0) 288 future_hack(ps); 289 #endif 290 #endif 291 s_pop(&ps->p_stack); 292 if (s_empty(&ps->p_stack)) { 293 D(printf(" ACCEPT.\n")); 294 return E_DONE; 295 } 296 d = ps->p_stack.s_top->s_dfa; 297 } 298 return E_OK; 299 } 300 } 301 302 if (s->s_accept) { 303 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD 304 #if 0 305 if (d->d_name[0] == 'i' && 306 strcmp(d->d_name, "import_stmt") == 0) 307 future_hack(ps); 308 #endif 309 #endif 310 /* Pop this dfa and try again */ 311 s_pop(&ps->p_stack); 312 D(printf(" Pop ...\n")); 313 if (s_empty(&ps->p_stack)) { 314 D(printf(" Error: bottom of stack.\n")); 315 return E_SYNTAX; 316 } 317 continue; 318 } 319 320 /* Stuck, report syntax error */ 321 D(printf(" Error.\n")); 322 if (expected_ret) { 323 if (s->s_lower == s->s_upper - 1) { 324 /* Only one possible expected token */ 325 *expected_ret = ps->p_grammar-> 326 g_ll.ll_label[s->s_lower].lb_type; 327 } 328 else 329 *expected_ret = -1; 330 } 331 return E_SYNTAX; 332 } 333 } 334 335 336 #ifdef Py_DEBUG 337 338 /* DEBUG OUTPUT */ 339 340 void 341 dumptree(grammar *g, node *n) 342 { 343 int i; 344 345 if (n == NULL) 346 printf("NIL"); 347 else { 348 label l; 349 l.lb_type = TYPE(n); 350 l.lb_str = STR(n); 351 printf("%s", PyGrammar_LabelRepr(&l)); 352 if (ISNONTERMINAL(TYPE(n))) { 353 printf("("); 354 for (i = 0; i < NCH(n); i++) { 355 if (i > 0) 356 printf(","); 357 dumptree(g, CHILD(n, i)); 358 } 359 printf(")"); 360 } 361 } 362 } 363 364 void 365 showtree(grammar *g, node *n) 366 { 367 int i; 368 369 if (n == NULL) 370 return; 371 if (ISNONTERMINAL(TYPE(n))) { 372 for (i = 0; i < NCH(n); i++) 373 showtree(g, CHILD(n, i)); 374 } 375 else if (ISTERMINAL(TYPE(n))) { 376 printf("%s", _PyParser_TokenNames[TYPE(n)]); 377 if (TYPE(n) == NUMBER || TYPE(n) == NAME) 378 printf("(%s)", STR(n)); 379 printf(" "); 380 } 381 else 382 printf("? "); 383 } 384 385 void 386 printtree(parser_state *ps) 387 { 388 if (Py_DebugFlag) { 389 printf("Parse tree:\n"); 390 dumptree(ps->p_grammar, ps->p_tree); 391 printf("\n"); 392 printf("Tokens:\n"); 393 showtree(ps->p_grammar, ps->p_tree); 394 printf("\n"); 395 } 396 printf("Listing:\n"); 397 PyNode_ListTree(ps->p_tree); 398 printf("\n"); 399 } 400 401 #endif /* Py_DEBUG */ 402 403 /* 404 405 Description 406 ----------- 407 408 The parser's interface is different than usual: the function addtoken() 409 must be called for each token in the input. This makes it possible to 410 turn it into an incremental parsing system later. The parsing system 411 constructs a parse tree as it goes. 412 413 A parsing rule is represented as a Deterministic Finite-state Automaton 414 (DFA). A node in a DFA represents a state of the parser; an arc represents 415 a transition. Transitions are either labeled with terminal symbols or 416 with non-terminals. When the parser decides to follow an arc labeled 417 with a non-terminal, it is invoked recursively with the DFA representing 418 the parsing rule for that as its initial state; when that DFA accepts, 419 the parser that invoked it continues. The parse tree constructed by the 420 recursively called parser is inserted as a child in the current parse tree. 421 422 The DFA's can be constructed automatically from a more conventional 423 language description. An extended LL(1) grammar (ELL(1)) is suitable. 424 Certain restrictions make the parser's life easier: rules that can produce 425 the empty string should be outlawed (there are other ways to put loops 426 or optional parts in the language). To avoid the need to construct 427 FIRST sets, we can require that all but the last alternative of a rule 428 (really: arc going out of a DFA's state) must begin with a terminal 429 symbol. 430 431 As an example, consider this grammar: 432 433 expr: term (OP term)* 434 term: CONSTANT | '(' expr ')' 435 436 The DFA corresponding to the rule for expr is: 437 438 ------->.---term-->.-------> 439 ^ | 440 | | 441 \----OP----/ 442 443 The parse tree generated for the input a+b is: 444 445 (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b))) 446 447 */ 448