Home | History | Annotate | Download | only in Parser
      1 
      2 /* Parser implementation */
      3 
      4 /* For a description, see the comments at end of this file */
      5 
      6 /* XXX To do: error recovery */
      7 
      8 #include "Python.h"
      9 #include "pgenheaders.h"
     10 #include "token.h"
     11 #include "grammar.h"
     12 #include "node.h"
     13 #include "parser.h"
     14 #include "errcode.h"
     15 
     16 
     17 #ifdef Py_DEBUG
     18 extern int Py_DebugFlag;
     19 #define D(x) if (!Py_DebugFlag); else x
     20 #else
     21 #define D(x)
     22 #endif
     23 
     24 
     25 /* STACK DATA TYPE */
     26 
     27 static void s_reset(stack *);
     28 
     29 static void
     30 s_reset(stack *s)
     31 {
     32     s->s_top = &s->s_base[MAXSTACK];
     33 }
     34 
     35 #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
     36 
     37 static int
     38 s_push(stack *s, dfa *d, node *parent)
     39 {
     40     stackentry *top;
     41     if (s->s_top == s->s_base) {
     42         fprintf(stderr, "s_push: parser stack overflow\n");
     43         return E_NOMEM;
     44     }
     45     top = --s->s_top;
     46     top->s_dfa = d;
     47     top->s_parent = parent;
     48     top->s_state = 0;
     49     return 0;
     50 }
     51 
     52 #ifdef Py_DEBUG
     53 
     54 static void
     55 s_pop(stack *s)
     56 {
     57     if (s_empty(s))
     58         Py_FatalError("s_pop: parser stack underflow -- FATAL");
     59     s->s_top++;
     60 }
     61 
     62 #else /* !Py_DEBUG */
     63 
     64 #define s_pop(s) (s)->s_top++
     65 
     66 #endif
     67 
     68 
     69 /* PARSER CREATION */
     70 
     71 parser_state *
     72 PyParser_New(grammar *g, int start)
     73 {
     74     parser_state *ps;
     75 
     76     if (!g->g_accel)
     77         PyGrammar_AddAccelerators(g);
     78     ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state));
     79     if (ps == NULL)
     80         return NULL;
     81     ps->p_grammar = g;
     82 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
     83     ps->p_flags = 0;
     84 #endif
     85     ps->p_tree = PyNode_New(start);
     86     if (ps->p_tree == NULL) {
     87         PyMem_FREE(ps);
     88         return NULL;
     89     }
     90     s_reset(&ps->p_stack);
     91     (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree);
     92     return ps;
     93 }
     94 
     95 void
     96 PyParser_Delete(parser_state *ps)
     97 {
     98     /* NB If you want to save the parse tree,
     99        you must set p_tree to NULL before calling delparser! */
    100     PyNode_Free(ps->p_tree);
    101     PyMem_FREE(ps);
    102 }
    103 
    104 
    105 /* PARSER STACK OPERATIONS */
    106 
    107 static int
    108 shift(stack *s, int type, char *str, int newstate, int lineno, int col_offset)
    109 {
    110     int err;
    111     assert(!s_empty(s));
    112     err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset);
    113     if (err)
    114         return err;
    115     s->s_top->s_state = newstate;
    116     return 0;
    117 }
    118 
    119 static int
    120 push(stack *s, int type, dfa *d, int newstate, int lineno, int col_offset)
    121 {
    122     int err;
    123     node *n;
    124     n = s->s_top->s_parent;
    125     assert(!s_empty(s));
    126     err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset);
    127     if (err)
    128         return err;
    129     s->s_top->s_state = newstate;
    130     return s_push(s, d, CHILD(n, NCH(n)-1));
    131 }
    132 
    133 
    134 /* PARSER PROPER */
    135 
    136 static int
    137 classify(parser_state *ps, int type, const char *str)
    138 {
    139     grammar *g = ps->p_grammar;
    140     int n = g->g_ll.ll_nlabels;
    141 
    142     if (type == NAME) {
    143         label *l = g->g_ll.ll_label;
    144         int i;
    145         for (i = n; i > 0; i--, l++) {
    146             if (l->lb_type != NAME || l->lb_str == NULL ||
    147                 l->lb_str[0] != str[0] ||
    148                 strcmp(l->lb_str, str) != 0)
    149                 continue;
    150 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
    151 #if 0
    152             /* Leaving this in as an example */
    153             if (!(ps->p_flags & CO_FUTURE_WITH_STATEMENT)) {
    154                 if (str[0] == 'w' && strcmp(str, "with") == 0)
    155                     break; /* not a keyword yet */
    156                 else if (str[0] == 'a' && strcmp(str, "as") == 0)
    157                     break; /* not a keyword yet */
    158             }
    159 #endif
    160 #endif
    161             D(printf("It's a keyword\n"));
    162             return n - i;
    163         }
    164     }
    165 
    166     {
    167         label *l = g->g_ll.ll_label;
    168         int i;
    169         for (i = n; i > 0; i--, l++) {
    170             if (l->lb_type == type && l->lb_str == NULL) {
    171                 D(printf("It's a token we know\n"));
    172                 return n - i;
    173             }
    174         }
    175     }
    176 
    177     D(printf("Illegal token\n"));
    178     return -1;
    179 }
    180 
    181 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
    182 #if 0
    183 /* Leaving this in as an example */
    184 static void
    185 future_hack(parser_state *ps)
    186 {
    187     node *n = ps->p_stack.s_top->s_parent;
    188     node *ch, *cch;
    189     int i;
    190 
    191     /* from __future__ import ..., must have at least 4 children */
    192     n = CHILD(n, 0);
    193     if (NCH(n) < 4)
    194         return;
    195     ch = CHILD(n, 0);
    196     if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0)
    197         return;
    198     ch = CHILD(n, 1);
    199     if (NCH(ch) == 1 && STR(CHILD(ch, 0)) &&
    200         strcmp(STR(CHILD(ch, 0)), "__future__") != 0)
    201         return;
    202     ch = CHILD(n, 3);
    203     /* ch can be a star, a parenthesis or import_as_names */
    204     if (TYPE(ch) == STAR)
    205         return;
    206     if (TYPE(ch) == LPAR)
    207         ch = CHILD(n, 4);
    208 
    209     for (i = 0; i < NCH(ch); i += 2) {
    210         cch = CHILD(ch, i);
    211         if (NCH(cch) >= 1 && TYPE(CHILD(cch, 0)) == NAME) {
    212             char *str_ch = STR(CHILD(cch, 0));
    213             if (strcmp(str_ch, FUTURE_WITH_STATEMENT) == 0) {
    214                 ps->p_flags |= CO_FUTURE_WITH_STATEMENT;
    215             } else if (strcmp(str_ch, FUTURE_PRINT_FUNCTION) == 0) {
    216                 ps->p_flags |= CO_FUTURE_PRINT_FUNCTION;
    217             } else if (strcmp(str_ch, FUTURE_UNICODE_LITERALS) == 0) {
    218                 ps->p_flags |= CO_FUTURE_UNICODE_LITERALS;
    219             }
    220         }
    221     }
    222 }
    223 #endif
    224 #endif /* future keyword */
    225 
    226 int
    227 PyParser_AddToken(parser_state *ps, int type, char *str,
    228                   int lineno, int col_offset, int *expected_ret)
    229 {
    230     int ilabel;
    231     int err;
    232 
    233     D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));
    234 
    235     /* Find out which label this token is */
    236     ilabel = classify(ps, type, str);
    237     if (ilabel < 0)
    238         return E_SYNTAX;
    239 
    240     /* Loop until the token is shifted or an error occurred */
    241     for (;;) {
    242         /* Fetch the current dfa and state */
    243         dfa *d = ps->p_stack.s_top->s_dfa;
    244         state *s = &d->d_state[ps->p_stack.s_top->s_state];
    245 
    246         D(printf(" DFA '%s', state %d:",
    247             d->d_name, ps->p_stack.s_top->s_state));
    248 
    249         /* Check accelerator */
    250         if (s->s_lower <= ilabel && ilabel < s->s_upper) {
    251             int x = s->s_accel[ilabel - s->s_lower];
    252             if (x != -1) {
    253                 if (x & (1<<7)) {
    254                     /* Push non-terminal */
    255                     int nt = (x >> 8) + NT_OFFSET;
    256                     int arrow = x & ((1<<7)-1);
    257                     dfa *d1 = PyGrammar_FindDFA(
    258                         ps->p_grammar, nt);
    259                     if ((err = push(&ps->p_stack, nt, d1,
    260                         arrow, lineno, col_offset)) > 0) {
    261                         D(printf(" MemError: push\n"));
    262                         return err;
    263                     }
    264                     D(printf(" Push ...\n"));
    265                     continue;
    266                 }
    267 
    268                 /* Shift the token */
    269                 if ((err = shift(&ps->p_stack, type, str,
    270                                 x, lineno, col_offset)) > 0) {
    271                     D(printf(" MemError: shift.\n"));
    272                     return err;
    273                 }
    274                 D(printf(" Shift.\n"));
    275                 /* Pop while we are in an accept-only state */
    276                 while (s = &d->d_state
    277                                 [ps->p_stack.s_top->s_state],
    278                     s->s_accept && s->s_narcs == 1) {
    279                     D(printf("  DFA '%s', state %d: "
    280                              "Direct pop.\n",
    281                              d->d_name,
    282                              ps->p_stack.s_top->s_state));
    283 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
    284 #if 0
    285                     if (d->d_name[0] == 'i' &&
    286                         strcmp(d->d_name,
    287                            "import_stmt") == 0)
    288                         future_hack(ps);
    289 #endif
    290 #endif
    291                     s_pop(&ps->p_stack);
    292                     if (s_empty(&ps->p_stack)) {
    293                         D(printf("  ACCEPT.\n"));
    294                         return E_DONE;
    295                     }
    296                     d = ps->p_stack.s_top->s_dfa;
    297                 }
    298                 return E_OK;
    299             }
    300         }
    301 
    302         if (s->s_accept) {
    303 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
    304 #if 0
    305             if (d->d_name[0] == 'i' &&
    306                 strcmp(d->d_name, "import_stmt") == 0)
    307                 future_hack(ps);
    308 #endif
    309 #endif
    310             /* Pop this dfa and try again */
    311             s_pop(&ps->p_stack);
    312             D(printf(" Pop ...\n"));
    313             if (s_empty(&ps->p_stack)) {
    314                 D(printf(" Error: bottom of stack.\n"));
    315                 return E_SYNTAX;
    316             }
    317             continue;
    318         }
    319 
    320         /* Stuck, report syntax error */
    321         D(printf(" Error.\n"));
    322         if (expected_ret) {
    323             if (s->s_lower == s->s_upper - 1) {
    324                 /* Only one possible expected token */
    325                 *expected_ret = ps->p_grammar->
    326                     g_ll.ll_label[s->s_lower].lb_type;
    327             }
    328             else
    329                 *expected_ret = -1;
    330         }
    331         return E_SYNTAX;
    332     }
    333 }
    334 
    335 
    336 #ifdef Py_DEBUG
    337 
    338 /* DEBUG OUTPUT */
    339 
    340 void
    341 dumptree(grammar *g, node *n)
    342 {
    343     int i;
    344 
    345     if (n == NULL)
    346         printf("NIL");
    347     else {
    348         label l;
    349         l.lb_type = TYPE(n);
    350         l.lb_str = STR(n);
    351         printf("%s", PyGrammar_LabelRepr(&l));
    352         if (ISNONTERMINAL(TYPE(n))) {
    353             printf("(");
    354             for (i = 0; i < NCH(n); i++) {
    355                 if (i > 0)
    356                     printf(",");
    357                 dumptree(g, CHILD(n, i));
    358             }
    359             printf(")");
    360         }
    361     }
    362 }
    363 
    364 void
    365 showtree(grammar *g, node *n)
    366 {
    367     int i;
    368 
    369     if (n == NULL)
    370         return;
    371     if (ISNONTERMINAL(TYPE(n))) {
    372         for (i = 0; i < NCH(n); i++)
    373             showtree(g, CHILD(n, i));
    374     }
    375     else if (ISTERMINAL(TYPE(n))) {
    376         printf("%s", _PyParser_TokenNames[TYPE(n)]);
    377         if (TYPE(n) == NUMBER || TYPE(n) == NAME)
    378             printf("(%s)", STR(n));
    379         printf(" ");
    380     }
    381     else
    382         printf("? ");
    383 }
    384 
    385 void
    386 printtree(parser_state *ps)
    387 {
    388     if (Py_DebugFlag) {
    389         printf("Parse tree:\n");
    390         dumptree(ps->p_grammar, ps->p_tree);
    391         printf("\n");
    392         printf("Tokens:\n");
    393         showtree(ps->p_grammar, ps->p_tree);
    394         printf("\n");
    395     }
    396     printf("Listing:\n");
    397     PyNode_ListTree(ps->p_tree);
    398     printf("\n");
    399 }
    400 
    401 #endif /* Py_DEBUG */
    402 
    403 /*
    404 
    405 Description
    406 -----------
    407 
    408 The parser's interface is different than usual: the function addtoken()
    409 must be called for each token in the input.  This makes it possible to
    410 turn it into an incremental parsing system later.  The parsing system
    411 constructs a parse tree as it goes.
    412 
    413 A parsing rule is represented as a Deterministic Finite-state Automaton
    414 (DFA).  A node in a DFA represents a state of the parser; an arc represents
    415 a transition.  Transitions are either labeled with terminal symbols or
    416 with non-terminals.  When the parser decides to follow an arc labeled
    417 with a non-terminal, it is invoked recursively with the DFA representing
    418 the parsing rule for that as its initial state; when that DFA accepts,
    419 the parser that invoked it continues.  The parse tree constructed by the
    420 recursively called parser is inserted as a child in the current parse tree.
    421 
    422 The DFA's can be constructed automatically from a more conventional
    423 language description.  An extended LL(1) grammar (ELL(1)) is suitable.
    424 Certain restrictions make the parser's life easier: rules that can produce
    425 the empty string should be outlawed (there are other ways to put loops
    426 or optional parts in the language).  To avoid the need to construct
    427 FIRST sets, we can require that all but the last alternative of a rule
    428 (really: arc going out of a DFA's state) must begin with a terminal
    429 symbol.
    430 
    431 As an example, consider this grammar:
    432 
    433 expr:   term (OP term)*
    434 term:   CONSTANT | '(' expr ')'
    435 
    436 The DFA corresponding to the rule for expr is:
    437 
    438 ------->.---term-->.------->
    439     ^          |
    440     |          |
    441     \----OP----/
    442 
    443 The parse tree generated for the input a+b is:
    444 
    445 (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))
    446 
    447 */
    448