Home | History | Annotate | Download | only in Parser
      1 
      2 /* Parser implementation */
      3 
      4 /* For a description, see the comments at end of this file */
      5 
      6 /* XXX To do: error recovery */
      7 
      8 #include "Python.h"
      9 #include "pgenheaders.h"
     10 #include "token.h"
     11 #include "grammar.h"
     12 #include "node.h"
     13 #include "parser.h"
     14 #include "errcode.h"
     15 
     16 
     17 #ifdef Py_DEBUG
     18 extern int Py_DebugFlag;
     19 #define D(x) if (!Py_DebugFlag); else x
     20 #else
     21 #define D(x)
     22 #endif
     23 
     24 
     25 /* STACK DATA TYPE */
     26 
     27 static void s_reset(stack *);
     28 
     29 static void
     30 s_reset(stack *s)
     31 {
     32     s->s_top = &s->s_base[MAXSTACK];
     33 }
     34 
     35 #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
     36 
     37 static int
     38 s_push(register stack *s, dfa *d, node *parent)
     39 {
     40     register stackentry *top;
     41     if (s->s_top == s->s_base) {
     42         fprintf(stderr, "s_push: parser stack overflow\n");
     43         return E_NOMEM;
     44     }
     45     top = --s->s_top;
     46     top->s_dfa = d;
     47     top->s_parent = parent;
     48     top->s_state = 0;
     49     return 0;
     50 }
     51 
     52 #ifdef Py_DEBUG
     53 
     54 static void
     55 s_pop(register stack *s)
     56 {
     57     if (s_empty(s))
     58         Py_FatalError("s_pop: parser stack underflow -- FATAL");
     59     s->s_top++;
     60 }
     61 
     62 #else /* !Py_DEBUG */
     63 
     64 #define s_pop(s) (s)->s_top++
     65 
     66 #endif
     67 
     68 
     69 /* PARSER CREATION */
     70 
     71 parser_state *
     72 PyParser_New(grammar *g, int start)
     73 {
     74     parser_state *ps;
     75 
     76     if (!g->g_accel)
     77         PyGrammar_AddAccelerators(g);
     78     ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state));
     79     if (ps == NULL)
     80         return NULL;
     81     ps->p_grammar = g;
     82 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
     83     ps->p_flags = 0;
     84 #endif
     85     ps->p_tree = PyNode_New(start);
     86     if (ps->p_tree == NULL) {
     87         PyMem_FREE(ps);
     88         return NULL;
     89     }
     90     s_reset(&ps->p_stack);
     91     (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree);
     92     return ps;
     93 }
     94 
     95 void
     96 PyParser_Delete(parser_state *ps)
     97 {
     98     /* NB If you want to save the parse tree,
     99        you must set p_tree to NULL before calling delparser! */
    100     PyNode_Free(ps->p_tree);
    101     PyMem_FREE(ps);
    102 }
    103 
    104 
    105 /* PARSER STACK OPERATIONS */
    106 
    107 static int
    108 shift(register stack *s, int type, char *str, int newstate, int lineno, int col_offset)
    109 {
    110     int err;
    111     assert(!s_empty(s));
    112     err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset);
    113     if (err)
    114         return err;
    115     s->s_top->s_state = newstate;
    116     return 0;
    117 }
    118 
    119 static int
    120 push(register stack *s, int type, dfa *d, int newstate, int lineno, int col_offset)
    121 {
    122     int err;
    123     register node *n;
    124     n = s->s_top->s_parent;
    125     assert(!s_empty(s));
    126     err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset);
    127     if (err)
    128         return err;
    129     s->s_top->s_state = newstate;
    130     return s_push(s, d, CHILD(n, NCH(n)-1));
    131 }
    132 
    133 
    134 /* PARSER PROPER */
    135 
    136 static int
    137 classify(parser_state *ps, int type, char *str)
    138 {
    139     grammar *g = ps->p_grammar;
    140     register int n = g->g_ll.ll_nlabels;
    141 
    142     if (type == NAME) {
    143         register char *s = str;
    144         register label *l = g->g_ll.ll_label;
    145         register int i;
    146         for (i = n; i > 0; i--, l++) {
    147             if (l->lb_type != NAME || l->lb_str == NULL ||
    148                 l->lb_str[0] != s[0] ||
    149                 strcmp(l->lb_str, s) != 0)
    150                 continue;
    151 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
    152             if (ps->p_flags & CO_FUTURE_PRINT_FUNCTION &&
    153                 s[0] == 'p' && strcmp(s, "print") == 0) {
    154                 break; /* no longer a keyword */
    155             }
    156 #endif
    157             D(printf("It's a keyword\n"));
    158             return n - i;
    159         }
    160     }
    161 
    162     {
    163         register label *l = g->g_ll.ll_label;
    164         register int i;
    165         for (i = n; i > 0; i--, l++) {
    166             if (l->lb_type == type && l->lb_str == NULL) {
    167                 D(printf("It's a token we know\n"));
    168                 return n - i;
    169             }
    170         }
    171     }
    172 
    173     D(printf("Illegal token\n"));
    174     return -1;
    175 }
    176 
    177 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
    178 static void
    179 future_hack(parser_state *ps)
    180 {
    181     node *n = ps->p_stack.s_top->s_parent;
    182     node *ch, *cch;
    183     int i;
    184 
    185     /* from __future__ import ..., must have at least 4 children */
    186     n = CHILD(n, 0);
    187     if (NCH(n) < 4)
    188         return;
    189     ch = CHILD(n, 0);
    190     if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0)
    191         return;
    192     ch = CHILD(n, 1);
    193     if (NCH(ch) == 1 && STR(CHILD(ch, 0)) &&
    194         strcmp(STR(CHILD(ch, 0)), "__future__") != 0)
    195         return;
    196     ch = CHILD(n, 3);
    197     /* ch can be a star, a parenthesis or import_as_names */
    198     if (TYPE(ch) == STAR)
    199         return;
    200     if (TYPE(ch) == LPAR)
    201         ch = CHILD(n, 4);
    202 
    203     for (i = 0; i < NCH(ch); i += 2) {
    204         cch = CHILD(ch, i);
    205         if (NCH(cch) >= 1 && TYPE(CHILD(cch, 0)) == NAME) {
    206             char *str_ch = STR(CHILD(cch, 0));
    207             if (strcmp(str_ch, FUTURE_WITH_STATEMENT) == 0) {
    208                 ps->p_flags |= CO_FUTURE_WITH_STATEMENT;
    209             } else if (strcmp(str_ch, FUTURE_PRINT_FUNCTION) == 0) {
    210                 ps->p_flags |= CO_FUTURE_PRINT_FUNCTION;
    211             } else if (strcmp(str_ch, FUTURE_UNICODE_LITERALS) == 0) {
    212                 ps->p_flags |= CO_FUTURE_UNICODE_LITERALS;
    213             }
    214         }
    215     }
    216 }
    217 #endif /* future keyword */
    218 
    219 int
    220 PyParser_AddToken(register parser_state *ps, register int type, char *str,
    221                   int lineno, int col_offset, int *expected_ret)
    222 {
    223     register int ilabel;
    224     int err;
    225 
    226     D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));
    227 
    228     /* Find out which label this token is */
    229     ilabel = classify(ps, type, str);
    230     if (ilabel < 0)
    231         return E_SYNTAX;
    232 
    233     /* Loop until the token is shifted or an error occurred */
    234     for (;;) {
    235         /* Fetch the current dfa and state */
    236         register dfa *d = ps->p_stack.s_top->s_dfa;
    237         register state *s = &d->d_state[ps->p_stack.s_top->s_state];
    238 
    239         D(printf(" DFA '%s', state %d:",
    240             d->d_name, ps->p_stack.s_top->s_state));
    241 
    242         /* Check accelerator */
    243         if (s->s_lower <= ilabel && ilabel < s->s_upper) {
    244             register int x = s->s_accel[ilabel - s->s_lower];
    245             if (x != -1) {
    246                 if (x & (1<<7)) {
    247                     /* Push non-terminal */
    248                     int nt = (x >> 8) + NT_OFFSET;
    249                     int arrow = x & ((1<<7)-1);
    250                     dfa *d1 = PyGrammar_FindDFA(
    251                         ps->p_grammar, nt);
    252                     if ((err = push(&ps->p_stack, nt, d1,
    253                         arrow, lineno, col_offset)) > 0) {
    254                         D(printf(" MemError: push\n"));
    255                         return err;
    256                     }
    257                     D(printf(" Push ...\n"));
    258                     continue;
    259                 }
    260 
    261                 /* Shift the token */
    262                 if ((err = shift(&ps->p_stack, type, str,
    263                                 x, lineno, col_offset)) > 0) {
    264                     D(printf(" MemError: shift.\n"));
    265                     return err;
    266                 }
    267                 D(printf(" Shift.\n"));
    268                 /* Pop while we are in an accept-only state */
    269                 while (s = &d->d_state
    270                                 [ps->p_stack.s_top->s_state],
    271                     s->s_accept && s->s_narcs == 1) {
    272                     D(printf("  DFA '%s', state %d: "
    273                              "Direct pop.\n",
    274                              d->d_name,
    275                              ps->p_stack.s_top->s_state));
    276 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
    277                     if (d->d_name[0] == 'i' &&
    278                         strcmp(d->d_name,
    279                            "import_stmt") == 0)
    280                         future_hack(ps);
    281 #endif
    282                     s_pop(&ps->p_stack);
    283                     if (s_empty(&ps->p_stack)) {
    284                         D(printf("  ACCEPT.\n"));
    285                         return E_DONE;
    286                     }
    287                     d = ps->p_stack.s_top->s_dfa;
    288                 }
    289                 return E_OK;
    290             }
    291         }
    292 
    293         if (s->s_accept) {
    294 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
    295             if (d->d_name[0] == 'i' &&
    296                 strcmp(d->d_name, "import_stmt") == 0)
    297                 future_hack(ps);
    298 #endif
    299             /* Pop this dfa and try again */
    300             s_pop(&ps->p_stack);
    301             D(printf(" Pop ...\n"));
    302             if (s_empty(&ps->p_stack)) {
    303                 D(printf(" Error: bottom of stack.\n"));
    304                 return E_SYNTAX;
    305             }
    306             continue;
    307         }
    308 
    309         /* Stuck, report syntax error */
    310         D(printf(" Error.\n"));
    311         if (expected_ret) {
    312             if (s->s_lower == s->s_upper - 1) {
    313                 /* Only one possible expected token */
    314                 *expected_ret = ps->p_grammar->
    315                     g_ll.ll_label[s->s_lower].lb_type;
    316             }
    317             else
    318                 *expected_ret = -1;
    319         }
    320         return E_SYNTAX;
    321     }
    322 }
    323 
    324 
    325 #ifdef Py_DEBUG
    326 
    327 /* DEBUG OUTPUT */
    328 
    329 void
    330 dumptree(grammar *g, node *n)
    331 {
    332     int i;
    333 
    334     if (n == NULL)
    335         printf("NIL");
    336     else {
    337         label l;
    338         l.lb_type = TYPE(n);
    339         l.lb_str = STR(n);
    340         printf("%s", PyGrammar_LabelRepr(&l));
    341         if (ISNONTERMINAL(TYPE(n))) {
    342             printf("(");
    343             for (i = 0; i < NCH(n); i++) {
    344                 if (i > 0)
    345                     printf(",");
    346                 dumptree(g, CHILD(n, i));
    347             }
    348             printf(")");
    349         }
    350     }
    351 }
    352 
    353 void
    354 showtree(grammar *g, node *n)
    355 {
    356     int i;
    357 
    358     if (n == NULL)
    359         return;
    360     if (ISNONTERMINAL(TYPE(n))) {
    361         for (i = 0; i < NCH(n); i++)
    362             showtree(g, CHILD(n, i));
    363     }
    364     else if (ISTERMINAL(TYPE(n))) {
    365         printf("%s", _PyParser_TokenNames[TYPE(n)]);
    366         if (TYPE(n) == NUMBER || TYPE(n) == NAME)
    367             printf("(%s)", STR(n));
    368         printf(" ");
    369     }
    370     else
    371         printf("? ");
    372 }
    373 
    374 void
    375 printtree(parser_state *ps)
    376 {
    377     if (Py_DebugFlag) {
    378         printf("Parse tree:\n");
    379         dumptree(ps->p_grammar, ps->p_tree);
    380         printf("\n");
    381         printf("Tokens:\n");
    382         showtree(ps->p_grammar, ps->p_tree);
    383         printf("\n");
    384     }
    385     printf("Listing:\n");
    386     PyNode_ListTree(ps->p_tree);
    387     printf("\n");
    388 }
    389 
    390 #endif /* Py_DEBUG */
    391 
    392 /*
    393 
    394 Description
    395 -----------
    396 
    397 The parser's interface is different than usual: the function addtoken()
    398 must be called for each token in the input.  This makes it possible to
    399 turn it into an incremental parsing system later.  The parsing system
    400 constructs a parse tree as it goes.
    401 
    402 A parsing rule is represented as a Deterministic Finite-state Automaton
    403 (DFA).  A node in a DFA represents a state of the parser; an arc represents
    404 a transition.  Transitions are either labeled with terminal symbols or
    405 with non-terminals.  When the parser decides to follow an arc labeled
    406 with a non-terminal, it is invoked recursively with the DFA representing
    407 the parsing rule for that as its initial state; when that DFA accepts,
    408 the parser that invoked it continues.  The parse tree constructed by the
    409 recursively called parser is inserted as a child in the current parse tree.
    410 
    411 The DFA's can be constructed automatically from a more conventional
    412 language description.  An extended LL(1) grammar (ELL(1)) is suitable.
    413 Certain restrictions make the parser's life easier: rules that can produce
    414 the empty string should be outlawed (there are other ways to put loops
    415 or optional parts in the language).  To avoid the need to construct
    416 FIRST sets, we can require that all but the last alternative of a rule
    417 (really: arc going out of a DFA's state) must begin with a terminal
    418 symbol.
    419 
    420 As an example, consider this grammar:
    421 
    422 expr:   term (OP term)*
    423 term:   CONSTANT | '(' expr ')'
    424 
    425 The DFA corresponding to the rule for expr is:
    426 
    427 ------->.---term-->.------->
    428     ^          |
    429     |          |
    430     \----OP----/
    431 
    432 The parse tree generated for the input a+b is:
    433 
    434 (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))
    435 
    436 */
    437