Home | History | Annotate | Download | only in src
      1 /* Bison Grammar Scanner                             -*- C -*-
      2 
      3    Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
      4 
      5    This file is part of Bison, the GNU Compiler Compiler.
      6 
      7    This program is free software; you can redistribute it and/or modify
      8    it under the terms of the GNU General Public License as published by
      9    the Free Software Foundation; either version 2 of the License, or
     10    (at your option) any later version.
     11 
     12    This program is distributed in the hope that it will be useful,
     13    but WITHOUT ANY WARRANTY; without even the implied warranty of
     14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15    GNU General Public License for more details.
     16 
     17    You should have received a copy of the GNU General Public License
     18    along with this program; if not, write to the Free Software
     19    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     20    02110-1301  USA
     21 */
     22 
     23 %option debug nodefault nounput noyywrap never-interactive
     24 %option prefix="gram_" outfile="lex.yy.c"
     25 
     26 %{
     27 /* Work around a bug in flex 2.5.31.  See Debian bug 333231
     28    <http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=333231>.  */
     29 #undef gram_wrap
     30 #define gram_wrap() 1
     31 
     32 #include "system.h"
     33 
     34 #include <mbswidth.h>
     35 #include <quote.h>
     36 
     37 #include "complain.h"
     38 #include "files.h"
     39 #include "getargs.h"
     40 #include "gram.h"
     41 #include "quotearg.h"
     42 #include "reader.h"
     43 #include "uniqstr.h"
     44 
     45 #define YY_USER_INIT					\
     46   do							\
     47     {							\
     48       scanner_cursor.file = current_file;		\
     49       scanner_cursor.line = 1;				\
     50       scanner_cursor.column = 1;			\
     51       code_start = scanner_cursor;			\
     52     }							\
     53   while (0)
     54 
     55 /* Pacify "gcc -Wmissing-prototypes" when flex 2.5.31 is used.  */
     56 int gram_get_lineno (void);
     57 FILE *gram_get_in (void);
     58 FILE *gram_get_out (void);
     59 int gram_get_leng (void);
     60 char *gram_get_text (void);
     61 void gram_set_lineno (int);
     62 void gram_set_in (FILE *);
     63 void gram_set_out (FILE *);
     64 int gram_get_debug (void);
     65 void gram_set_debug (int);
     66 int gram_lex_destroy (void);
     67 
     68 /* Location of scanner cursor.  */
     69 boundary scanner_cursor;
     70 
     71 static void adjust_location (location *, char const *, size_t);
     72 #define YY_USER_ACTION  adjust_location (loc, yytext, yyleng);
     73 
     74 static size_t no_cr_read (FILE *, char *, size_t);
     75 #define YY_INPUT(buf, result, size) ((result) = no_cr_read (yyin, buf, size))
     76 
     77 
     78 /* OBSTACK_FOR_STRING -- Used to store all the characters that we need to
     79    keep (to construct ID, STRINGS etc.).  Use the following macros to
     80    use it.
     81 
     82    Use STRING_GROW to append what has just been matched, and
     83    STRING_FINISH to end the string (it puts the ending 0).
     84    STRING_FINISH also stores this string in LAST_STRING, which can be
     85    used, and which is used by STRING_FREE to free the last string.  */
     86 
     87 static struct obstack obstack_for_string;
     88 
     89 /* A string representing the most recently saved token.  */
     90 char *last_string;
     91 
     92 /* The location of the most recently saved token, if it was a
     93    BRACED_CODE token; otherwise, this has an unspecified value.  */
     94 location last_braced_code_loc;
     95 
     96 #define STRING_GROW   \
     97   obstack_grow (&obstack_for_string, yytext, yyleng)
     98 
     99 #define STRING_FINISH					\
    100   do {							\
    101     obstack_1grow (&obstack_for_string, '\0');		\
    102     last_string = obstack_finish (&obstack_for_string);	\
    103   } while (0)
    104 
    105 #define STRING_FREE \
    106   obstack_free (&obstack_for_string, last_string)
    107 
    108 void
    109 scanner_last_string_free (void)
    110 {
    111   STRING_FREE;
    112 }
    113 
    114 /* Within well-formed rules, RULE_LENGTH is the number of values in
    115    the current rule so far, which says where to find `$0' with respect
    116    to the top of the stack.  It is not the same as the rule->length in
    117    the case of mid rule actions.
    118 
    119    Outside of well-formed rules, RULE_LENGTH has an undefined value.  */
    120 static int rule_length;
    121 
    122 static void rule_length_overflow (location) __attribute__ ((__noreturn__));
    123 
    124 /* Increment the rule length by one, checking for overflow.  */
    125 static inline void
    126 increment_rule_length (location loc)
    127 {
    128   rule_length++;
    129 
    130   /* Don't allow rule_length == INT_MAX, since that might cause
    131      confusion with strtol if INT_MAX == LONG_MAX.  */
    132   if (rule_length == INT_MAX)
    133     rule_length_overflow (loc);
    134 }
    135 
    136 static void handle_dollar (int token_type, char *cp, location loc);
    137 static void handle_at (int token_type, char *cp, location loc);
    138 static void handle_syncline (char *, location);
    139 static unsigned long int scan_integer (char const *p, int base, location loc);
    140 static int convert_ucn_to_byte (char const *hex_text);
    141 static void unexpected_eof (boundary, char const *);
    142 static void unexpected_newline (boundary, char const *);
    143 
    144 %}
    145 %x SC_COMMENT SC_LINE_COMMENT SC_YACC_COMMENT
    146 %x SC_STRING SC_CHARACTER
    147 %x SC_AFTER_IDENTIFIER
    148 %x SC_ESCAPED_STRING SC_ESCAPED_CHARACTER
    149 %x SC_PRE_CODE SC_BRACED_CODE SC_PROLOGUE SC_EPILOGUE
    150 
    151 letter	  [.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_]
    152 id	  {letter}({letter}|[0-9])*
    153 directive %{letter}({letter}|[0-9]|-)*
    154 int	  [0-9]+
    155 
    156 /* POSIX says that a tag must be both an id and a C union member, but
    157    historically almost any character is allowed in a tag.  We disallow
    158    NUL and newline, as this simplifies our implementation.  */
    159 tag	 [^\0\n>]+
    160 
    161 /* Zero or more instances of backslash-newline.  Following GCC, allow
    162    white space between the backslash and the newline.  */
    163 splice	 (\\[ \f\t\v]*\n)*
    164 
    165 %%
    166 %{
    167   /* Nesting level of the current code in braces.  */
    168   int braces_level IF_LINT (= 0);
    169 
    170   /* Parent context state, when applicable.  */
    171   int context_state IF_LINT (= 0);
    172 
    173   /* Token type to return, when applicable.  */
    174   int token_type IF_LINT (= 0);
    175 
    176   /* Location of most recent identifier, when applicable.  */
    177   location id_loc IF_LINT (= empty_location);
    178 
    179   /* Where containing code started, when applicable.  Its initial
    180      value is relevant only when yylex is invoked in the SC_EPILOGUE
    181      start condition.  */
    182   boundary code_start = scanner_cursor;
    183 
    184   /* Where containing comment or string or character literal started,
    185      when applicable.  */
    186   boundary token_start IF_LINT (= scanner_cursor);
    187 %}
    188 
    189 
    190   /*-----------------------.
    191   | Scanning white space.  |
    192   `-----------------------*/
    193 
    194 <INITIAL,SC_AFTER_IDENTIFIER,SC_PRE_CODE>
    195 {
    196   /* Comments and white space.  */
    197   ","	       warn_at (*loc, _("stray `,' treated as white space"));
    198   [ \f\n\t\v]  |
    199   "//".*       ;
    200   "/*" {
    201     token_start = loc->start;
    202     context_state = YY_START;
    203     BEGIN SC_YACC_COMMENT;
    204   }
    205 
    206   /* #line directives are not documented, and may be withdrawn or
    207      modified in future versions of Bison.  */
    208   ^"#line "{int}" \"".*"\"\n" {
    209     handle_syncline (yytext + sizeof "#line " - 1, *loc);
    210   }
    211 }
    212 
    213 
    214   /*----------------------------.
    215   | Scanning Bison directives.  |
    216   `----------------------------*/
    217 <INITIAL>
    218 {
    219   "%binary"               return PERCENT_NONASSOC;
    220   "%debug"                return PERCENT_DEBUG;
    221   "%default"[-_]"prec"    return PERCENT_DEFAULT_PREC;
    222   "%define"               return PERCENT_DEFINE;
    223   "%defines"              return PERCENT_DEFINES;
    224   "%destructor"		  token_type = PERCENT_DESTRUCTOR; BEGIN SC_PRE_CODE;
    225   "%dprec"		  return PERCENT_DPREC;
    226   "%error"[-_]"verbose"   return PERCENT_ERROR_VERBOSE;
    227   "%expect"               return PERCENT_EXPECT;
    228   "%expect"[-_]"rr"	  return PERCENT_EXPECT_RR;
    229   "%file-prefix"          return PERCENT_FILE_PREFIX;
    230   "%fixed"[-_]"output"[-_]"files"   return PERCENT_YACC;
    231   "%initial-action"       token_type = PERCENT_INITIAL_ACTION; BEGIN SC_PRE_CODE;
    232   "%glr-parser"           return PERCENT_GLR_PARSER;
    233   "%left"                 return PERCENT_LEFT;
    234   "%lex-param"		  token_type = PERCENT_LEX_PARAM; BEGIN SC_PRE_CODE;
    235   "%locations"            return PERCENT_LOCATIONS;
    236   "%merge"		  return PERCENT_MERGE;
    237   "%name"[-_]"prefix"     return PERCENT_NAME_PREFIX;
    238   "%no"[-_]"default"[-_]"prec"	return PERCENT_NO_DEFAULT_PREC;
    239   "%no"[-_]"lines"        return PERCENT_NO_LINES;
    240   "%nonassoc"             return PERCENT_NONASSOC;
    241   "%nondeterministic-parser"   return PERCENT_NONDETERMINISTIC_PARSER;
    242   "%nterm"                return PERCENT_NTERM;
    243   "%output"               return PERCENT_OUTPUT;
    244   "%parse-param"	  token_type = PERCENT_PARSE_PARAM; BEGIN SC_PRE_CODE;
    245   "%prec"                 rule_length--; return PERCENT_PREC;
    246   "%printer"              token_type = PERCENT_PRINTER; BEGIN SC_PRE_CODE;
    247   "%pure"[-_]"parser"     return PERCENT_PURE_PARSER;
    248   "%require"              return PERCENT_REQUIRE;
    249   "%right"                return PERCENT_RIGHT;
    250   "%skeleton"             return PERCENT_SKELETON;
    251   "%start"                return PERCENT_START;
    252   "%term"                 return PERCENT_TOKEN;
    253   "%token"                return PERCENT_TOKEN;
    254   "%token"[-_]"table"     return PERCENT_TOKEN_TABLE;
    255   "%type"                 return PERCENT_TYPE;
    256   "%union"		  token_type = PERCENT_UNION; BEGIN SC_PRE_CODE;
    257   "%verbose"              return PERCENT_VERBOSE;
    258   "%yacc"                 return PERCENT_YACC;
    259 
    260   {directive} {
    261     complain_at (*loc, _("invalid directive: %s"), quote (yytext));
    262   }
    263 
    264   "="                     return EQUAL;
    265   "|"                     rule_length = 0; return PIPE;
    266   ";"                     return SEMICOLON;
    267 
    268   {id} {
    269     val->symbol = symbol_get (yytext, *loc);
    270     id_loc = *loc;
    271     increment_rule_length (*loc);
    272     BEGIN SC_AFTER_IDENTIFIER;
    273   }
    274 
    275   {int} {
    276     val->integer = scan_integer (yytext, 10, *loc);
    277     return INT;
    278   }
    279   0[xX][0-9abcdefABCDEF]+ {
    280     val->integer = scan_integer (yytext, 16, *loc);
    281     return INT;
    282   }
    283 
    284   /* Characters.  We don't check there is only one.  */
    285   "'"	      STRING_GROW; token_start = loc->start; BEGIN SC_ESCAPED_CHARACTER;
    286 
    287   /* Strings. */
    288   "\""	      token_start = loc->start; BEGIN SC_ESCAPED_STRING;
    289 
    290   /* Prologue. */
    291   "%{"        code_start = loc->start; BEGIN SC_PROLOGUE;
    292 
    293   /* Code in between braces.  */
    294   "{" {
    295     if (current_rule && current_rule->action)
    296       grammar_midrule_action ();
    297     STRING_GROW;
    298     token_type = BRACED_CODE;
    299     braces_level = 0;
    300     code_start = loc->start;
    301     BEGIN SC_BRACED_CODE;
    302   }
    303 
    304   /* A type. */
    305   "<"{tag}">" {
    306     obstack_grow (&obstack_for_string, yytext + 1, yyleng - 2);
    307     STRING_FINISH;
    308     val->uniqstr = uniqstr_new (last_string);
    309     STRING_FREE;
    310     return TYPE;
    311   }
    312 
    313   "%%" {
    314     static int percent_percent_count;
    315     if (++percent_percent_count == 2)
    316       BEGIN SC_EPILOGUE;
    317     return PERCENT_PERCENT;
    318   }
    319 
    320   . {
    321     complain_at (*loc, _("invalid character: %s"), quote (yytext));
    322   }
    323 
    324   <<EOF>> {
    325     loc->start = loc->end = scanner_cursor;
    326     yyterminate ();
    327   }
    328 }
    329 
    330 
    331   /*-----------------------------------------------------------------.
    332   | Scanning after an identifier, checking whether a colon is next.  |
    333   `-----------------------------------------------------------------*/
    334 
    335 <SC_AFTER_IDENTIFIER>
    336 {
    337   ":" {
    338     rule_length = 0;
    339     *loc = id_loc;
    340     BEGIN INITIAL;
    341     return ID_COLON;
    342   }
    343   . {
    344     scanner_cursor.column -= mbsnwidth (yytext, yyleng, 0);
    345     yyless (0);
    346     *loc = id_loc;
    347     BEGIN INITIAL;
    348     return ID;
    349   }
    350   <<EOF>> {
    351     *loc = id_loc;
    352     BEGIN INITIAL;
    353     return ID;
    354   }
    355 }
    356 
    357 
    358   /*---------------------------------------------------------------.
    359   | Scanning a Yacc comment.  The initial `/ *' is already eaten.  |
    360   `---------------------------------------------------------------*/
    361 
    362 <SC_YACC_COMMENT>
    363 {
    364   "*/"     BEGIN context_state;
    365   .|\n	   ;
    366   <<EOF>>  unexpected_eof (token_start, "*/"); BEGIN context_state;
    367 }
    368 
    369 
    370   /*------------------------------------------------------------.
    371   | Scanning a C comment.  The initial `/ *' is already eaten.  |
    372   `------------------------------------------------------------*/
    373 
    374 <SC_COMMENT>
    375 {
    376   "*"{splice}"/"  STRING_GROW; BEGIN context_state;
    377   <<EOF>>	  unexpected_eof (token_start, "*/"); BEGIN context_state;
    378 }
    379 
    380 
    381   /*--------------------------------------------------------------.
    382   | Scanning a line comment.  The initial `//' is already eaten.  |
    383   `--------------------------------------------------------------*/
    384 
    385 <SC_LINE_COMMENT>
    386 {
    387   "\n"		 STRING_GROW; BEGIN context_state;
    388   {splice}	 STRING_GROW;
    389   <<EOF>>	 BEGIN context_state;
    390 }
    391 
    392 
    393   /*------------------------------------------------.
    394   | Scanning a Bison string, including its escapes. |
    395   | The initial quote is already eaten.             |
    396   `------------------------------------------------*/
    397 
    398 <SC_ESCAPED_STRING>
    399 {
    400   "\"" {
    401     STRING_FINISH;
    402     loc->start = token_start;
    403     val->chars = last_string;
    404     increment_rule_length (*loc);
    405     BEGIN INITIAL;
    406     return STRING;
    407   }
    408   \n		unexpected_newline (token_start, "\"");	BEGIN INITIAL;
    409   <<EOF>>	unexpected_eof (token_start, "\"");	BEGIN INITIAL;
    410 }
    411 
    412   /*----------------------------------------------------------.
    413   | Scanning a Bison character literal, decoding its escapes. |
    414   | The initial quote is already eaten.			      |
    415   `----------------------------------------------------------*/
    416 
    417 <SC_ESCAPED_CHARACTER>
    418 {
    419   "'" {
    420     unsigned char last_string_1;
    421     STRING_GROW;
    422     STRING_FINISH;
    423     loc->start = token_start;
    424     val->symbol = symbol_get (quotearg_style (escape_quoting_style,
    425 					      last_string),
    426 			      *loc);
    427     symbol_class_set (val->symbol, token_sym, *loc, false);
    428     last_string_1 = last_string[1];
    429     symbol_user_token_number_set (val->symbol, last_string_1, *loc);
    430     STRING_FREE;
    431     increment_rule_length (*loc);
    432     BEGIN INITIAL;
    433     return ID;
    434   }
    435   \n		unexpected_newline (token_start, "'");	BEGIN INITIAL;
    436   <<EOF>>	unexpected_eof (token_start, "'");	BEGIN INITIAL;
    437 }
    438 
    439 <SC_ESCAPED_CHARACTER,SC_ESCAPED_STRING>
    440 {
    441   \0	    complain_at (*loc, _("invalid null character"));
    442 }
    443 
    444 
    445   /*----------------------------.
    446   | Decode escaped characters.  |
    447   `----------------------------*/
    448 
    449 <SC_ESCAPED_STRING,SC_ESCAPED_CHARACTER>
    450 {
    451   \\[0-7]{1,3} {
    452     unsigned long int c = strtoul (yytext + 1, NULL, 8);
    453     if (UCHAR_MAX < c)
    454       complain_at (*loc, _("invalid escape sequence: %s"), quote (yytext));
    455     else if (! c)
    456       complain_at (*loc, _("invalid null character: %s"), quote (yytext));
    457     else
    458       obstack_1grow (&obstack_for_string, c);
    459   }
    460 
    461   \\x[0-9abcdefABCDEF]+ {
    462     verify (UCHAR_MAX < ULONG_MAX);
    463     unsigned long int c = strtoul (yytext + 2, NULL, 16);
    464     if (UCHAR_MAX < c)
    465       complain_at (*loc, _("invalid escape sequence: %s"), quote (yytext));
    466     else if (! c)
    467       complain_at (*loc, _("invalid null character: %s"), quote (yytext));
    468     else
    469       obstack_1grow (&obstack_for_string, c);
    470   }
    471 
    472   \\a	obstack_1grow (&obstack_for_string, '\a');
    473   \\b	obstack_1grow (&obstack_for_string, '\b');
    474   \\f	obstack_1grow (&obstack_for_string, '\f');
    475   \\n	obstack_1grow (&obstack_for_string, '\n');
    476   \\r	obstack_1grow (&obstack_for_string, '\r');
    477   \\t	obstack_1grow (&obstack_for_string, '\t');
    478   \\v	obstack_1grow (&obstack_for_string, '\v');
    479 
    480   /* \\[\"\'?\\] would be shorter, but it confuses xgettext.  */
    481   \\("\""|"'"|"?"|"\\")  obstack_1grow (&obstack_for_string, yytext[1]);
    482 
    483   \\(u|U[0-9abcdefABCDEF]{4})[0-9abcdefABCDEF]{4} {
    484     int c = convert_ucn_to_byte (yytext);
    485     if (c < 0)
    486       complain_at (*loc, _("invalid escape sequence: %s"), quote (yytext));
    487     else if (! c)
    488       complain_at (*loc, _("invalid null character: %s"), quote (yytext));
    489     else
    490       obstack_1grow (&obstack_for_string, c);
    491   }
    492   \\(.|\n)	{
    493     complain_at (*loc, _("unrecognized escape sequence: %s"), quote (yytext));
    494     STRING_GROW;
    495   }
    496 }
    497 
    498   /*--------------------------------------------.
    499   | Scanning user-code characters and strings.  |
    500   `--------------------------------------------*/
    501 
    502 <SC_CHARACTER,SC_STRING>
    503 {
    504   {splice}|\\{splice}[^\n$@\[\]]	STRING_GROW;
    505 }
    506 
    507 <SC_CHARACTER>
    508 {
    509   "'"		STRING_GROW; BEGIN context_state;
    510   \n		unexpected_newline (token_start, "'"); BEGIN context_state;
    511   <<EOF>>	unexpected_eof (token_start, "'"); BEGIN context_state;
    512 }
    513 
    514 <SC_STRING>
    515 {
    516   "\""		STRING_GROW; BEGIN context_state;
    517   \n		unexpected_newline (token_start, "\""); BEGIN context_state;
    518   <<EOF>>	unexpected_eof (token_start, "\""); BEGIN context_state;
    519 }
    520 
    521 
    522   /*---------------------------------------------------.
    523   | Strings, comments etc. can be found in user code.  |
    524   `---------------------------------------------------*/
    525 
    526 <SC_BRACED_CODE,SC_PROLOGUE,SC_EPILOGUE>
    527 {
    528   "'" {
    529     STRING_GROW;
    530     context_state = YY_START;
    531     token_start = loc->start;
    532     BEGIN SC_CHARACTER;
    533   }
    534   "\"" {
    535     STRING_GROW;
    536     context_state = YY_START;
    537     token_start = loc->start;
    538     BEGIN SC_STRING;
    539   }
    540   "/"{splice}"*" {
    541     STRING_GROW;
    542     context_state = YY_START;
    543     token_start = loc->start;
    544     BEGIN SC_COMMENT;
    545   }
    546   "/"{splice}"/" {
    547     STRING_GROW;
    548     context_state = YY_START;
    549     BEGIN SC_LINE_COMMENT;
    550   }
    551 }
    552 
    553 
    554   /*---------------------------------------------------------------.
    555   | Scanning after %union etc., possibly followed by white space.  |
    556   | For %union only, allow arbitrary C code to appear before the   |
    557   | following brace, as an extension to POSIX.			   |
    558   `---------------------------------------------------------------*/
    559 
    560 <SC_PRE_CODE>
    561 {
    562   . {
    563     bool valid = yytext[0] == '{' || token_type == PERCENT_UNION;
    564     scanner_cursor.column -= mbsnwidth (yytext, yyleng, 0);
    565     yyless (0);
    566 
    567     if (valid)
    568       {
    569 	braces_level = -1;
    570 	code_start = loc->start;
    571 	BEGIN SC_BRACED_CODE;
    572       }
    573     else
    574       {
    575 	complain_at (*loc, _("missing `{' in %s"),
    576 		     token_name (token_type));
    577 	obstack_sgrow (&obstack_for_string, "{}");
    578 	STRING_FINISH;
    579 	val->chars = last_string;
    580 	BEGIN INITIAL;
    581 	return token_type;
    582       }
    583   }
    584 
    585   <<EOF>>  unexpected_eof (scanner_cursor, "{}"); BEGIN INITIAL;
    586 }
    587 
    588 
    589   /*---------------------------------------------------------------.
    590   | Scanning some code in braces (%union and actions). The initial |
    591   | "{" is already eaten.                                          |
    592   `---------------------------------------------------------------*/
    593 
    594 <SC_BRACED_CODE>
    595 {
    596   "{"|"<"{splice}"%"  STRING_GROW; braces_level++;
    597   "%"{splice}">"      STRING_GROW; braces_level--;
    598   "}" {
    599     bool outer_brace = --braces_level < 0;
    600 
    601     /* As an undocumented Bison extension, append `;' before the last
    602        brace in braced code, so that the user code can omit trailing
    603        `;'.  But do not append `;' if emulating Yacc, since Yacc does
    604        not append one.
    605 
    606        FIXME: Bison should warn if a semicolon seems to be necessary
    607        here, and should omit the semicolon if it seems unnecessary
    608        (e.g., after ';', '{', or '}', each followed by comments or
    609        white space).  Such a warning shouldn't depend on --yacc; it
    610        should depend on a new --pedantic option, which would cause
    611        Bison to warn if it detects an extension to POSIX.  --pedantic
    612        should also diagnose other Bison extensions like %yacc.
    613        Perhaps there should also be a GCC-style --pedantic-errors
    614        option, so that such warnings are diagnosed as errors.  */
    615     if (outer_brace && token_type == BRACED_CODE && ! yacc_flag)
    616       obstack_1grow (&obstack_for_string, ';');
    617 
    618     obstack_1grow (&obstack_for_string, '}');
    619 
    620     if (outer_brace)
    621       {
    622 	STRING_FINISH;
    623 	loc->start = code_start;
    624 	val->chars = last_string;
    625 	increment_rule_length (*loc);
    626 	last_braced_code_loc = *loc;
    627 	BEGIN INITIAL;
    628 	return token_type;
    629       }
    630   }
    631 
    632   /* Tokenize `<<%' correctly (as `<<' `%') rather than incorrectly
    633      (as `<' `<%').  */
    634   "<"{splice}"<"  STRING_GROW;
    635 
    636   "$"("<"{tag}">")?(-?[0-9]+|"$")  handle_dollar (token_type, yytext, *loc);
    637   "@"(-?[0-9]+|"$")		   handle_at (token_type, yytext, *loc);
    638 
    639   "$"  {
    640     warn_at (*loc, _("stray `$'"));
    641     obstack_sgrow (&obstack_for_string, "$][");
    642   }
    643   "@"  {
    644     warn_at (*loc, _("stray `@'"));
    645     obstack_sgrow (&obstack_for_string, "@@");
    646   }
    647 
    648   <<EOF>>  unexpected_eof (code_start, "}"); BEGIN INITIAL;
    649 }
    650 
    651 
    652   /*--------------------------------------------------------------.
    653   | Scanning some prologue: from "%{" (already scanned) to "%}".  |
    654   `--------------------------------------------------------------*/
    655 
    656 <SC_PROLOGUE>
    657 {
    658   "%}" {
    659     STRING_FINISH;
    660     loc->start = code_start;
    661     val->chars = last_string;
    662     BEGIN INITIAL;
    663     return PROLOGUE;
    664   }
    665 
    666   <<EOF>>  unexpected_eof (code_start, "%}"); BEGIN INITIAL;
    667 }
    668 
    669 
    670   /*---------------------------------------------------------------.
    671   | Scanning the epilogue (everything after the second "%%", which |
    672   | has already been eaten).                                       |
    673   `---------------------------------------------------------------*/
    674 
    675 <SC_EPILOGUE>
    676 {
    677   <<EOF>> {
    678     STRING_FINISH;
    679     loc->start = code_start;
    680     val->chars = last_string;
    681     BEGIN INITIAL;
    682     return EPILOGUE;
    683   }
    684 }
    685 
    686 
    687   /*-----------------------------------------.
    688   | Escape M4 quoting characters in C code.  |
    689   `-----------------------------------------*/
    690 
    691 <SC_COMMENT,SC_LINE_COMMENT,SC_STRING,SC_CHARACTER,SC_BRACED_CODE,SC_PROLOGUE,SC_EPILOGUE>
    692 {
    693   \$	obstack_sgrow (&obstack_for_string, "$][");
    694   \@	obstack_sgrow (&obstack_for_string, "@@");
    695   \[	obstack_sgrow (&obstack_for_string, "@{");
    696   \]	obstack_sgrow (&obstack_for_string, "@}");
    697 }
    698 
    699 
    700   /*-----------------------------------------------------.
    701   | By default, grow the string obstack with the input.  |
    702   `-----------------------------------------------------*/
    703 
    704 <SC_COMMENT,SC_LINE_COMMENT,SC_BRACED_CODE,SC_PROLOGUE,SC_EPILOGUE,SC_STRING,SC_CHARACTER,SC_ESCAPED_STRING,SC_ESCAPED_CHARACTER>.	|
    705 <SC_COMMENT,SC_LINE_COMMENT,SC_BRACED_CODE,SC_PROLOGUE,SC_EPILOGUE>\n	STRING_GROW;
    706 
    707 %%
    708 
    709 /* Keeps track of the maximum number of semantic values to the left of
    710    a handle (those referenced by $0, $-1, etc.) that are required by the
    711    semantic actions of this grammar. */
    712 int max_left_semantic_context = 0;
    713 
    714 /* If BUF is null, add BUFSIZE (which in this case must be less than
    715    INT_MAX) to COLUMN; otherwise, add mbsnwidth (BUF, BUFSIZE, 0) to
    716    COLUMN.  If an overflow occurs, or might occur but is undetectable,
    717    return INT_MAX.  Assume COLUMN is nonnegative.  */
    718 
    719 static inline int
    720 add_column_width (int column, char const *buf, size_t bufsize)
    721 {
    722   size_t width;
    723   unsigned int remaining_columns = INT_MAX - column;
    724 
    725   if (buf)
    726     {
    727       if (INT_MAX / 2 <= bufsize)
    728 	return INT_MAX;
    729       width = mbsnwidth (buf, bufsize, 0);
    730     }
    731   else
    732     width = bufsize;
    733 
    734   return width <= remaining_columns ? column + width : INT_MAX;
    735 }
    736 
    737 /* Set *LOC and adjust scanner cursor to account for token TOKEN of
    738    size SIZE.  */
    739 
    740 static void
    741 adjust_location (location *loc, char const *token, size_t size)
    742 {
    743   int line = scanner_cursor.line;
    744   int column = scanner_cursor.column;
    745   char const *p0 = token;
    746   char const *p = token;
    747   char const *lim = token + size;
    748 
    749   loc->start = scanner_cursor;
    750 
    751   for (p = token; p < lim; p++)
    752     switch (*p)
    753       {
    754       case '\n':
    755 	line += line < INT_MAX;
    756 	column = 1;
    757 	p0 = p + 1;
    758 	break;
    759 
    760       case '\t':
    761 	column = add_column_width (column, p0, p - p0);
    762 	column = add_column_width (column, NULL, 8 - ((column - 1) & 7));
    763 	p0 = p + 1;
    764 	break;
    765 
    766       default:
    767 	break;
    768       }
    769 
    770   scanner_cursor.line = line;
    771   scanner_cursor.column = column = add_column_width (column, p0, p - p0);
    772 
    773   loc->end = scanner_cursor;
    774 
    775   if (line == INT_MAX && loc->start.line != INT_MAX)
    776     warn_at (*loc, _("line number overflow"));
    777   if (column == INT_MAX && loc->start.column != INT_MAX)
    778     warn_at (*loc, _("column number overflow"));
    779 }
    780 
    781 
    782 /* Read bytes from FP into buffer BUF of size SIZE.  Return the
    783    number of bytes read.  Remove '\r' from input, treating \r\n
    784    and isolated \r as \n.  */
    785 
    786 static size_t
    787 no_cr_read (FILE *fp, char *buf, size_t size)
    788 {
    789   size_t bytes_read = fread (buf, 1, size, fp);
    790   if (bytes_read)
    791     {
    792       char *w = memchr (buf, '\r', bytes_read);
    793       if (w)
    794 	{
    795 	  char const *r = ++w;
    796 	  char const *lim = buf + bytes_read;
    797 
    798 	  for (;;)
    799 	    {
    800 	      /* Found an '\r'.  Treat it like '\n', but ignore any
    801 		 '\n' that immediately follows.  */
    802 	      w[-1] = '\n';
    803 	      if (r == lim)
    804 		{
    805 		  int ch = getc (fp);
    806 		  if (ch != '\n' && ungetc (ch, fp) != ch)
    807 		    break;
    808 		}
    809 	      else if (*r == '\n')
    810 		r++;
    811 
    812 	      /* Copy until the next '\r'.  */
    813 	      do
    814 		{
    815 		  if (r == lim)
    816 		    return w - buf;
    817 		}
    818 	      while ((*w++ = *r++) != '\r');
    819 	    }
    820 
    821 	  return w - buf;
    822 	}
    823     }
    824 
    825   return bytes_read;
    826 }
    827 
    828 
    829 /*------------------------------------------------------------------.
    830 | TEXT is pointing to a wannabee semantic value (i.e., a `$').      |
    831 |                                                                   |
    832 | Possible inputs: $[<TYPENAME>]($|integer)                         |
    833 |                                                                   |
    834 | Output to OBSTACK_FOR_STRING a reference to this semantic value.  |
    835 `------------------------------------------------------------------*/
    836 
    837 static inline bool
    838 handle_action_dollar (char *text, location loc)
    839 {
    840   const char *type_name = NULL;
    841   char *cp = text + 1;
    842 
    843   if (! current_rule)
    844     return false;
    845 
    846   /* Get the type name if explicit. */
    847   if (*cp == '<')
    848     {
    849       type_name = ++cp;
    850       while (*cp != '>')
    851 	++cp;
    852       *cp = '\0';
    853       ++cp;
    854     }
    855 
    856   if (*cp == '$')
    857     {
    858       if (!type_name)
    859 	type_name = symbol_list_n_type_name_get (current_rule, loc, 0);
    860       if (!type_name && typed)
    861 	complain_at (loc, _("$$ of `%s' has no declared type"),
    862 		     current_rule->sym->tag);
    863       if (!type_name)
    864 	type_name = "";
    865       obstack_fgrow1 (&obstack_for_string,
    866 		      "]b4_lhs_value([%s])[", type_name);
    867       current_rule->used = true;
    868     }
    869   else
    870     {
    871       long int num = strtol (cp, NULL, 10);
    872 
    873       if (1 - INT_MAX + rule_length <= num && num <= rule_length)
    874 	{
    875 	  int n = num;
    876 	  if (max_left_semantic_context < 1 - n)
    877 	    max_left_semantic_context = 1 - n;
    878 	  if (!type_name && 0 < n)
    879 	    type_name = symbol_list_n_type_name_get (current_rule, loc, n);
    880 	  if (!type_name && typed)
    881 	    complain_at (loc, _("$%d of `%s' has no declared type"),
    882 			 n, current_rule->sym->tag);
    883 	  if (!type_name)
    884 	    type_name = "";
    885 	  obstack_fgrow3 (&obstack_for_string,
    886 			  "]b4_rhs_value(%d, %d, [%s])[",
    887 			  rule_length, n, type_name);
    888 	  symbol_list_n_used_set (current_rule, n, true);
    889 	}
    890       else
    891 	complain_at (loc, _("integer out of range: %s"), quote (text));
    892     }
    893 
    894   return true;
    895 }
    896 
    897 
    898 /*----------------------------------------------------------------.
    899 | Map `$?' onto the proper M4 symbol, depending on its TOKEN_TYPE |
    900 | (are we in an action?).                                         |
    901 `----------------------------------------------------------------*/
    902 
    903 static void
    904 handle_dollar (int token_type, char *text, location loc)
    905 {
    906   switch (token_type)
    907     {
    908     case BRACED_CODE:
    909       if (handle_action_dollar (text, loc))
    910 	return;
    911       break;
    912 
    913     case PERCENT_DESTRUCTOR:
    914     case PERCENT_INITIAL_ACTION:
    915     case PERCENT_PRINTER:
    916       if (text[1] == '$')
    917 	{
    918 	  obstack_sgrow (&obstack_for_string, "]b4_dollar_dollar[");
    919 	  return;
    920 	}
    921       break;
    922 
    923     default:
    924       break;
    925     }
    926 
    927   complain_at (loc, _("invalid value: %s"), quote (text));
    928 }
    929 
    930 
    931 /*------------------------------------------------------.
    932 | TEXT is a location token (i.e., a `@...').  Output to |
    933 | OBSTACK_FOR_STRING a reference to this location.      |
    934 `------------------------------------------------------*/
    935 
    936 static inline bool
    937 handle_action_at (char *text, location loc)
    938 {
    939   char *cp = text + 1;
    940   locations_flag = true;
    941 
    942   if (! current_rule)
    943     return false;
    944 
    945   if (*cp == '$')
    946     obstack_sgrow (&obstack_for_string, "]b4_lhs_location[");
    947   else
    948     {
    949       long int num = strtol (cp, NULL, 10);
    950 
    951       if (1 - INT_MAX + rule_length <= num && num <= rule_length)
    952 	{
    953 	  int n = num;
    954 	  obstack_fgrow2 (&obstack_for_string, "]b4_rhs_location(%d, %d)[",
    955 			  rule_length, n);
    956 	}
    957       else
    958 	complain_at (loc, _("integer out of range: %s"), quote (text));
    959     }
    960 
    961   return true;
    962 }
    963 
    964 
    965 /*----------------------------------------------------------------.
    966 | Map `@?' onto the proper M4 symbol, depending on its TOKEN_TYPE |
    967 | (are we in an action?).                                         |
    968 `----------------------------------------------------------------*/
    969 
    970 static void
    971 handle_at (int token_type, char *text, location loc)
    972 {
    973   switch (token_type)
    974     {
    975     case BRACED_CODE:
    976       handle_action_at (text, loc);
    977       return;
    978 
    979     case PERCENT_INITIAL_ACTION:
    980     case PERCENT_DESTRUCTOR:
    981     case PERCENT_PRINTER:
    982       if (text[1] == '$')
    983 	{
    984 	  obstack_sgrow (&obstack_for_string, "]b4_at_dollar[");
    985 	  return;
    986 	}
    987       break;
    988 
    989     default:
    990       break;
    991     }
    992 
    993   complain_at (loc, _("invalid value: %s"), quote (text));
    994 }
    995 
    996 
    997 /*------------------------------------------------------.
    998 | Scan NUMBER for a base-BASE integer at location LOC.  |
    999 `------------------------------------------------------*/
   1000 
   1001 static unsigned long int
   1002 scan_integer (char const *number, int base, location loc)
   1003 {
   1004   verify (INT_MAX < ULONG_MAX);
   1005   unsigned long int num = strtoul (number, NULL, base);
   1006 
   1007   if (INT_MAX < num)
   1008     {
   1009       complain_at (loc, _("integer out of range: %s"), quote (number));
   1010       num = INT_MAX;
   1011     }
   1012 
   1013   return num;
   1014 }
   1015 
   1016 
   1017 /*------------------------------------------------------------------.
   1018 | Convert universal character name UCN to a single-byte character,  |
   1019 | and return that character.  Return -1 if UCN does not correspond  |
   1020 | to a single-byte character.					    |
   1021 `------------------------------------------------------------------*/
   1022 
   1023 static int
   1024 convert_ucn_to_byte (char const *ucn)
   1025 {
   1026   verify (UCHAR_MAX <= INT_MAX);
   1027   unsigned long int code = strtoul (ucn + 2, NULL, 16);
   1028 
   1029   /* FIXME: Currently we assume Unicode-compatible unibyte characters
   1030      on ASCII hosts (i.e., Latin-1 on hosts with 8-bit bytes).  On
   1031      non-ASCII hosts we support only the portable C character set.
   1032      These limitations should be removed once we add support for
   1033      multibyte characters.  */
   1034 
   1035   if (UCHAR_MAX < code)
   1036     return -1;
   1037 
   1038 #if ! ('$' == 0x24 && '@' == 0x40 && '`' == 0x60 && '~' == 0x7e)
   1039   {
   1040     /* A non-ASCII host.  Use CODE to index into a table of the C
   1041        basic execution character set, which is guaranteed to exist on
   1042        all Standard C platforms.  This table also includes '$', '@',
   1043        and '`', which are not in the basic execution character set but
   1044        which are unibyte characters on all the platforms that we know
   1045        about.  */
   1046     static signed char const table[] =
   1047       {
   1048 	'\0',   -1,   -1,   -1,   -1,   -1,   -1, '\a',
   1049 	'\b', '\t', '\n', '\v', '\f', '\r',   -1,   -1,
   1050 	  -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
   1051 	  -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
   1052 	 ' ',  '!',  '"',  '#',  '$',  '%',  '&', '\'',
   1053 	 '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',
   1054 	 '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',
   1055 	 '8',  '9',  ':',  ';',  '<',  '=',  '>',  '?',
   1056 	 '@',  'A',  'B',  'C',  'D',  'E',  'F',  'G',
   1057 	 'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
   1058 	 'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',
   1059 	 'X',  'Y',  'Z',  '[', '\\',  ']',  '^',  '_',
   1060 	 '`',  'a',  'b',  'c',  'd',  'e',  'f',  'g',
   1061 	 'h',  'i',  'j',  'k',  'l',  'm',  'n',  'o',
   1062 	 'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
   1063 	 'x',  'y',  'z',  '{',  '|',  '}',  '~'
   1064       };
   1065 
   1066     code = code < sizeof table ? table[code] : -1;
   1067   }
   1068 #endif
   1069 
   1070   return code;
   1071 }
   1072 
   1073 
   1074 /*----------------------------------------------------------------.
   1075 | Handle `#line INT "FILE"'.  ARGS has already skipped `#line '.  |
   1076 `----------------------------------------------------------------*/
   1077 
   1078 static void
   1079 handle_syncline (char *args, location loc)
   1080 {
   1081   char *after_num;
   1082   unsigned long int lineno = strtoul (args, &after_num, 10);
   1083   char *file = strchr (after_num, '"') + 1;
   1084   *strchr (file, '"') = '\0';
   1085   if (INT_MAX <= lineno)
   1086     {
   1087       warn_at (loc, _("line number overflow"));
   1088       lineno = INT_MAX;
   1089     }
   1090   scanner_cursor.file = current_file = uniqstr_new (file);
   1091   scanner_cursor.line = lineno;
   1092   scanner_cursor.column = 1;
   1093 }
   1094 
   1095 
   1096 /*---------------------------------.
   1097 | Report a rule that is too long.  |
   1098 `---------------------------------*/
   1099 
   1100 static void
   1101 rule_length_overflow (location loc)
   1102 {
   1103   fatal_at (loc, _("rule is too long"));
   1104 }
   1105 
   1106 
   1107 /*----------------------------------------------------------------.
   1108 | For a token or comment starting at START, report message MSGID, |
   1109 | which should say that an end marker was found before		  |
   1110 | the expected TOKEN_END.					  |
   1111 `----------------------------------------------------------------*/
   1112 
   1113 static void
   1114 unexpected_end (boundary start, char const *msgid, char const *token_end)
   1115 {
   1116   location loc;
   1117   loc.start = start;
   1118   loc.end = scanner_cursor;
   1119   complain_at (loc, _(msgid), token_end);
   1120 }
   1121 
   1122 
   1123 /*------------------------------------------------------------------------.
   1124 | Report an unexpected EOF in a token or comment starting at START.       |
   1125 | An end of file was encountered and the expected TOKEN_END was missing.  |
   1126 `------------------------------------------------------------------------*/
   1127 
   1128 static void
   1129 unexpected_eof (boundary start, char const *token_end)
   1130 {
   1131   unexpected_end (start, N_("missing `%s' at end of file"), token_end);
   1132 }
   1133 
   1134 
   1135 /*----------------------------------------.
   1136 | Likewise, but for unexpected newlines.  |
   1137 `----------------------------------------*/
   1138 
   1139 static void
   1140 unexpected_newline (boundary start, char const *token_end)
   1141 {
   1142   unexpected_end (start, N_("missing `%s' at end of line"), token_end);
   1143 }
   1144 
   1145 
   1146 /*-------------------------.
   1147 | Initialize the scanner.  |
   1148 `-------------------------*/
   1149 
   1150 void
   1151 scanner_initialize (void)
   1152 {
   1153   obstack_init (&obstack_for_string);
   1154 }
   1155 
   1156 
   1157 /*-----------------------------------------------.
   1158 | Free all the memory allocated to the scanner.  |
   1159 `-----------------------------------------------*/
   1160 
   1161 void
   1162 scanner_free (void)
   1163 {
   1164   obstack_free (&obstack_for_string, 0);
   1165   /* Reclaim Flex's buffers.  */
   1166   yy_delete_buffer (YY_CURRENT_BUFFER);
   1167 }
   1168