Home | History | Annotate | Download | only in flex-2.5.4a
      1 /* nfa - NFA construction routines */
      2 
      3 /*-
      4  * Copyright (c) 1990 The Regents of the University of California.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to Berkeley by
      8  * Vern Paxson.
      9  *
     10  * The United States Government has rights in this work pursuant
     11  * to contract no. DE-AC03-76SF00098 between the United States
     12  * Department of Energy and the University of California.
     13  *
     14  * Redistribution and use in source and binary forms with or without
     15  * modification are permitted provided that: (1) source distributions retain
     16  * this entire copyright notice and comment, and (2) distributions including
     17  * binaries display the following acknowledgement:  ``This product includes
     18  * software developed by the University of California, Berkeley and its
     19  * contributors'' in the documentation or other materials provided with the
     20  * distribution and in all advertising materials mentioning features or use
     21  * of this software.  Neither the name of the University nor the names of
     22  * its contributors may be used to endorse or promote products derived from
     23  * this software without specific prior written permission.
     24  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
     25  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
     26  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
     27  */
     28 
     29 /* $Header: /home/daffy/u0/vern/flex/RCS/nfa.c,v 2.17 95/03/04 16:11:42 vern Exp $ */
     30 
     31 #include "flexdef.h"
     32 
     33 
     34 /* declare functions that have forward references */
     35 
     36 int dupmachine PROTO((int));
     37 void mkxtion PROTO((int, int));
     38 
     39 
     40 /* add_accept - add an accepting state to a machine
     41  *
     42  * accepting_number becomes mach's accepting number.
     43  */
     44 
     45 void add_accept( mach, accepting_number )
     46 int mach, accepting_number;
     47 	{
     48 	/* Hang the accepting number off an epsilon state.  if it is associated
     49 	 * with a state that has a non-epsilon out-transition, then the state
     50 	 * will accept BEFORE it makes that transition, i.e., one character
     51 	 * too soon.
     52 	 */
     53 
     54 	if ( transchar[finalst[mach]] == SYM_EPSILON )
     55 		accptnum[finalst[mach]] = accepting_number;
     56 
     57 	else
     58 		{
     59 		int astate = mkstate( SYM_EPSILON );
     60 		accptnum[astate] = accepting_number;
     61 		(void) link_machines( mach, astate );
     62 		}
     63 	}
     64 
     65 
     66 /* copysingl - make a given number of copies of a singleton machine
     67  *
     68  * synopsis
     69  *
     70  *   newsng = copysingl( singl, num );
     71  *
     72  *     newsng - a new singleton composed of num copies of singl
     73  *     singl  - a singleton machine
     74  *     num    - the number of copies of singl to be present in newsng
     75  */
     76 
     77 int copysingl( singl, num )
     78 int singl, num;
     79 	{
     80 	int copy, i;
     81 
     82 	copy = mkstate( SYM_EPSILON );
     83 
     84 	for ( i = 1; i <= num; ++i )
     85 		copy = link_machines( copy, dupmachine( singl ) );
     86 
     87 	return copy;
     88 	}
     89 
     90 
     91 /* dumpnfa - debugging routine to write out an nfa */
     92 
     93 void dumpnfa( state1 )
     94 int state1;
     95 
     96 	{
     97 	int sym, tsp1, tsp2, anum, ns;
     98 
     99 	fprintf( stderr,
    100 	_( "\n\n********** beginning dump of nfa with start state %d\n" ),
    101 		state1 );
    102 
    103 	/* We probably should loop starting at firstst[state1] and going to
    104 	 * lastst[state1], but they're not maintained properly when we "or"
    105 	 * all of the rules together.  So we use our knowledge that the machine
    106 	 * starts at state 1 and ends at lastnfa.
    107 	 */
    108 
    109 	/* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */
    110 	for ( ns = 1; ns <= lastnfa; ++ns )
    111 		{
    112 		fprintf( stderr, _( "state # %4d\t" ), ns );
    113 
    114 		sym = transchar[ns];
    115 		tsp1 = trans1[ns];
    116 		tsp2 = trans2[ns];
    117 		anum = accptnum[ns];
    118 
    119 		fprintf( stderr, "%3d:  %4d, %4d", sym, tsp1, tsp2 );
    120 
    121 		if ( anum != NIL )
    122 			fprintf( stderr, "  [%d]", anum );
    123 
    124 		fprintf( stderr, "\n" );
    125 		}
    126 
    127 	fprintf( stderr, _( "********** end of dump\n" ) );
    128 	}
    129 
    130 
    131 /* dupmachine - make a duplicate of a given machine
    132  *
    133  * synopsis
    134  *
    135  *   copy = dupmachine( mach );
    136  *
    137  *     copy - holds duplicate of mach
    138  *     mach - machine to be duplicated
    139  *
    140  * note that the copy of mach is NOT an exact duplicate; rather, all the
    141  * transition states values are adjusted so that the copy is self-contained,
    142  * as the original should have been.
    143  *
    144  * also note that the original MUST be contiguous, with its low and high
    145  * states accessible by the arrays firstst and lastst
    146  */
    147 
    148 int dupmachine( mach )
    149 int mach;
    150 	{
    151 	int i, init, state_offset;
    152 	int state = 0;
    153 	int last = lastst[mach];
    154 
    155 	for ( i = firstst[mach]; i <= last; ++i )
    156 		{
    157 		state = mkstate( transchar[i] );
    158 
    159 		if ( trans1[i] != NO_TRANSITION )
    160 			{
    161 			mkxtion( finalst[state], trans1[i] + state - i );
    162 
    163 			if ( transchar[i] == SYM_EPSILON &&
    164 			     trans2[i] != NO_TRANSITION )
    165 				mkxtion( finalst[state],
    166 					trans2[i] + state - i );
    167 			}
    168 
    169 		accptnum[state] = accptnum[i];
    170 		}
    171 
    172 	if ( state == 0 )
    173 		flexfatal( _( "empty machine in dupmachine()" ) );
    174 
    175 	state_offset = state - i + 1;
    176 
    177 	init = mach + state_offset;
    178 	firstst[init] = firstst[mach] + state_offset;
    179 	finalst[init] = finalst[mach] + state_offset;
    180 	lastst[init] = lastst[mach] + state_offset;
    181 
    182 	return init;
    183 	}
    184 
    185 
    186 /* finish_rule - finish up the processing for a rule
    187  *
    188  * An accepting number is added to the given machine.  If variable_trail_rule
    189  * is true then the rule has trailing context and both the head and trail
    190  * are variable size.  Otherwise if headcnt or trailcnt is non-zero then
    191  * the machine recognizes a pattern with trailing context and headcnt is
    192  * the number of characters in the matched part of the pattern, or zero
    193  * if the matched part has variable length.  trailcnt is the number of
    194  * trailing context characters in the pattern, or zero if the trailing
    195  * context has variable length.
    196  */
    197 
    198 void finish_rule( mach, variable_trail_rule, headcnt, trailcnt )
    199 int mach, variable_trail_rule, headcnt, trailcnt;
    200 	{
    201 	char action_text[MAXLINE];
    202 
    203 	add_accept( mach, num_rules );
    204 
    205 	/* We did this in new_rule(), but it often gets the wrong
    206 	 * number because we do it before we start parsing the current rule.
    207 	 */
    208 	rule_linenum[num_rules] = linenum;
    209 
    210 	/* If this is a continued action, then the line-number has already
    211 	 * been updated, giving us the wrong number.
    212 	 */
    213 	if ( continued_action )
    214 		--rule_linenum[num_rules];
    215 
    216 	sprintf( action_text, "case %d:\n", num_rules );
    217 	add_action( action_text );
    218 
    219 	if ( variable_trail_rule )
    220 		{
    221 		rule_type[num_rules] = RULE_VARIABLE;
    222 
    223 		if ( performance_report > 0 )
    224 			fprintf( stderr,
    225 			_( "Variable trailing context rule at line %d\n" ),
    226 				rule_linenum[num_rules] );
    227 
    228 		variable_trailing_context_rules = true;
    229 		}
    230 
    231 	else
    232 		{
    233 		rule_type[num_rules] = RULE_NORMAL;
    234 
    235 		if ( headcnt > 0 || trailcnt > 0 )
    236 			{
    237 			/* Do trailing context magic to not match the trailing
    238 			 * characters.
    239 			 */
    240 			char *scanner_cp = "yy_c_buf_p = yy_cp";
    241 			char *scanner_bp = "yy_bp";
    242 
    243 			add_action(
    244 	"*yy_cp = yy_hold_char; /* undo effects of setting up yytext */\n" );
    245 
    246 			if ( headcnt > 0 )
    247 				{
    248 				sprintf( action_text, "%s = %s + %d;\n",
    249 				scanner_cp, scanner_bp, headcnt );
    250 				add_action( action_text );
    251 				}
    252 
    253 			else
    254 				{
    255 				sprintf( action_text, "%s -= %d;\n",
    256 					scanner_cp, trailcnt );
    257 				add_action( action_text );
    258 				}
    259 
    260 			add_action(
    261 			"YY_DO_BEFORE_ACTION; /* set up yytext again */\n" );
    262 			}
    263 		}
    264 
    265 	/* Okay, in the action code at this point yytext and yyleng have
    266 	 * their proper final values for this rule, so here's the point
    267 	 * to do any user action.  But don't do it for continued actions,
    268 	 * as that'll result in multiple YY_RULE_SETUP's.
    269 	 */
    270 	if ( ! continued_action )
    271 		add_action( "YY_RULE_SETUP\n" );
    272 
    273 	line_directive_out( (FILE *) 0, 1 );
    274 	}
    275 
    276 
    277 /* link_machines - connect two machines together
    278  *
    279  * synopsis
    280  *
    281  *   new = link_machines( first, last );
    282  *
    283  *     new    - a machine constructed by connecting first to last
    284  *     first  - the machine whose successor is to be last
    285  *     last   - the machine whose predecessor is to be first
    286  *
    287  * note: this routine concatenates the machine first with the machine
    288  *  last to produce a machine new which will pattern-match first first
    289  *  and then last, and will fail if either of the sub-patterns fails.
    290  *  FIRST is set to new by the operation.  last is unmolested.
    291  */
    292 
    293 int link_machines( first, last )
    294 int first, last;
    295 	{
    296 	if ( first == NIL )
    297 		return last;
    298 
    299 	else if ( last == NIL )
    300 		return first;
    301 
    302 	else
    303 		{
    304 		mkxtion( finalst[first], last );
    305 		finalst[first] = finalst[last];
    306 		lastst[first] = MAX( lastst[first], lastst[last] );
    307 		firstst[first] = MIN( firstst[first], firstst[last] );
    308 
    309 		return first;
    310 		}
    311 	}
    312 
    313 
    314 /* mark_beginning_as_normal - mark each "beginning" state in a machine
    315  *                            as being a "normal" (i.e., not trailing context-
    316  *                            associated) states
    317  *
    318  * The "beginning" states are the epsilon closure of the first state
    319  */
    320 
    321 void mark_beginning_as_normal( mach )
    322 register int mach;
    323 	{
    324 	switch ( state_type[mach] )
    325 		{
    326 		case STATE_NORMAL:
    327 			/* Oh, we've already visited here. */
    328 			return;
    329 
    330 		case STATE_TRAILING_CONTEXT:
    331 			state_type[mach] = STATE_NORMAL;
    332 
    333 			if ( transchar[mach] == SYM_EPSILON )
    334 				{
    335 				if ( trans1[mach] != NO_TRANSITION )
    336 					mark_beginning_as_normal(
    337 						trans1[mach] );
    338 
    339 				if ( trans2[mach] != NO_TRANSITION )
    340 					mark_beginning_as_normal(
    341 						trans2[mach] );
    342 				}
    343 			break;
    344 
    345 		default:
    346 			flexerror(
    347 			_( "bad state type in mark_beginning_as_normal()" ) );
    348 			break;
    349 		}
    350 	}
    351 
    352 
    353 /* mkbranch - make a machine that branches to two machines
    354  *
    355  * synopsis
    356  *
    357  *   branch = mkbranch( first, second );
    358  *
    359  *     branch - a machine which matches either first's pattern or second's
    360  *     first, second - machines whose patterns are to be or'ed (the | operator)
    361  *
    362  * Note that first and second are NEITHER destroyed by the operation.  Also,
    363  * the resulting machine CANNOT be used with any other "mk" operation except
    364  * more mkbranch's.  Compare with mkor()
    365  */
    366 
    367 int mkbranch( first, second )
    368 int first, second;
    369 	{
    370 	int eps;
    371 
    372 	if ( first == NO_TRANSITION )
    373 		return second;
    374 
    375 	else if ( second == NO_TRANSITION )
    376 		return first;
    377 
    378 	eps = mkstate( SYM_EPSILON );
    379 
    380 	mkxtion( eps, first );
    381 	mkxtion( eps, second );
    382 
    383 	return eps;
    384 	}
    385 
    386 
    387 /* mkclos - convert a machine into a closure
    388  *
    389  * synopsis
    390  *   new = mkclos( state );
    391  *
    392  * new - a new state which matches the closure of "state"
    393  */
    394 
    395 int mkclos( state )
    396 int state;
    397 	{
    398 	return mkopt( mkposcl( state ) );
    399 	}
    400 
    401 
    402 /* mkopt - make a machine optional
    403  *
    404  * synopsis
    405  *
    406  *   new = mkopt( mach );
    407  *
    408  *     new  - a machine which optionally matches whatever mach matched
    409  *     mach - the machine to make optional
    410  *
    411  * notes:
    412  *     1. mach must be the last machine created
    413  *     2. mach is destroyed by the call
    414  */
    415 
    416 int mkopt( mach )
    417 int mach;
    418 	{
    419 	int eps;
    420 
    421 	if ( ! SUPER_FREE_EPSILON(finalst[mach]) )
    422 		{
    423 		eps = mkstate( SYM_EPSILON );
    424 		mach = link_machines( mach, eps );
    425 		}
    426 
    427 	/* Can't skimp on the following if FREE_EPSILON(mach) is true because
    428 	 * some state interior to "mach" might point back to the beginning
    429 	 * for a closure.
    430 	 */
    431 	eps = mkstate( SYM_EPSILON );
    432 	mach = link_machines( eps, mach );
    433 
    434 	mkxtion( mach, finalst[mach] );
    435 
    436 	return mach;
    437 	}
    438 
    439 
    440 /* mkor - make a machine that matches either one of two machines
    441  *
    442  * synopsis
    443  *
    444  *   new = mkor( first, second );
    445  *
    446  *     new - a machine which matches either first's pattern or second's
    447  *     first, second - machines whose patterns are to be or'ed (the | operator)
    448  *
    449  * note that first and second are both destroyed by the operation
    450  * the code is rather convoluted because an attempt is made to minimize
    451  * the number of epsilon states needed
    452  */
    453 
    454 int mkor( first, second )
    455 int first, second;
    456 	{
    457 	int eps, orend;
    458 
    459 	if ( first == NIL )
    460 		return second;
    461 
    462 	else if ( second == NIL )
    463 		return first;
    464 
    465 	else
    466 		{
    467 		/* See comment in mkopt() about why we can't use the first
    468 		 * state of "first" or "second" if they satisfy "FREE_EPSILON".
    469 		 */
    470 		eps = mkstate( SYM_EPSILON );
    471 
    472 		first = link_machines( eps, first );
    473 
    474 		mkxtion( first, second );
    475 
    476 		if ( SUPER_FREE_EPSILON(finalst[first]) &&
    477 		     accptnum[finalst[first]] == NIL )
    478 			{
    479 			orend = finalst[first];
    480 			mkxtion( finalst[second], orend );
    481 			}
    482 
    483 		else if ( SUPER_FREE_EPSILON(finalst[second]) &&
    484 			  accptnum[finalst[second]] == NIL )
    485 			{
    486 			orend = finalst[second];
    487 			mkxtion( finalst[first], orend );
    488 			}
    489 
    490 		else
    491 			{
    492 			eps = mkstate( SYM_EPSILON );
    493 
    494 			first = link_machines( first, eps );
    495 			orend = finalst[first];
    496 
    497 			mkxtion( finalst[second], orend );
    498 			}
    499 		}
    500 
    501 	finalst[first] = orend;
    502 	return first;
    503 	}
    504 
    505 
    506 /* mkposcl - convert a machine into a positive closure
    507  *
    508  * synopsis
    509  *   new = mkposcl( state );
    510  *
    511  *    new - a machine matching the positive closure of "state"
    512  */
    513 
    514 int mkposcl( state )
    515 int state;
    516 	{
    517 	int eps;
    518 
    519 	if ( SUPER_FREE_EPSILON(finalst[state]) )
    520 		{
    521 		mkxtion( finalst[state], state );
    522 		return state;
    523 		}
    524 
    525 	else
    526 		{
    527 		eps = mkstate( SYM_EPSILON );
    528 		mkxtion( eps, state );
    529 		return link_machines( state, eps );
    530 		}
    531 	}
    532 
    533 
    534 /* mkrep - make a replicated machine
    535  *
    536  * synopsis
    537  *   new = mkrep( mach, lb, ub );
    538  *
    539  *    new - a machine that matches whatever "mach" matched from "lb"
    540  *          number of times to "ub" number of times
    541  *
    542  * note
    543  *   if "ub" is INFINITY then "new" matches "lb" or more occurrences of "mach"
    544  */
    545 
    546 int mkrep( mach, lb, ub )
    547 int mach, lb, ub;
    548 	{
    549 	int base_mach, tail, copy, i;
    550 
    551 	base_mach = copysingl( mach, lb - 1 );
    552 
    553 	if ( ub == INFINITY )
    554 		{
    555 		copy = dupmachine( mach );
    556 		mach = link_machines( mach,
    557 		link_machines( base_mach, mkclos( copy ) ) );
    558 		}
    559 
    560 	else
    561 		{
    562 		tail = mkstate( SYM_EPSILON );
    563 
    564 		for ( i = lb; i < ub; ++i )
    565 			{
    566 			copy = dupmachine( mach );
    567 			tail = mkopt( link_machines( copy, tail ) );
    568 			}
    569 
    570 		mach = link_machines( mach, link_machines( base_mach, tail ) );
    571 		}
    572 
    573 	return mach;
    574 	}
    575 
    576 
    577 /* mkstate - create a state with a transition on a given symbol
    578  *
    579  * synopsis
    580  *
    581  *   state = mkstate( sym );
    582  *
    583  *     state - a new state matching sym
    584  *     sym   - the symbol the new state is to have an out-transition on
    585  *
    586  * note that this routine makes new states in ascending order through the
    587  * state array (and increments LASTNFA accordingly).  The routine DUPMACHINE
    588  * relies on machines being made in ascending order and that they are
    589  * CONTIGUOUS.  Change it and you will have to rewrite DUPMACHINE (kludge
    590  * that it admittedly is)
    591  */
    592 
    593 int mkstate( sym )
    594 int sym;
    595 	{
    596 	if ( ++lastnfa >= current_mns )
    597 		{
    598 		if ( (current_mns += MNS_INCREMENT) >= MAXIMUM_MNS )
    599 			lerrif(
    600 		_( "input rules are too complicated (>= %d NFA states)" ),
    601 				current_mns );
    602 
    603 		++num_reallocs;
    604 
    605 		firstst = reallocate_integer_array( firstst, current_mns );
    606 		lastst = reallocate_integer_array( lastst, current_mns );
    607 		finalst = reallocate_integer_array( finalst, current_mns );
    608 		transchar = reallocate_integer_array( transchar, current_mns );
    609 		trans1 = reallocate_integer_array( trans1, current_mns );
    610 		trans2 = reallocate_integer_array( trans2, current_mns );
    611 		accptnum = reallocate_integer_array( accptnum, current_mns );
    612 		assoc_rule =
    613 			reallocate_integer_array( assoc_rule, current_mns );
    614 		state_type =
    615 			reallocate_integer_array( state_type, current_mns );
    616 		}
    617 
    618 	firstst[lastnfa] = lastnfa;
    619 	finalst[lastnfa] = lastnfa;
    620 	lastst[lastnfa] = lastnfa;
    621 	transchar[lastnfa] = sym;
    622 	trans1[lastnfa] = NO_TRANSITION;
    623 	trans2[lastnfa] = NO_TRANSITION;
    624 	accptnum[lastnfa] = NIL;
    625 	assoc_rule[lastnfa] = num_rules;
    626 	state_type[lastnfa] = current_state_type;
    627 
    628 	/* Fix up equivalence classes base on this transition.  Note that any
    629 	 * character which has its own transition gets its own equivalence
    630 	 * class.  Thus only characters which are only in character classes
    631 	 * have a chance at being in the same equivalence class.  E.g. "a|b"
    632 	 * puts 'a' and 'b' into two different equivalence classes.  "[ab]"
    633 	 * puts them in the same equivalence class (barring other differences
    634 	 * elsewhere in the input).
    635 	 */
    636 
    637 	if ( sym < 0 )
    638 		{
    639 		/* We don't have to update the equivalence classes since
    640 		 * that was already done when the ccl was created for the
    641 		 * first time.
    642 		 */
    643 		}
    644 
    645 	else if ( sym == SYM_EPSILON )
    646 		++numeps;
    647 
    648 	else
    649 		{
    650 		check_char( sym );
    651 
    652 		if ( useecs )
    653 			/* Map NUL's to csize. */
    654 			mkechar( sym ? sym : csize, nextecm, ecgroup );
    655 		}
    656 
    657 	return lastnfa;
    658 	}
    659 
    660 
    661 /* mkxtion - make a transition from one state to another
    662  *
    663  * synopsis
    664  *
    665  *   mkxtion( statefrom, stateto );
    666  *
    667  *     statefrom - the state from which the transition is to be made
    668  *     stateto   - the state to which the transition is to be made
    669  */
    670 
    671 void mkxtion( statefrom, stateto )
    672 int statefrom, stateto;
    673 	{
    674 	if ( trans1[statefrom] == NO_TRANSITION )
    675 		trans1[statefrom] = stateto;
    676 
    677 	else if ( (transchar[statefrom] != SYM_EPSILON) ||
    678 		  (trans2[statefrom] != NO_TRANSITION) )
    679 		flexfatal( _( "found too many transitions in mkxtion()" ) );
    680 
    681 	else
    682 		{ /* second out-transition for an epsilon state */
    683 		++eps2;
    684 		trans2[statefrom] = stateto;
    685 		}
    686 	}
    687 
    688 /* new_rule - initialize for a new rule */
    689 
    690 void new_rule()
    691 	{
    692 	if ( ++num_rules >= current_max_rules )
    693 		{
    694 		++num_reallocs;
    695 		current_max_rules += MAX_RULES_INCREMENT;
    696 		rule_type = reallocate_integer_array( rule_type,
    697 							current_max_rules );
    698 		rule_linenum = reallocate_integer_array( rule_linenum,
    699 							current_max_rules );
    700 		rule_useful = reallocate_integer_array( rule_useful,
    701 							current_max_rules );
    702 		}
    703 
    704 	if ( num_rules > MAX_RULE )
    705 		lerrif( _( "too many rules (> %d)!" ), MAX_RULE );
    706 
    707 	rule_linenum[num_rules] = linenum;
    708 	rule_useful[num_rules] = false;
    709 	}
    710