Home | History | Annotate | Download | only in one-true-awk
      1 /****************************************************************
      2 Copyright (C) Lucent Technologies 1997
      3 All Rights Reserved
      4 
      5 Permission to use, copy, modify, and distribute this software and
      6 its documentation for any purpose and without fee is hereby
      7 granted, provided that the above copyright notice appear in all
      8 copies and that both that the copyright notice and this
      9 permission notice and warranty disclaimer appear in supporting
     10 documentation, and that the name Lucent Technologies or any of
     11 its entities not be used in advertising or publicity pertaining
     12 to distribution of the software without specific, written prior
     13 permission.
     14 
     15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
     16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
     17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
     18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
     20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
     21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
     22 THIS SOFTWARE.
     23 ****************************************************************/
     24 
     25 /* lasciate ogne speranza, voi ch'intrate. */
     26 
     27 #define	DEBUG
     28 
     29 #include <ctype.h>
     30 #include <limits.h>
     31 #include <stdio.h>
     32 #include <string.h>
     33 #include <stdlib.h>
     34 #include "awk.h"
     35 #include "ytab.h"
     36 
     37 #define	HAT	(NCHARS+2)	/* matches ^ in regular expr */
     38 				/* NCHARS is 2**n */
     39 #define MAXLIN 22
     40 
     41 #define type(v)		(v)->nobj	/* badly overloaded here */
     42 #define info(v)		(v)->ntype	/* badly overloaded here */
     43 #define left(v)		(v)->narg[0]
     44 #define right(v)	(v)->narg[1]
     45 #define parent(v)	(v)->nnext
     46 
     47 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
     48 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
     49 #define UNARY	case STAR: case PLUS: case QUEST:
     50 
     51 /* encoding in tree Nodes:
     52 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
     53 		left is index, right contains value or pointer to value
     54 	unary (STAR, PLUS, QUEST): left is child, right is null
     55 	binary (CAT, OR): left and right are children
     56 	parent contains pointer to parent
     57 */
     58 
     59 
     60 int	*setvec;
     61 int	*tmpset;
     62 int	maxsetvec = 0;
     63 
     64 int	rtok;		/* next token in current re */
     65 int	rlxval;
     66 static uschar	*rlxstr;
     67 static uschar	*prestr;	/* current position in current re */
     68 static uschar	*lastre;	/* origin of last re */
     69 
     70 static	int setcnt;
     71 static	int poscnt;
     72 
     73 char	*patbeg;
     74 int	patlen;
     75 
     76 #define	NFA	20	/* cache this many dynamic fa's */
     77 fa	*fatab[NFA];
     78 int	nfatab	= 0;	/* entries in fatab */
     79 
     80 fa *makedfa(const char *s, int anchor)	/* returns dfa for reg expr s */
     81 {
     82 	int i, use, nuse;
     83 	fa *pfa;
     84 	static int now = 1;
     85 
     86 	if (setvec == 0) {	/* first time through any RE */
     87 		maxsetvec = MAXLIN;
     88 		setvec = (int *) malloc(maxsetvec * sizeof(int));
     89 		tmpset = (int *) malloc(maxsetvec * sizeof(int));
     90 		if (setvec == 0 || tmpset == 0)
     91 			overflo("out of space initializing makedfa");
     92 	}
     93 
     94 	if (compile_time)	/* a constant for sure */
     95 		return mkdfa(s, anchor);
     96 	for (i = 0; i < nfatab; i++)	/* is it there already? */
     97 		if (fatab[i]->anchor == anchor
     98 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
     99 			fatab[i]->use = now++;
    100 			return fatab[i];
    101 		}
    102 	pfa = mkdfa(s, anchor);
    103 	if (nfatab < NFA) {	/* room for another */
    104 		fatab[nfatab] = pfa;
    105 		fatab[nfatab]->use = now++;
    106 		nfatab++;
    107 		return pfa;
    108 	}
    109 	use = fatab[0]->use;	/* replace least-recently used */
    110 	nuse = 0;
    111 	for (i = 1; i < nfatab; i++)
    112 		if (fatab[i]->use < use) {
    113 			use = fatab[i]->use;
    114 			nuse = i;
    115 		}
    116 	freefa(fatab[nuse]);
    117 	fatab[nuse] = pfa;
    118 	pfa->use = now++;
    119 	return pfa;
    120 }
    121 
    122 fa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
    123 				/* anchor = 1 for anchored matches, else 0 */
    124 {
    125 	Node *p, *p1;
    126 	fa *f;
    127 
    128 	p = reparse(s);
    129 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
    130 		/* put ALL STAR in front of reg.  exp. */
    131 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
    132 		/* put FINAL after reg.  exp. */
    133 
    134 	poscnt = 0;
    135 	penter(p1);	/* enter parent pointers and leaf indices */
    136 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
    137 		overflo("out of space for fa");
    138 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
    139 	cfoll(f, p1);	/* set up follow sets */
    140 	freetr(p1);
    141 	if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
    142 			overflo("out of space in makedfa");
    143 	if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
    144 		overflo("out of space in makedfa");
    145 	*f->posns[1] = 0;
    146 	f->initstat = makeinit(f, anchor);
    147 	f->anchor = anchor;
    148 	f->restr = (uschar *) tostring(s);
    149 	return f;
    150 }
    151 
    152 int makeinit(fa *f, int anchor)
    153 {
    154 	int i, k;
    155 
    156 	f->curstat = 2;
    157 	f->out[2] = 0;
    158 	f->reset = 0;
    159 	k = *(f->re[0].lfollow);
    160 	xfree(f->posns[2]);
    161 	if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
    162 		overflo("out of space in makeinit");
    163 	for (i=0; i <= k; i++) {
    164 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
    165 	}
    166 	if ((f->posns[2])[1] == f->accept)
    167 		f->out[2] = 1;
    168 	for (i=0; i < NCHARS; i++)
    169 		f->gototab[2][i] = 0;
    170 	f->curstat = cgoto(f, 2, HAT);
    171 	if (anchor) {
    172 		*f->posns[2] = k-1;	/* leave out position 0 */
    173 		for (i=0; i < k; i++) {
    174 			(f->posns[0])[i] = (f->posns[2])[i];
    175 		}
    176 
    177 		f->out[0] = f->out[2];
    178 		if (f->curstat != 2)
    179 			--(*f->posns[f->curstat]);
    180 	}
    181 	return f->curstat;
    182 }
    183 
    184 void penter(Node *p)	/* set up parent pointers and leaf indices */
    185 {
    186 	switch (type(p)) {
    187 	ELEAF
    188 	LEAF
    189 		info(p) = poscnt;
    190 		poscnt++;
    191 		break;
    192 	UNARY
    193 		penter(left(p));
    194 		parent(left(p)) = p;
    195 		break;
    196 	case CAT:
    197 	case OR:
    198 		penter(left(p));
    199 		penter(right(p));
    200 		parent(left(p)) = p;
    201 		parent(right(p)) = p;
    202 		break;
    203 	default:	/* can't happen */
    204 		FATAL("can't happen: unknown type %d in penter", type(p));
    205 		break;
    206 	}
    207 }
    208 
    209 void freetr(Node *p)	/* free parse tree */
    210 {
    211 	switch (type(p)) {
    212 	ELEAF
    213 	LEAF
    214 		xfree(p);
    215 		break;
    216 	UNARY
    217 		freetr(left(p));
    218 		xfree(p);
    219 		break;
    220 	case CAT:
    221 	case OR:
    222 		freetr(left(p));
    223 		freetr(right(p));
    224 		xfree(p);
    225 		break;
    226 	default:	/* can't happen */
    227 		FATAL("can't happen: unknown type %d in freetr", type(p));
    228 		break;
    229 	}
    230 }
    231 
    232 /* in the parsing of regular expressions, metacharacters like . have */
    233 /* to be seen literally;  \056 is not a metacharacter. */
    234 
    235 int hexstr(uschar **pp)	/* find and eval hex string at pp, return new p */
    236 {			/* only pick up one 8-bit byte (2 chars) */
    237 	uschar *p;
    238 	int n = 0;
    239 	int i;
    240 
    241 	for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) {
    242 		if (isdigit(*p))
    243 			n = 16 * n + *p - '0';
    244 		else if (*p >= 'a' && *p <= 'f')
    245 			n = 16 * n + *p - 'a' + 10;
    246 		else if (*p >= 'A' && *p <= 'F')
    247 			n = 16 * n + *p - 'A' + 10;
    248 	}
    249 	*pp = (uschar *) p;
    250 	return n;
    251 }
    252 
    253 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
    254 
    255 int quoted(uschar **pp)	/* pick up next thing after a \\ */
    256 			/* and increment *pp */
    257 {
    258 	uschar *p = *pp;
    259 	int c;
    260 
    261 	if ((c = *p++) == 't')
    262 		c = '\t';
    263 	else if (c == 'n')
    264 		c = '\n';
    265 	else if (c == 'f')
    266 		c = '\f';
    267 	else if (c == 'r')
    268 		c = '\r';
    269 	else if (c == 'b')
    270 		c = '\b';
    271 	else if (c == '\\')
    272 		c = '\\';
    273 	else if (c == 'x') {	/* hexadecimal goo follows */
    274 		c = hexstr(&p);	/* this adds a null if number is invalid */
    275 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
    276 		int n = c - '0';
    277 		if (isoctdigit(*p)) {
    278 			n = 8 * n + *p++ - '0';
    279 			if (isoctdigit(*p))
    280 				n = 8 * n + *p++ - '0';
    281 		}
    282 		c = n;
    283 	} /* else */
    284 		/* c = c; */
    285 	*pp = p;
    286 	return c;
    287 }
    288 
    289 char *cclenter(const char *argp)	/* add a character class */
    290 {
    291 	int i, c, c2;
    292 	uschar *p = (uschar *) argp;
    293 	uschar *op, *bp;
    294 	static uschar *buf = 0;
    295 	static int bufsz = 100;
    296 
    297 	op = p;
    298 	if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
    299 		FATAL("out of space for character class [%.10s...] 1", p);
    300 	bp = buf;
    301 	for (i = 0; (c = *p++) != 0; ) {
    302 		if (c == '\\') {
    303 			c = quoted(&p);
    304 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
    305 			if (*p != 0) {
    306 				c = bp[-1];
    307 				c2 = *p++;
    308 				if (c2 == '\\')
    309 					c2 = quoted(&p);
    310 				if (c > c2) {	/* empty; ignore */
    311 					bp--;
    312 					i--;
    313 					continue;
    314 				}
    315 				while (c < c2) {
    316 					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
    317 						FATAL("out of space for character class [%.10s...] 2", p);
    318 					*bp++ = ++c;
    319 					i++;
    320 				}
    321 				continue;
    322 			}
    323 		}
    324 		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
    325 			FATAL("out of space for character class [%.10s...] 3", p);
    326 		*bp++ = c;
    327 		i++;
    328 	}
    329 	*bp = 0;
    330 	dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
    331 	xfree(op);
    332 	return (char *) tostring((char *) buf);
    333 }
    334 
    335 void overflo(const char *s)
    336 {
    337 	FATAL("regular expression too big: %.30s...", s);
    338 }
    339 
    340 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
    341 {
    342 	int i;
    343 	int *p;
    344 
    345 	switch (type(v)) {
    346 	ELEAF
    347 	LEAF
    348 		f->re[info(v)].ltype = type(v);
    349 		f->re[info(v)].lval.np = right(v);
    350 		while (f->accept >= maxsetvec) {	/* guessing here! */
    351 			maxsetvec *= 4;
    352 			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
    353 			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
    354 			if (setvec == 0 || tmpset == 0)
    355 				overflo("out of space in cfoll()");
    356 		}
    357 		for (i = 0; i <= f->accept; i++)
    358 			setvec[i] = 0;
    359 		setcnt = 0;
    360 		follow(v);	/* computes setvec and setcnt */
    361 		if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
    362 			overflo("out of space building follow set");
    363 		f->re[info(v)].lfollow = p;
    364 		*p = setcnt;
    365 		for (i = f->accept; i >= 0; i--)
    366 			if (setvec[i] == 1)
    367 				*++p = i;
    368 		break;
    369 	UNARY
    370 		cfoll(f,left(v));
    371 		break;
    372 	case CAT:
    373 	case OR:
    374 		cfoll(f,left(v));
    375 		cfoll(f,right(v));
    376 		break;
    377 	default:	/* can't happen */
    378 		FATAL("can't happen: unknown type %d in cfoll", type(v));
    379 	}
    380 }
    381 
    382 int first(Node *p)	/* collects initially active leaves of p into setvec */
    383 			/* returns 0 if p matches empty string */
    384 {
    385 	int b, lp;
    386 
    387 	switch (type(p)) {
    388 	ELEAF
    389 	LEAF
    390 		lp = info(p);	/* look for high-water mark of subscripts */
    391 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
    392 			maxsetvec *= 4;
    393 			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
    394 			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
    395 			if (setvec == 0 || tmpset == 0)
    396 				overflo("out of space in first()");
    397 		}
    398 		if (type(p) == EMPTYRE) {
    399 			setvec[lp] = 0;
    400 			return(0);
    401 		}
    402 		if (setvec[lp] != 1) {
    403 			setvec[lp] = 1;
    404 			setcnt++;
    405 		}
    406 		if (type(p) == CCL && (*(char *) right(p)) == '\0')
    407 			return(0);		/* empty CCL */
    408 		else return(1);
    409 	case PLUS:
    410 		if (first(left(p)) == 0) return(0);
    411 		return(1);
    412 	case STAR:
    413 	case QUEST:
    414 		first(left(p));
    415 		return(0);
    416 	case CAT:
    417 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
    418 		return(1);
    419 	case OR:
    420 		b = first(right(p));
    421 		if (first(left(p)) == 0 || b == 0) return(0);
    422 		return(1);
    423 	}
    424 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
    425 	return(-1);
    426 }
    427 
    428 void follow(Node *v)	/* collects leaves that can follow v into setvec */
    429 {
    430 	Node *p;
    431 
    432 	if (type(v) == FINAL)
    433 		return;
    434 	p = parent(v);
    435 	switch (type(p)) {
    436 	case STAR:
    437 	case PLUS:
    438 		first(v);
    439 		follow(p);
    440 		return;
    441 
    442 	case OR:
    443 	case QUEST:
    444 		follow(p);
    445 		return;
    446 
    447 	case CAT:
    448 		if (v == left(p)) {	/* v is left child of p */
    449 			if (first(right(p)) == 0) {
    450 				follow(p);
    451 				return;
    452 			}
    453 		} else		/* v is right child */
    454 			follow(p);
    455 		return;
    456 	}
    457 }
    458 
    459 int member(int c, const char *sarg)	/* is c in s? */
    460 {
    461 	uschar *s = (uschar *) sarg;
    462 
    463 	while (*s)
    464 		if (c == *s++)
    465 			return(1);
    466 	return(0);
    467 }
    468 
    469 int match(fa *f, const char *p0)	/* shortest match ? */
    470 {
    471 	int s, ns;
    472 	uschar *p = (uschar *) p0;
    473 
    474 	s = f->reset ? makeinit(f,0) : f->initstat;
    475 	if (f->out[s])
    476 		return(1);
    477 	do {
    478 		/* assert(*p < NCHARS); */
    479 		if ((ns = f->gototab[s][*p]) != 0)
    480 			s = ns;
    481 		else
    482 			s = cgoto(f, s, *p);
    483 		if (f->out[s])
    484 			return(1);
    485 	} while (*p++ != 0);
    486 	return(0);
    487 }
    488 
    489 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
    490 {
    491 	int s, ns;
    492 	uschar *p = (uschar *) p0;
    493 	uschar *q;
    494 	int i, k;
    495 
    496 	/* s = f->reset ? makeinit(f,1) : f->initstat; */
    497 	if (f->reset) {
    498 		f->initstat = s = makeinit(f,1);
    499 	} else {
    500 		s = f->initstat;
    501 	}
    502 	patbeg = (char *) p;
    503 	patlen = -1;
    504 	do {
    505 		q = p;
    506 		do {
    507 			if (f->out[s])		/* final state */
    508 				patlen = q-p;
    509 			/* assert(*q < NCHARS); */
    510 			if ((ns = f->gototab[s][*q]) != 0)
    511 				s = ns;
    512 			else
    513 				s = cgoto(f, s, *q);
    514 			if (s == 1) {	/* no transition */
    515 				if (patlen >= 0) {
    516 					patbeg = (char *) p;
    517 					return(1);
    518 				}
    519 				else
    520 					goto nextin;	/* no match */
    521 			}
    522 		} while (*q++ != 0);
    523 		if (f->out[s])
    524 			patlen = q-p-1;	/* don't count $ */
    525 		if (patlen >= 0) {
    526 			patbeg = (char *) p;
    527 			return(1);
    528 		}
    529 	nextin:
    530 		s = 2;
    531 		if (f->reset) {
    532 			for (i = 2; i <= f->curstat; i++)
    533 				xfree(f->posns[i]);
    534 			k = *f->posns[0];
    535 			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
    536 				overflo("out of space in pmatch");
    537 			for (i = 0; i <= k; i++)
    538 				(f->posns[2])[i] = (f->posns[0])[i];
    539 			f->initstat = f->curstat = 2;
    540 			f->out[2] = f->out[0];
    541 			for (i = 0; i < NCHARS; i++)
    542 				f->gototab[2][i] = 0;
    543 		}
    544 	} while (*p++ != 0);
    545 	return (0);
    546 }
    547 
    548 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
    549 {
    550 	int s, ns;
    551 	uschar *p = (uschar *) p0;
    552 	uschar *q;
    553 	int i, k;
    554 
    555 	/* s = f->reset ? makeinit(f,1) : f->initstat; */
    556 	if (f->reset) {
    557 		f->initstat = s = makeinit(f,1);
    558 	} else {
    559 		s = f->initstat;
    560 	}
    561 	patlen = -1;
    562 	while (*p) {
    563 		q = p;
    564 		do {
    565 			if (f->out[s])		/* final state */
    566 				patlen = q-p;
    567 			/* assert(*q < NCHARS); */
    568 			if ((ns = f->gototab[s][*q]) != 0)
    569 				s = ns;
    570 			else
    571 				s = cgoto(f, s, *q);
    572 			if (s == 1) {	/* no transition */
    573 				if (patlen > 0) {
    574 					patbeg = (char *) p;
    575 					return(1);
    576 				} else
    577 					goto nnextin;	/* no nonempty match */
    578 			}
    579 		} while (*q++ != 0);
    580 		if (f->out[s])
    581 			patlen = q-p-1;	/* don't count $ */
    582 		if (patlen > 0 ) {
    583 			patbeg = (char *) p;
    584 			return(1);
    585 		}
    586 	nnextin:
    587 		s = 2;
    588 		if (f->reset) {
    589 			for (i = 2; i <= f->curstat; i++)
    590 				xfree(f->posns[i]);
    591 			k = *f->posns[0];
    592 			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
    593 				overflo("out of state space");
    594 			for (i = 0; i <= k; i++)
    595 				(f->posns[2])[i] = (f->posns[0])[i];
    596 			f->initstat = f->curstat = 2;
    597 			f->out[2] = f->out[0];
    598 			for (i = 0; i < NCHARS; i++)
    599 				f->gototab[2][i] = 0;
    600 		}
    601 		p++;
    602 	}
    603 	return (0);
    604 }
    605 
    606 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
    607 {			/* uses relex() to scan regular expression */
    608 	Node *np;
    609 
    610 	dprintf( ("reparse <%s>\n", p) );
    611 	lastre = prestr = (uschar *) p;	/* prestr points to string to be parsed */
    612 	rtok = relex();
    613 	/* GNU compatibility: an empty regexp matches anything */
    614 	if (rtok == '\0') {
    615 		/* FATAL("empty regular expression"); previous */
    616 		return(op2(EMPTYRE, NIL, NIL));
    617 	}
    618 	np = regexp();
    619 	if (rtok != '\0')
    620 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
    621 	return(np);
    622 }
    623 
    624 Node *regexp(void)	/* top-level parse of reg expr */
    625 {
    626 	return (alt(concat(primary())));
    627 }
    628 
    629 Node *primary(void)
    630 {
    631 	Node *np;
    632 
    633 	switch (rtok) {
    634 	case CHAR:
    635 		np = op2(CHAR, NIL, itonp(rlxval));
    636 		rtok = relex();
    637 		return (unary(np));
    638 	case ALL:
    639 		rtok = relex();
    640 		return (unary(op2(ALL, NIL, NIL)));
    641 	case EMPTYRE:
    642 		rtok = relex();
    643 		return (unary(op2(ALL, NIL, NIL)));
    644 	case DOT:
    645 		rtok = relex();
    646 		return (unary(op2(DOT, NIL, NIL)));
    647 	case CCL:
    648 		np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
    649 		rtok = relex();
    650 		return (unary(np));
    651 	case NCCL:
    652 		np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
    653 		rtok = relex();
    654 		return (unary(np));
    655 	case '^':
    656 		rtok = relex();
    657 		return (unary(op2(CHAR, NIL, itonp(HAT))));
    658 	case '$':
    659 		rtok = relex();
    660 		return (unary(op2(CHAR, NIL, NIL)));
    661 	case '(':
    662 		rtok = relex();
    663 		if (rtok == ')') {	/* special pleading for () */
    664 			rtok = relex();
    665 			return unary(op2(CCL, NIL, (Node *) tostring("")));
    666 		}
    667 		np = regexp();
    668 		if (rtok == ')') {
    669 			rtok = relex();
    670 			return (unary(np));
    671 		}
    672 		else
    673 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
    674 	default:
    675 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
    676 	}
    677 	return 0;	/*NOTREACHED*/
    678 }
    679 
    680 Node *concat(Node *np)
    681 {
    682 	switch (rtok) {
    683 	case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
    684 		return (concat(op2(CAT, np, primary())));
    685 	}
    686 	return (np);
    687 }
    688 
    689 Node *alt(Node *np)
    690 {
    691 	if (rtok == OR) {
    692 		rtok = relex();
    693 		return (alt(op2(OR, np, concat(primary()))));
    694 	}
    695 	return (np);
    696 }
    697 
    698 Node *unary(Node *np)
    699 {
    700 	switch (rtok) {
    701 	case STAR:
    702 		rtok = relex();
    703 		return (unary(op2(STAR, np, NIL)));
    704 	case PLUS:
    705 		rtok = relex();
    706 		return (unary(op2(PLUS, np, NIL)));
    707 	case QUEST:
    708 		rtok = relex();
    709 		return (unary(op2(QUEST, np, NIL)));
    710 	default:
    711 		return (np);
    712 	}
    713 }
    714 
    715 /*
    716  * Character class definitions conformant to the POSIX locale as
    717  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
    718  * and operating character sets are both ASCII (ISO646) or supersets
    719  * thereof.
    720  *
    721  * Note that to avoid overflowing the temporary buffer used in
    722  * relex(), the expanded character class (prior to range expansion)
    723  * must be less than twice the size of their full name.
    724  */
    725 
    726 /* Because isblank doesn't show up in any of the header files on any
    727  * system i use, it's defined here.  if some other locale has a richer
    728  * definition of "blank", define HAS_ISBLANK and provide your own
    729  * version.
    730  * the parentheses here are an attempt to find a path through the maze
    731  * of macro definition and/or function and/or version provided.  thanks
    732  * to nelson beebe for the suggestion; let's see if it works everywhere.
    733  */
    734 
    735 /* #define HAS_ISBLANK */
    736 #ifndef HAS_ISBLANK
    737 
    738 int (xisblank)(int c)
    739 {
    740 	return c==' ' || c=='\t';
    741 }
    742 
    743 #endif
    744 
    745 struct charclass {
    746 	const char *cc_name;
    747 	int cc_namelen;
    748 	int (*cc_func)(int);
    749 } charclasses[] = {
    750 	{ "alnum",	5,	isalnum },
    751 	{ "alpha",	5,	isalpha },
    752 #ifndef HAS_ISBLANK
    753 	{ "blank",	5,	xisblank },
    754 #else
    755 	{ "blank",	5,	isblank },
    756 #endif
    757 	{ "cntrl",	5,	iscntrl },
    758 	{ "digit",	5,	isdigit },
    759 	{ "graph",	5,	isgraph },
    760 	{ "lower",	5,	islower },
    761 	{ "print",	5,	isprint },
    762 	{ "punct",	5,	ispunct },
    763 	{ "space",	5,	isspace },
    764 	{ "upper",	5,	isupper },
    765 	{ "xdigit",	6,	isxdigit },
    766 	{ NULL,		0,	NULL },
    767 };
    768 
    769 
    770 int relex(void)		/* lexical analyzer for reparse */
    771 {
    772 	int c, n;
    773 	int cflag;
    774 	static uschar *buf = 0;
    775 	static int bufsz = 100;
    776 	uschar *bp;
    777 	struct charclass *cc;
    778 	int i;
    779 
    780 	switch (c = *prestr++) {
    781 	case '|': return OR;
    782 	case '*': return STAR;
    783 	case '+': return PLUS;
    784 	case '?': return QUEST;
    785 	case '.': return DOT;
    786 	case '\0': prestr--; return '\0';
    787 	case '^':
    788 	case '$':
    789 	case '(':
    790 	case ')':
    791 		return c;
    792 	case '\\':
    793 		rlxval = quoted(&prestr);
    794 		return CHAR;
    795 	default:
    796 		rlxval = c;
    797 		return CHAR;
    798 	case '[':
    799 		if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
    800 			FATAL("out of space in reg expr %.10s..", lastre);
    801 		bp = buf;
    802 		if (*prestr == '^') {
    803 			cflag = 1;
    804 			prestr++;
    805 		}
    806 		else
    807 			cflag = 0;
    808 		n = 2 * strlen((const char *) prestr)+1;
    809 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
    810 			FATAL("out of space for reg expr %.10s...", lastre);
    811 		for (; ; ) {
    812 			if ((c = *prestr++) == '\\') {
    813 				*bp++ = '\\';
    814 				if ((c = *prestr++) == '\0')
    815 					FATAL("nonterminated character class %.20s...", lastre);
    816 				*bp++ = c;
    817 			/* } else if (c == '\n') { */
    818 			/* 	FATAL("newline in character class %.20s...", lastre); */
    819 			} else if (c == '[' && *prestr == ':') {
    820 				/* POSIX char class names, Dag-Erling Smorgrav, des (at) ofug.org */
    821 				for (cc = charclasses; cc->cc_name; cc++)
    822 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
    823 						break;
    824 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
    825 				    prestr[2 + cc->cc_namelen] == ']') {
    826 					prestr += cc->cc_namelen + 3;
    827 					/*
    828 					 * BUG: We begin at 1, instead of 0, since we
    829 					 * would otherwise prematurely terminate the
    830 					 * string for classes like [[:cntrl:]]. This
    831 					 * means that we can't match the NUL character,
    832 					 * not without first adapting the entire
    833 					 * program to track each string's length.
    834 					 */
    835 					for (i = 1; i <= UCHAR_MAX; i++) {
    836 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
    837 						    FATAL("out of space for reg expr %.10s...", lastre);
    838 						if (cc->cc_func(i)) {
    839 							*bp++ = i;
    840 							n++;
    841 						}
    842 					}
    843 				} else
    844 					*bp++ = c;
    845 			} else if (c == '\0') {
    846 				FATAL("nonterminated character class %.20s", lastre);
    847 			} else if (bp == buf) {	/* 1st char is special */
    848 				*bp++ = c;
    849 			} else if (c == ']') {
    850 				*bp++ = 0;
    851 				rlxstr = (uschar *) tostring((char *) buf);
    852 				if (cflag == 0)
    853 					return CCL;
    854 				else
    855 					return NCCL;
    856 			} else
    857 				*bp++ = c;
    858 		}
    859 	}
    860 }
    861 
    862 int cgoto(fa *f, int s, int c)
    863 {
    864 	int i, j, k;
    865 	int *p, *q;
    866 
    867 	assert(c == HAT || c < NCHARS);
    868 	while (f->accept >= maxsetvec) {	/* guessing here! */
    869 		maxsetvec *= 4;
    870 		setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
    871 		tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
    872 		if (setvec == 0 || tmpset == 0)
    873 			overflo("out of space in cgoto()");
    874 	}
    875 	for (i = 0; i <= f->accept; i++)
    876 		setvec[i] = 0;
    877 	setcnt = 0;
    878 	/* compute positions of gototab[s,c] into setvec */
    879 	p = f->posns[s];
    880 	for (i = 1; i <= *p; i++) {
    881 		if ((k = f->re[p[i]].ltype) != FINAL) {
    882 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
    883 			 || (k == DOT && c != 0 && c != HAT)
    884 			 || (k == ALL && c != 0)
    885 			 || (k == EMPTYRE && c != 0)
    886 			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
    887 			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
    888 				q = f->re[p[i]].lfollow;
    889 				for (j = 1; j <= *q; j++) {
    890 					if (q[j] >= maxsetvec) {
    891 						maxsetvec *= 4;
    892 						setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
    893 						tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
    894 						if (setvec == 0 || tmpset == 0)
    895 							overflo("cgoto overflow");
    896 					}
    897 					if (setvec[q[j]] == 0) {
    898 						setcnt++;
    899 						setvec[q[j]] = 1;
    900 					}
    901 				}
    902 			}
    903 		}
    904 	}
    905 	/* determine if setvec is a previous state */
    906 	tmpset[0] = setcnt;
    907 	j = 1;
    908 	for (i = f->accept; i >= 0; i--)
    909 		if (setvec[i]) {
    910 			tmpset[j++] = i;
    911 		}
    912 	/* tmpset == previous state? */
    913 	for (i = 1; i <= f->curstat; i++) {
    914 		p = f->posns[i];
    915 		if ((k = tmpset[0]) != p[0])
    916 			goto different;
    917 		for (j = 1; j <= k; j++)
    918 			if (tmpset[j] != p[j])
    919 				goto different;
    920 		/* setvec is state i */
    921 		f->gototab[s][c] = i;
    922 		return i;
    923 	  different:;
    924 	}
    925 
    926 	/* add tmpset to current set of states */
    927 	if (f->curstat >= NSTATES-1) {
    928 		f->curstat = 2;
    929 		f->reset = 1;
    930 		for (i = 2; i < NSTATES; i++)
    931 			xfree(f->posns[i]);
    932 	} else
    933 		++(f->curstat);
    934 	for (i = 0; i < NCHARS; i++)
    935 		f->gototab[f->curstat][i] = 0;
    936 	xfree(f->posns[f->curstat]);
    937 	if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
    938 		overflo("out of space in cgoto");
    939 
    940 	f->posns[f->curstat] = p;
    941 	f->gototab[s][c] = f->curstat;
    942 	for (i = 0; i <= setcnt; i++)
    943 		p[i] = tmpset[i];
    944 	if (setvec[f->accept])
    945 		f->out[f->curstat] = 1;
    946 	else
    947 		f->out[f->curstat] = 0;
    948 	return f->curstat;
    949 }
    950 
    951 
    952 void freefa(fa *f)	/* free a finite automaton */
    953 {
    954 	int i;
    955 
    956 	if (f == NULL)
    957 		return;
    958 	for (i = 0; i <= f->curstat; i++)
    959 		xfree(f->posns[i]);
    960 	for (i = 0; i <= f->accept; i++) {
    961 		xfree(f->re[i].lfollow);
    962 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
    963 			xfree((f->re[i].lval.np));
    964 	}
    965 	xfree(f->restr);
    966 	xfree(f);
    967 }
    968