      1 /*
      2  * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
      3  *	The Regents of the University of California.  All rights reserved.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that: (1) source code distributions
      7  * retain the above copyright notice and this paragraph in its entirety, (2)
      8  * distributions including binary code include the above copyright notice and
      9  * this paragraph in its entirety in the documentation or other materials
     10  * provided with the distribution, and (3) all advertising materials mentioning
     11  * features or use of this software display the following acknowledgement:
     12  * ``This product includes software developed by the University of California,
     13  * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
     14  * the University nor the names of its contributors may be used to endorse
     15  * or promote products derived from this software without specific prior
     16  * written permission.
     17  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
     18  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
     19  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
     20  *
     21  *  Optimization module for BPF code intermediate representation.
     22  */
     23 
     24 #ifdef HAVE_CONFIG_H
     25 #include <config.h>
     26 #endif
     27 
     28 #include <pcap-types.h>
     29 
     30 #include <stdio.h>
     31 #include <stdlib.h>
     32 #include <memory.h>
     33 #include <string.h>
     34 
     35 #include <errno.h>
     36 
     37 #include "pcap-int.h"
     38 
     39 #include "gencode.h"
     40 #include "optimize.h"
     41 
     42 #ifdef HAVE_OS_PROTO_H
     43 #include "os-proto.h"
     44 #endif
     45 
     46 #ifdef BDEBUG
     47 /*
     48  * The internal "debug printout" flag for the filter expression optimizer.
     49  * The code to print that stuff is present only if BDEBUG is defined, so
     50  * the flag, and the routine to set it, are defined only if BDEBUG is
     51  * defined.
     52  */
     53 static int pcap_optimizer_debug;
     54 
     55 /*
     56  * Routine to set that flag.
     57  *
     58  * This is intended for libpcap developers, not for general use.
     59  * If you want to set these in a program, you'll have to declare this
     60  * routine yourself, with the appropriate DLL import attribute on Windows;
     61  * it's not declared in any header file, and won't be declared in any
     62  * header file provided by libpcap.
     63  */
     64 PCAP_API void pcap_set_optimizer_debug(int value);
     65 
     66 PCAP_API_DEF void
     67 pcap_set_optimizer_debug(int value)
     68 {
     69 	pcap_optimizer_debug = value;
     70 }
     71 
     72 /*
     73  * The internal "print dot graph" flag for the filter expression optimizer.
     74  * The code to print that stuff is present only if BDEBUG is defined, so
     75  * the flag, and the routine to set it, are defined only if BDEBUG is
     76  * defined.
     77  */
     78 static int pcap_print_dot_graph;
     79 
     80 /*
     81  * Routine to set that flag.
     82  *
     83  * This is intended for libpcap developers, not for general use.
     84  * If you want to set these in a program, you'll have to declare this
     85  * routine yourself, with the appropriate DLL import attribute on Windows;
     86  * it's not declared in any header file, and won't be declared in any
     87  * header file provided by libpcap.
     88  */
     89 PCAP_API void pcap_set_print_dot_graph(int value);
     90 
     91 PCAP_API_DEF void
     92 pcap_set_print_dot_graph(int value)
     93 {
     94 	pcap_print_dot_graph = value;
     95 }
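        /*
         * Illustrative usage sketch (not part of the libpcap API; assumes a
         * build with BDEBUG defined).  A developer program declares these
         * routines itself, as the comments above describe, and then calls:
         *
         *	pcap_set_optimizer_debug(2);
         *	pcap_set_print_dot_graph(1);
         *
         * Values greater than 1 for the optimizer debug flag trigger the
         * per-pass dumps in opt_loop() below.
         */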
     96 
     97 #endif
     98 
     99 /*
    100  * lowest_set_bit().
    101  *
    102  * Takes a 32-bit integer as an argument.
    103  *
    104  * If handed a non-zero value, returns the index of the lowest set bit,
    105  * counting upwards from zero.
    106  *
    107  * If handed zero, the results are platform- and compiler-dependent.
    108  * Keep it out of the light, don't give it any water, don't feed it
    109  * after midnight, and don't pass zero to it.
    110  *
    111  * This is the same as the count of trailing zeroes in the word.
    112  */
    113 #if PCAP_IS_AT_LEAST_GNUC_VERSION(3,4)
    114   /*
    115    * GCC 3.4 and later; we have __builtin_ctz().
    116    */
    117   #define lowest_set_bit(mask) __builtin_ctz(mask)
    118 #elif defined(_MSC_VER)
    119   /*
    120    * Visual Studio; we support only 2005 and later, so use
    121    * _BitScanForward().
    122    */
    123 #include <intrin.h>
    124 
    125 #ifndef __clang__
    126 #pragma intrinsic(_BitScanForward)
    127 #endif
    128 
    129 static __forceinline int
    130 lowest_set_bit(int mask)
    131 {
    132 	unsigned long bit;
    133 
    134 	/*
    135 	 * Don't sign-extend mask if long is longer than int.
    136 	 * (It's currently not, in MSVC, even on 64-bit platforms, but....)
    137 	 */
    138 	if (_BitScanForward(&bit, (unsigned int)mask) == 0)
    139 		return -1;	/* mask is zero */
    140 	return (int)bit;
    141 }
    142 #elif defined(MSDOS) && defined(__DJGPP__)
    143   /*
    144    * MS-DOS with DJGPP, which declares ffs() in <string.h>, which
    145    * we've already included.
    146    */
    147   #define lowest_set_bit(mask)	(ffs((mask)) - 1)
    148 #elif (defined(MSDOS) && defined(__WATCOMC__)) || defined(STRINGS_H_DECLARES_FFS)
    149   /*
    150    * MS-DOS with Watcom C, which has <strings.h> and declares ffs() there,
    151  * or some other platform (UN*X conforming to a sufficiently recent version
    152    * of the Single UNIX Specification).
    153    */
    154   #include <strings.h>
    155   #define lowest_set_bit(mask)	(ffs((mask)) - 1)
    156 #else
    157 /*
    158  * None of the above.
    159  * Use a perfect-hash-function-based function.
    160  */
    161 static int
    162 lowest_set_bit(int mask)
    163 {
    164 	unsigned int v = (unsigned int)mask;
    165 
    166 	static const int MultiplyDeBruijnBitPosition[32] = {
    167 		0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
    168 		31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
    169 	};
    170 
    171 	/*
    172 	 * We strip off all but the lowermost set bit (v & ~v),
    173 	 * and perform a minimal perfect hash on it to look up the
    174 	 * number of low-order zero bits in a table.
    175 	 *
    176 	 * See:
    177 	 *
    178 	 *	http://7ooo.mooo.com/text/ComputingTrailingZerosHOWTO.pdf
    179 	 *
    180 	 *	http://supertech.csail.mit.edu/papers/debruijn.pdf
    181 	 */
    182 	return (MultiplyDeBruijnBitPosition[((v & -v) * 0x077CB531U) >> 27]);
    183 }
    184 #endif
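        /*
         * Worked example (illustrative): for the value 0x28 (binary 101000)
         * the lowest set bit is bit 3, so each of the implementations above
         * yields
         *
         *	lowest_set_bit(0x28) == 3
         *
         * which is also the number of trailing zero bits in the word.
         */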
    185 
    186 /*
    187  * Represents a deleted instruction.
    188  */
    189 #define NOP -1
    190 
    191 /*
    192  * Register numbers for use-def values.
    193  * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
    194  * location.  A_ATOM is the accumulator and X_ATOM is the index
    195  * register.
    196  */
    197 #define A_ATOM BPF_MEMWORDS
    198 #define X_ATOM (BPF_MEMWORDS+1)
    199 
    200 /*
    201  * This define is used to represent *both* the accumulator and
    202  * x register in use-def computations.
    203  * Currently, the use-def code assumes only one definition per instruction.
    204  */
    205 #define AX_ATOM N_ATOMS
    206 
    207 /*
    208  * These data structures are used in a Cocke and Schwartz style
    209  * value numbering scheme.  Since the flowgraph is acyclic,
    210  * exit values can be propagated from a node's predecessors
    211  * provided it is uniquely defined.
    212  */
    213 struct valnode {
    214 	int code;
    215 	int v0, v1;
    216 	int val;
    217 	struct valnode *next;
    218 };
    219 
    220 /* Integer constants mapped with the load immediate opcode. */
    221 #define K(i) F(opt_state, BPF_LD|BPF_IMM|BPF_W, i, 0L)
    222 
    223 struct vmapinfo {
    224 	int is_const;
    225 	bpf_int32 const_val;
    226 };
    227 
    228 typedef struct {
    229 	/*
    230 	 * A flag to indicate that further optimization is needed.
    231 	 * Iterative passes are continued until a given pass yields no
    232 	 * branch movement.
    233 	 */
    234 	int done;
    235 
    236 	int n_blocks;
    237 	struct block **blocks;
    238 	int n_edges;
    239 	struct edge **edges;
    240 
    241 	/*
    242 	 * A bit vector set representation of the dominators.
    243 	 * We round up the set size to the next power of two.
    244 	 */
    245 	int nodewords;
    246 	int edgewords;
    247 	struct block **levels;
    248 	bpf_u_int32 *space;
    249 
    250 #define BITS_PER_WORD (8*sizeof(bpf_u_int32))
    251 /*
    252  * True if 'a' is in uset 'p'.
    253  */
    254 #define SET_MEMBER(p, a) \
    255 ((p)[(unsigned)(a) / BITS_PER_WORD] & (1 << ((unsigned)(a) % BITS_PER_WORD)))
    256 
    257 /*
    258  * Add 'a' to uset p.
    259  */
    260 #define SET_INSERT(p, a) \
    261 (p)[(unsigned)(a) / BITS_PER_WORD] |= (1 << ((unsigned)(a) % BITS_PER_WORD))
    262 
    263 /*
    264  * Delete 'a' from uset p.
    265  */
    266 #define SET_DELETE(p, a) \
    267 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~(1 << ((unsigned)(a) % BITS_PER_WORD))
    268 
    269 /*
    270  * a := a intersect b
    271  */
    272 #define SET_INTERSECT(a, b, n)\
    273 {\
    274 	register bpf_u_int32 *_x = a, *_y = b;\
    275 	register int _n = n;\
    276 	while (--_n >= 0) *_x++ &= *_y++;\
    277 }
    278 
    279 /*
    280  * a := a - b
    281  */
    282 #define SET_SUBTRACT(a, b, n)\
    283 {\
    284 	register bpf_u_int32 *_x = a, *_y = b;\
    285 	register int _n = n;\
    286 	while (--_n >= 0) *_x++ &=~ *_y++;\
    287 }
    288 
    289 /*
    290  * a := a union b
    291  */
    292 #define SET_UNION(a, b, n)\
    293 {\
    294 	register bpf_u_int32 *_x = a, *_y = b;\
    295 	register int _n = n;\
    296 	while (--_n >= 0) *_x++ |= *_y++;\
    297 }
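        /*
         * Usage sketch for the uset macros above (illustrative only).  With
         * BITS_PER_WORD == 32, a set spanning two words behaves like this:
         *
         *	bpf_u_int32 set[2] = { 0, 0 };
         *	SET_INSERT(set, 37);	sets bit 5 of set[1] (37 = 1*32 + 5)
         *	SET_MEMBER(set, 37)	is now non-zero
         *	SET_DELETE(set, 37);	clears that bit again
         */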
    298 
    299 	uset all_dom_sets;
    300 	uset all_closure_sets;
    301 	uset all_edge_sets;
    302 
    303 #define MODULUS 213
    304 	struct valnode *hashtbl[MODULUS];
    305 	int curval;
    306 	int maxval;
    307 
    308 	struct vmapinfo *vmap;
    309 	struct valnode *vnode_base;
    310 	struct valnode *next_vnode;
    311 } opt_state_t;
    312 
    313 typedef struct {
    314 	/*
    315 	 * Some pointers used to convert the basic block form of the code,
    316 	 * into the array form that BPF requires.  'fstart' will point to
    317 	 * the malloc'd array while 'ftail' is used during the recursive
    318 	 * traversal.
    319 	 */
    320 	struct bpf_insn *fstart;
    321 	struct bpf_insn *ftail;
    322 } conv_state_t;
    323 
    324 static void opt_init(compiler_state_t *, opt_state_t *, struct icode *);
    325 static void opt_cleanup(opt_state_t *);
    326 
    327 static void intern_blocks(opt_state_t *, struct icode *);
    328 
    329 static void find_inedges(opt_state_t *, struct block *);
    330 #ifdef BDEBUG
    331 static void opt_dump(compiler_state_t *, struct icode *);
    332 #endif
    333 
    334 #ifndef MAX
    335 #define MAX(a,b) ((a)>(b)?(a):(b))
    336 #endif
    337 
    338 static void
    339 find_levels_r(opt_state_t *opt_state, struct icode *ic, struct block *b)
    340 {
    341 	int level;
    342 
    343 	if (isMarked(ic, b))
    344 		return;
    345 
    346 	Mark(ic, b);
    347 	b->link = 0;
    348 
    349 	if (JT(b)) {
    350 		find_levels_r(opt_state, ic, JT(b));
    351 		find_levels_r(opt_state, ic, JF(b));
    352 		level = MAX(JT(b)->level, JF(b)->level) + 1;
    353 	} else
    354 		level = 0;
    355 	b->level = level;
    356 	b->link = opt_state->levels[level];
    357 	opt_state->levels[level] = b;
    358 }
    359 
    360 /*
    361  * Level graph.  The levels go from 0 at the leaves to
    362  * N_LEVELS at the root.  The opt_state->levels[] array points to the
    363  * first node of the level list, whose elements are linked
    364  * with the 'link' field of the struct block.
    365  */
    366 static void
    367 find_levels(opt_state_t *opt_state, struct icode *ic)
    368 {
    369 	memset((char *)opt_state->levels, 0, opt_state->n_blocks * sizeof(*opt_state->levels));
    370 	unMarkAll(ic);
    371 	find_levels_r(opt_state, ic, ic->root);
    372 }
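        /*
         * Example (illustrative): for a flowgraph that is a single conditional
         * block branching to two BPF_RET leaves, find_levels() puts both
         * leaves at level 0 and the root at level 1, linking them into
         * opt_state->levels[0] and opt_state->levels[1] respectively.
         */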
    373 
    374 /*
    375  * Find dominator relationships.
    376  * Assumes graph has been leveled.
    377  */
    378 static void
    379 find_dom(opt_state_t *opt_state, struct block *root)
    380 {
    381 	int i;
    382 	struct block *b;
    383 	bpf_u_int32 *x;
    384 
    385 	/*
    386 	 * Initialize sets to contain all nodes.
    387 	 */
    388 	x = opt_state->all_dom_sets;
    389 	i = opt_state->n_blocks * opt_state->nodewords;
    390 	while (--i >= 0)
    391 		*x++ = 0xFFFFFFFFU;
    392 	/* Root starts off empty. */
    393 	for (i = opt_state->nodewords; --i >= 0;)
    394 		root->dom[i] = 0;
    395 
    396 	/* root->level is the highest level number found. */
    397 	for (i = root->level; i >= 0; --i) {
    398 		for (b = opt_state->levels[i]; b; b = b->link) {
    399 			SET_INSERT(b->dom, b->id);
    400 			if (JT(b) == 0)
    401 				continue;
    402 			SET_INTERSECT(JT(b)->dom, b->dom, opt_state->nodewords);
    403 			SET_INTERSECT(JF(b)->dom, b->dom, opt_state->nodewords);
    404 		}
    405 	}
    406 }
    407 
    408 static void
    409 propedom(opt_state_t *opt_state, struct edge *ep)
    410 {
    411 	SET_INSERT(ep->edom, ep->id);
    412 	if (ep->succ) {
    413 		SET_INTERSECT(ep->succ->et.edom, ep->edom, opt_state->edgewords);
    414 		SET_INTERSECT(ep->succ->ef.edom, ep->edom, opt_state->edgewords);
    415 	}
    416 }
    417 
    418 /*
    419  * Compute edge dominators.
    420  * Assumes graph has been leveled and predecessors established.
    421  */
    422 static void
    423 find_edom(opt_state_t *opt_state, struct block *root)
    424 {
    425 	int i;
    426 	uset x;
    427 	struct block *b;
    428 
    429 	x = opt_state->all_edge_sets;
    430 	for (i = opt_state->n_edges * opt_state->edgewords; --i >= 0; )
    431 		x[i] = 0xFFFFFFFFU;
    432 
    433 	/* root->level is the highest level number found. */
    434 	memset(root->et.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
    435 	memset(root->ef.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
    436 	for (i = root->level; i >= 0; --i) {
    437 		for (b = opt_state->levels[i]; b != 0; b = b->link) {
    438 			propedom(opt_state, &b->et);
    439 			propedom(opt_state, &b->ef);
    440 		}
    441 	}
    442 }
    443 
    444 /*
    445  * Find the backwards transitive closure of the flow graph.  These sets
    446  * are backwards in the sense that we find the set of nodes that reach
    447  * a given node, not the set of nodes that can be reached by a node.
    448  *
    449  * Assumes graph has been leveled.
    450  */
    451 static void
    452 find_closure(opt_state_t *opt_state, struct block *root)
    453 {
    454 	int i;
    455 	struct block *b;
    456 
    457 	/*
    458 	 * Initialize sets to contain no nodes.
    459 	 */
    460 	memset((char *)opt_state->all_closure_sets, 0,
    461 	      opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->all_closure_sets));
    462 
    463 	/* root->level is the highest level number found. */
    464 	for (i = root->level; i >= 0; --i) {
    465 		for (b = opt_state->levels[i]; b; b = b->link) {
    466 			SET_INSERT(b->closure, b->id);
    467 			if (JT(b) == 0)
    468 				continue;
    469 			SET_UNION(JT(b)->closure, b->closure, opt_state->nodewords);
    470 			SET_UNION(JF(b)->closure, b->closure, opt_state->nodewords);
    471 		}
    472 	}
    473 }
    474 
    475 /*
    476  * Return the register number that is used by s.  If A and X are both
    477  * used, return AX_ATOM.  If no register is used, return -1.
    478  *
    479  * The implementation should probably change to an array access.
    480  */
    481 static int
    482 atomuse(struct stmt *s)
    483 {
    484 	register int c = s->code;
    485 
    486 	if (c == NOP)
    487 		return -1;
    488 
    489 	switch (BPF_CLASS(c)) {
    490 
    491 	case BPF_RET:
    492 		return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
    493 			(BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
    494 
    495 	case BPF_LD:
    496 	case BPF_LDX:
    497 		return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
    498 			(BPF_MODE(c) == BPF_MEM) ? s->k : -1;
    499 
    500 	case BPF_ST:
    501 		return A_ATOM;
    502 
    503 	case BPF_STX:
    504 		return X_ATOM;
    505 
    506 	case BPF_JMP:
    507 	case BPF_ALU:
    508 		if (BPF_SRC(c) == BPF_X)
    509 			return AX_ATOM;
    510 		return A_ATOM;
    511 
    512 	case BPF_MISC:
    513 		return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
    514 	}
    515 	abort();
    516 	/* NOTREACHED */
    517 }
    518 
    519 /*
    520  * Return the register number that is defined by 's'.  We assume that
    521  * a single stmt cannot define more than one register.  If no register
    522  * is defined, return -1.
    523  *
    524  * The implementation should probably change to an array access.
    525  */
    526 static int
    527 atomdef(struct stmt *s)
    528 {
    529 	if (s->code == NOP)
    530 		return -1;
    531 
    532 	switch (BPF_CLASS(s->code)) {
    533 
    534 	case BPF_LD:
    535 	case BPF_ALU:
    536 		return A_ATOM;
    537 
    538 	case BPF_LDX:
    539 		return X_ATOM;
    540 
    541 	case BPF_ST:
    542 	case BPF_STX:
    543 		return s->k;
    544 
    545 	case BPF_MISC:
    546 		return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
    547 	}
    548 	return -1;
    549 }
    550 
    551 /*
    552  * Compute the sets of registers used, defined, and killed by 'b'.
    553  *
    554  * "Used" means that a statement in 'b' uses the register before any
    555  * statement in 'b' defines it, i.e. it uses the value left in
    556  * that register by a predecessor block of this block.
    557  * "Defined" means that a statement in 'b' defines it.
    558  * "Killed" means that a statement in 'b' defines it before any
    559  * statement in 'b' uses it, i.e. it kills the value left in that
    560  * register by a predecessor block of this block.
    561  */
    562 static void
    563 compute_local_ud(struct block *b)
    564 {
    565 	struct slist *s;
    566 	atomset def = 0, use = 0, killed = 0;
    567 	int atom;
    568 
    569 	for (s = b->stmts; s; s = s->next) {
    570 		if (s->s.code == NOP)
    571 			continue;
    572 		atom = atomuse(&s->s);
    573 		if (atom >= 0) {
    574 			if (atom == AX_ATOM) {
    575 				if (!ATOMELEM(def, X_ATOM))
    576 					use |= ATOMMASK(X_ATOM);
    577 				if (!ATOMELEM(def, A_ATOM))
    578 					use |= ATOMMASK(A_ATOM);
    579 			}
    580 			else if (atom < N_ATOMS) {
    581 				if (!ATOMELEM(def, atom))
    582 					use |= ATOMMASK(atom);
    583 			}
    584 			else
    585 				abort();
    586 		}
    587 		atom = atomdef(&s->s);
    588 		if (atom >= 0) {
    589 			if (!ATOMELEM(use, atom))
    590 				killed |= ATOMMASK(atom);
    591 			def |= ATOMMASK(atom);
    592 		}
    593 	}
    594 	if (BPF_CLASS(b->s.code) == BPF_JMP) {
    595 		/*
    596 		 * XXX - what about RET?
    597 		 */
    598 		atom = atomuse(&b->s);
    599 		if (atom >= 0) {
    600 			if (atom == AX_ATOM) {
    601 				if (!ATOMELEM(def, X_ATOM))
    602 					use |= ATOMMASK(X_ATOM);
    603 				if (!ATOMELEM(def, A_ATOM))
    604 					use |= ATOMMASK(A_ATOM);
    605 			}
    606 			else if (atom < N_ATOMS) {
    607 				if (!ATOMELEM(def, atom))
    608 					use |= ATOMMASK(atom);
    609 			}
    610 			else
    611 				abort();
    612 		}
    613 	}
    614 
    615 	b->def = def;
    616 	b->kill = killed;
    617 	b->in_use = use;
    618 }
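        /*
         * Example (illustrative): for a block whose statement list is
         *
         *	tax
         *	ld [x+0]
         *
         * the tax uses A and defines X, and the indirect load then uses X and
         * defines A, so compute_local_ud() ends up with use = {A},
         * def = {A, X} and kill = {X}: X is defined before any use of it,
         * while A is used before it is defined.
         */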
    619 
    620 /*
    621  * Assume graph is already leveled.
    622  */
    623 static void
    624 find_ud(opt_state_t *opt_state, struct block *root)
    625 {
    626 	int i, maxlevel;
    627 	struct block *p;
    628 
    629 	/*
    630 	 * root->level is the highest level number found;
    631 	 * count down from there.
    632 	 */
    633 	maxlevel = root->level;
    634 	for (i = maxlevel; i >= 0; --i)
    635 		for (p = opt_state->levels[i]; p; p = p->link) {
    636 			compute_local_ud(p);
    637 			p->out_use = 0;
    638 		}
    639 
    640 	for (i = 1; i <= maxlevel; ++i) {
    641 		for (p = opt_state->levels[i]; p; p = p->link) {
    642 			p->out_use |= JT(p)->in_use | JF(p)->in_use;
    643 			p->in_use |= p->out_use &~ p->kill;
    644 		}
    645 	}
    646 }
    647 static void
    648 init_val(opt_state_t *opt_state)
    649 {
    650 	opt_state->curval = 0;
    651 	opt_state->next_vnode = opt_state->vnode_base;
    652 	memset((char *)opt_state->vmap, 0, opt_state->maxval * sizeof(*opt_state->vmap));
    653 	memset((char *)opt_state->hashtbl, 0, sizeof opt_state->hashtbl);
    654 }
    655 
    656 /* Because we really don't have an IR, this stuff is a little messy. */
    657 static int
    658 F(opt_state_t *opt_state, int code, int v0, int v1)
    659 {
    660 	u_int hash;
    661 	int val;
    662 	struct valnode *p;
    663 
    664 	hash = (u_int)code ^ ((u_int)v0 << 4) ^ ((u_int)v1 << 8);
    665 	hash %= MODULUS;
    666 
    667 	for (p = opt_state->hashtbl[hash]; p; p = p->next)
    668 		if (p->code == code && p->v0 == v0 && p->v1 == v1)
    669 			return p->val;
    670 
    671 	val = ++opt_state->curval;
    672 	if (BPF_MODE(code) == BPF_IMM &&
    673 	    (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
    674 		opt_state->vmap[val].const_val = v0;
    675 		opt_state->vmap[val].is_const = 1;
    676 	}
    677 	p = opt_state->next_vnode++;
    678 	p->val = val;
    679 	p->code = code;
    680 	p->v0 = v0;
    681 	p->v1 = v1;
    682 	p->next = opt_state->hashtbl[hash];
    683 	opt_state->hashtbl[hash] = p;
    684 
    685 	return val;
    686 }
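        /*
         * Illustrative note: two structurally identical expressions hash to
         * the same entry, so calling F(opt_state, BPF_LD|BPF_IMM|BPF_W, 2, 0L)
         * for two separate "ld #2" statements returns the same value number;
         * that is what later lets vstore() turn a redundant load into a NOP.
         * The K() macro above is exactly this call for integer constants.
         */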
    687 
    688 static inline void
    689 vstore(struct stmt *s, int *valp, int newval, int alter)
    690 {
    691 	if (alter && newval != VAL_UNKNOWN && *valp == newval)
    692 		s->code = NOP;
    693 	else
    694 		*valp = newval;
    695 }
    696 
    697 /*
    698  * Do constant-folding on binary operators.
    699  * (Unary operators are handled elsewhere.)
    700  */
    701 static void
    702 fold_op(compiler_state_t *cstate, opt_state_t *opt_state,
    703     struct stmt *s, int v0, int v1)
    704 {
    705 	bpf_u_int32 a, b;
    706 
    707 	a = opt_state->vmap[v0].const_val;
    708 	b = opt_state->vmap[v1].const_val;
    709 
    710 	switch (BPF_OP(s->code)) {
    711 	case BPF_ADD:
    712 		a += b;
    713 		break;
    714 
    715 	case BPF_SUB:
    716 		a -= b;
    717 		break;
    718 
    719 	case BPF_MUL:
    720 		a *= b;
    721 		break;
    722 
    723 	case BPF_DIV:
    724 		if (b == 0)
    725 			bpf_error(cstate, "division by zero");
    726 		a /= b;
    727 		break;
    728 
    729 	case BPF_MOD:
    730 		if (b == 0)
    731 			bpf_error(cstate, "modulus by zero");
    732 		a %= b;
    733 		break;
    734 
    735 	case BPF_AND:
    736 		a &= b;
    737 		break;
    738 
    739 	case BPF_OR:
    740 		a |= b;
    741 		break;
    742 
    743 	case BPF_XOR:
    744 		a ^= b;
    745 		break;
    746 
    747 	case BPF_LSH:
    748 		a <<= b;
    749 		break;
    750 
    751 	case BPF_RSH:
    752 		a >>= b;
    753 		break;
    754 
    755 	default:
    756 		abort();
    757 	}
    758 	s->k = a;
    759 	s->code = BPF_LD|BPF_IMM;
    760 	opt_state->done = 0;
    761 }
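        /*
         * Example (illustrative): if the accumulator's value number maps to
         * the known constant 6 and the statement is "div #2"
         * (BPF_ALU|BPF_DIV|BPF_K), fold_op() rewrites the statement to
         * "ld #3" (BPF_LD|BPF_IMM with k == 3) and clears opt_state->done so
         * that another optimization pass runs.
         */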
    762 
    763 static inline struct slist *
    764 this_op(struct slist *s)
    765 {
    766 	while (s != 0 && s->s.code == NOP)
    767 		s = s->next;
    768 	return s;
    769 }
    770 
    771 static void
    772 opt_not(struct block *b)
    773 {
    774 	struct block *tmp = JT(b);
    775 
    776 	JT(b) = JF(b);
    777 	JF(b) = tmp;
    778 }
    779 
    780 static void
    781 opt_peep(opt_state_t *opt_state, struct block *b)
    782 {
    783 	struct slist *s;
    784 	struct slist *next, *last;
    785 	int val;
    786 
    787 	s = b->stmts;
    788 	if (s == 0)
    789 		return;
    790 
    791 	last = s;
    792 	for (/*empty*/; /*empty*/; s = next) {
    793 		/*
    794 		 * Skip over nops.
    795 		 */
    796 		s = this_op(s);
    797 		if (s == 0)
    798 			break;	/* nothing left in the block */
    799 
    800 		/*
    801 		 * Find the next real instruction after that one
    802 		 * (skipping nops).
    803 		 */
    804 		next = this_op(s->next);
    805 		if (next == 0)
    806 			break;	/* no next instruction */
    807 		last = next;
    808 
    809 		/*
    810 		 * st  M[k]	-->	st  M[k]
    811 		 * ldx M[k]		tax
    812 		 */
    813 		if (s->s.code == BPF_ST &&
    814 		    next->s.code == (BPF_LDX|BPF_MEM) &&
    815 		    s->s.k == next->s.k) {
    816 			opt_state->done = 0;
    817 			next->s.code = BPF_MISC|BPF_TAX;
    818 		}
    819 		/*
    820 		 * ld  #k	-->	ldx  #k
    821 		 * tax			txa
    822 		 */
    823 		if (s->s.code == (BPF_LD|BPF_IMM) &&
    824 		    next->s.code == (BPF_MISC|BPF_TAX)) {
    825 			s->s.code = BPF_LDX|BPF_IMM;
    826 			next->s.code = BPF_MISC|BPF_TXA;
    827 			opt_state->done = 0;
    828 		}
    829 		/*
    830 		 * This is an ugly special case, but it happens
    831 		 * when you say tcp[k] or udp[k] where k is a constant.
    832 		 */
    833 		if (s->s.code == (BPF_LD|BPF_IMM)) {
    834 			struct slist *add, *tax, *ild;
    835 
    836 			/*
    837 			 * Check that X isn't used on exit from this
    838 			 * block (which the optimizer might cause).
    839 			 * We know the code generator won't generate
    840 			 * any local dependencies.
    841 			 */
    842 			if (ATOMELEM(b->out_use, X_ATOM))
    843 				continue;
    844 
    845 			/*
    846 			 * Check that the instruction following the ldi
    847 			 * is an addx, or it's an ldxms with an addx
    848 			 * following it (with 0 or more nops between the
    849 			 * ldxms and addx).
    850 			 */
    851 			if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
    852 				add = next;
    853 			else
    854 				add = this_op(next->next);
    855 			if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
    856 				continue;
    857 
    858 			/*
    859 			 * Check that a tax follows that (with 0 or more
    860 			 * nops between them).
    861 			 */
    862 			tax = this_op(add->next);
    863 			if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
    864 				continue;
    865 
    866 			/*
    867 			 * Check that an ild follows that (with 0 or more
    868 			 * nops between them).
    869 			 */
    870 			ild = this_op(tax->next);
    871 			if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
    872 			    BPF_MODE(ild->s.code) != BPF_IND)
    873 				continue;
    874 			/*
    875 			 * We want to turn this sequence:
    876 			 *
    877 			 * (004) ldi     #0x2		{s}
    878 			 * (005) ldxms   [14]		{next}  -- optional
    879 			 * (006) addx			{add}
    880 			 * (007) tax			{tax}
    881 			 * (008) ild     [x+0]		{ild}
    882 			 *
    883 			 * into this sequence:
    884 			 *
    885 			 * (004) nop
    886 			 * (005) ldxms   [14]
    887 			 * (006) nop
    888 			 * (007) nop
    889 			 * (008) ild     [x+2]
    890 			 *
    891 			 * XXX We need to check that X is not
    892 			 * subsequently used, because we want to change
    893 			 * what'll be in it after this sequence.
    894 			 *
    895 			 * We know we can eliminate the accumulator
    896 			 * modifications earlier in the sequence since
    897 			 * it is defined by the last stmt of this sequence
    898 			 * (i.e., the last statement of the sequence loads
    899 			 * a value into the accumulator, so we can eliminate
    900 			 * earlier operations on the accumulator).
    901 			 */
    902 			ild->s.k += s->s.k;
    903 			s->s.code = NOP;
    904 			add->s.code = NOP;
    905 			tax->s.code = NOP;
    906 			opt_state->done = 0;
    907 		}
    908 	}
    909 	/*
    910 	 * If the comparison at the end of a block is an equality
    911 	 * comparison against a constant, and nobody uses the value
    912 	 * we leave in the A register at the end of a block, and
    913 	 * the operation preceding the comparison is an arithmetic
    914 	 * operation, we can sometimes optimize it away.
    915 	 */
    916 	if (b->s.code == (BPF_JMP|BPF_JEQ|BPF_K) &&
    917 	    !ATOMELEM(b->out_use, A_ATOM)) {
    918 	    	/*
    919 	    	 * We can optimize away certain subtractions of the
    920 	    	 * X register.
    921 	    	 */
    922 		if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X)) {
    923 			val = b->val[X_ATOM];
    924 			if (opt_state->vmap[val].is_const) {
    925 				/*
    926 				 * If we have a subtract to do a comparison,
    927 				 * and the X register is a known constant,
    928 				 * we can merge this value into the
    929 				 * comparison:
    930 				 *
    931 				 * sub x  ->	nop
    932 				 * jeq #y	jeq #(x+y)
    933 				 */
    934 				b->s.k += opt_state->vmap[val].const_val;
    935 				last->s.code = NOP;
    936 				opt_state->done = 0;
    937 			} else if (b->s.k == 0) {
    938 				/*
    939 				 * If the X register isn't a constant,
    940 				 * and the comparison in the test is
    941 				 * against 0, we can compare with the
    942 				 * X register, instead:
    943 				 *
    944 				 * sub x  ->	nop
    945 				 * jeq #0	jeq x
    946 				 */
    947 				last->s.code = NOP;
    948 				b->s.code = BPF_JMP|BPF_JEQ|BPF_X;
    949 				opt_state->done = 0;
    950 			}
    951 		}
    952 		/*
    953 		 * Likewise, a constant subtract can be simplified:
    954 		 *
    955 		 * sub #x ->	nop
    956 		 * jeq #y ->	jeq #(x+y)
    957 		 */
    958 		else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K)) {
    959 			last->s.code = NOP;
    960 			b->s.k += last->s.k;
    961 			opt_state->done = 0;
    962 		}
    963 		/*
    964 		 * And, similarly, a constant AND can be simplified
    965 		 * if we're testing against 0, i.e.:
    966 		 *
    967 		 * and #k	nop
    968 		 * jeq #0  ->	jset #k
    969 		 */
    970 		else if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
    971 		    b->s.k == 0) {
    972 			b->s.k = last->s.k;
    973 			b->s.code = BPF_JMP|BPF_K|BPF_JSET;
    974 			last->s.code = NOP;
    975 			opt_state->done = 0;
    976 			opt_not(b);
    977 		}
    978 	}
    979 	/*
    980 	 * jset #0        ->   never
    981 	 * jset #ffffffff ->   always
    982 	 */
    983 	if (b->s.code == (BPF_JMP|BPF_K|BPF_JSET)) {
    984 		if (b->s.k == 0)
    985 			JT(b) = JF(b);
    986 		if ((u_int)b->s.k == 0xffffffffU)
    987 			JF(b) = JT(b);
    988 	}
    989 	/*
    990 	 * If we're comparing against the index register, and the index
    991 	 * register is a known constant, we can just compare against that
    992 	 * constant.
    993 	 */
    994 	val = b->val[X_ATOM];
    995 	if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_X) {
    996 		bpf_int32 v = opt_state->vmap[val].const_val;
    997 		b->s.code &= ~BPF_X;
    998 		b->s.k = v;
    999 	}
   1000 	/*
   1001 	 * If the accumulator is a known constant, we can compute the
   1002 	 * comparison result.
   1003 	 */
   1004 	val = b->val[A_ATOM];
   1005 	if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
   1006 		bpf_int32 v = opt_state->vmap[val].const_val;
   1007 		switch (BPF_OP(b->s.code)) {
   1008 
   1009 		case BPF_JEQ:
   1010 			v = v == b->s.k;
   1011 			break;
   1012 
   1013 		case BPF_JGT:
   1014 			v = (unsigned)v > (unsigned)b->s.k;
   1015 			break;
   1016 
   1017 		case BPF_JGE:
   1018 			v = (unsigned)v >= (unsigned)b->s.k;
   1019 			break;
   1020 
   1021 		case BPF_JSET:
   1022 			v &= b->s.k;
   1023 			break;
   1024 
   1025 		default:
   1026 			abort();
   1027 		}
   1028 		if (JF(b) != JT(b))
   1029 			opt_state->done = 0;
   1030 		if (v)
   1031 			JF(b) = JT(b);
   1032 		else
   1033 			JT(b) = JF(b);
   1034 	}
   1035 }
   1036 
   1037 /*
   1038  * Compute the symbolic value of expression 's', and update
   1039  * anything it defines in the value table 'val'.  If 'alter' is true,
   1040  * do various optimizations.  This code would be cleaner if symbolic
   1041  * evaluation and code transformations weren't folded together.
   1042  */
   1043 static void
   1044 opt_stmt(compiler_state_t *cstate, opt_state_t *opt_state,
   1045     struct stmt *s, int val[], int alter)
   1046 {
   1047 	int op;
   1048 	int v;
   1049 
   1050 	switch (s->code) {
   1051 
   1052 	case BPF_LD|BPF_ABS|BPF_W:
   1053 	case BPF_LD|BPF_ABS|BPF_H:
   1054 	case BPF_LD|BPF_ABS|BPF_B:
   1055 		v = F(opt_state, s->code, s->k, 0L);
   1056 		vstore(s, &val[A_ATOM], v, alter);
   1057 		break;
   1058 
   1059 	case BPF_LD|BPF_IND|BPF_W:
   1060 	case BPF_LD|BPF_IND|BPF_H:
   1061 	case BPF_LD|BPF_IND|BPF_B:
   1062 		v = val[X_ATOM];
   1063 		if (alter && opt_state->vmap[v].is_const) {
   1064 			s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
   1065 			s->k += opt_state->vmap[v].const_val;
   1066 			v = F(opt_state, s->code, s->k, 0L);
   1067 			opt_state->done = 0;
   1068 		}
   1069 		else
   1070 			v = F(opt_state, s->code, s->k, v);
   1071 		vstore(s, &val[A_ATOM], v, alter);
   1072 		break;
   1073 
   1074 	case BPF_LD|BPF_LEN:
   1075 		v = F(opt_state, s->code, 0L, 0L);
   1076 		vstore(s, &val[A_ATOM], v, alter);
   1077 		break;
   1078 
   1079 	case BPF_LD|BPF_IMM:
   1080 		v = K(s->k);
   1081 		vstore(s, &val[A_ATOM], v, alter);
   1082 		break;
   1083 
   1084 	case BPF_LDX|BPF_IMM:
   1085 		v = K(s->k);
   1086 		vstore(s, &val[X_ATOM], v, alter);
   1087 		break;
   1088 
   1089 	case BPF_LDX|BPF_MSH|BPF_B:
   1090 		v = F(opt_state, s->code, s->k, 0L);
   1091 		vstore(s, &val[X_ATOM], v, alter);
   1092 		break;
   1093 
   1094 	case BPF_ALU|BPF_NEG:
   1095 		if (alter && opt_state->vmap[val[A_ATOM]].is_const) {
   1096 			s->code = BPF_LD|BPF_IMM;
   1097 			s->k = -opt_state->vmap[val[A_ATOM]].const_val;
   1098 			val[A_ATOM] = K(s->k);
   1099 		}
   1100 		else
   1101 			val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], 0L);
   1102 		break;
   1103 
   1104 	case BPF_ALU|BPF_ADD|BPF_K:
   1105 	case BPF_ALU|BPF_SUB|BPF_K:
   1106 	case BPF_ALU|BPF_MUL|BPF_K:
   1107 	case BPF_ALU|BPF_DIV|BPF_K:
   1108 	case BPF_ALU|BPF_MOD|BPF_K:
   1109 	case BPF_ALU|BPF_AND|BPF_K:
   1110 	case BPF_ALU|BPF_OR|BPF_K:
   1111 	case BPF_ALU|BPF_XOR|BPF_K:
   1112 	case BPF_ALU|BPF_LSH|BPF_K:
   1113 	case BPF_ALU|BPF_RSH|BPF_K:
   1114 		op = BPF_OP(s->code);
   1115 		if (alter) {
   1116 			if (s->k == 0) {
   1117 				/* don't optimize away "sub #0"
   1118 				 * as it may be needed later to
   1119 				 * fixup the generated math code */
   1120 				if (op == BPF_ADD ||
   1121 				    op == BPF_LSH || op == BPF_RSH ||
   1122 				    op == BPF_OR || op == BPF_XOR) {
   1123 					s->code = NOP;
   1124 					break;
   1125 				}
   1126 				if (op == BPF_MUL || op == BPF_AND) {
   1127 					s->code = BPF_LD|BPF_IMM;
   1128 					val[A_ATOM] = K(s->k);
   1129 					break;
   1130 				}
   1131 			}
   1132 			if (opt_state->vmap[val[A_ATOM]].is_const) {
   1133 				fold_op(cstate, opt_state, s, val[A_ATOM], K(s->k));
   1134 				val[A_ATOM] = K(s->k);
   1135 				break;
   1136 			}
   1137 		}
   1138 		val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], K(s->k));
   1139 		break;
   1140 
   1141 	case BPF_ALU|BPF_ADD|BPF_X:
   1142 	case BPF_ALU|BPF_SUB|BPF_X:
   1143 	case BPF_ALU|BPF_MUL|BPF_X:
   1144 	case BPF_ALU|BPF_DIV|BPF_X:
   1145 	case BPF_ALU|BPF_MOD|BPF_X:
   1146 	case BPF_ALU|BPF_AND|BPF_X:
   1147 	case BPF_ALU|BPF_OR|BPF_X:
   1148 	case BPF_ALU|BPF_XOR|BPF_X:
   1149 	case BPF_ALU|BPF_LSH|BPF_X:
   1150 	case BPF_ALU|BPF_RSH|BPF_X:
   1151 		op = BPF_OP(s->code);
   1152 		if (alter && opt_state->vmap[val[X_ATOM]].is_const) {
   1153 			if (opt_state->vmap[val[A_ATOM]].is_const) {
   1154 				fold_op(cstate, opt_state, s, val[A_ATOM], val[X_ATOM]);
   1155 				val[A_ATOM] = K(s->k);
   1156 			}
   1157 			else {
   1158 				s->code = BPF_ALU|BPF_K|op;
   1159 				s->k = opt_state->vmap[val[X_ATOM]].const_val;
   1160 				opt_state->done = 0;
   1161 				val[A_ATOM] =
   1162 					F(opt_state, s->code, val[A_ATOM], K(s->k));
   1163 			}
   1164 			break;
   1165 		}
   1166 		/*
   1167 		 * Check if we're doing something to an accumulator
   1168 		 * that is 0, and simplify.  This may not seem like
   1169 		 * much of a simplification but it could open up further
   1170 		 * optimizations.
   1171 		 * XXX We could also check for mul by 1, etc.
   1172 		 */
   1173 		if (alter && opt_state->vmap[val[A_ATOM]].is_const
   1174 		    && opt_state->vmap[val[A_ATOM]].const_val == 0) {
   1175 			if (op == BPF_ADD || op == BPF_OR || op == BPF_XOR) {
   1176 				s->code = BPF_MISC|BPF_TXA;
   1177 				vstore(s, &val[A_ATOM], val[X_ATOM], alter);
   1178 				break;
   1179 			}
   1180 			else if (op == BPF_MUL || op == BPF_DIV || op == BPF_MOD ||
   1181 				 op == BPF_AND || op == BPF_LSH || op == BPF_RSH) {
   1182 				s->code = BPF_LD|BPF_IMM;
   1183 				s->k = 0;
   1184 				vstore(s, &val[A_ATOM], K(s->k), alter);
   1185 				break;
   1186 			}
   1187 			else if (op == BPF_NEG) {
   1188 				s->code = NOP;
   1189 				break;
   1190 			}
   1191 		}
   1192 		val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], val[X_ATOM]);
   1193 		break;
   1194 
   1195 	case BPF_MISC|BPF_TXA:
   1196 		vstore(s, &val[A_ATOM], val[X_ATOM], alter);
   1197 		break;
   1198 
   1199 	case BPF_LD|BPF_MEM:
   1200 		v = val[s->k];
   1201 		if (alter && opt_state->vmap[v].is_const) {
   1202 			s->code = BPF_LD|BPF_IMM;
   1203 			s->k = opt_state->vmap[v].const_val;
   1204 			opt_state->done = 0;
   1205 		}
   1206 		vstore(s, &val[A_ATOM], v, alter);
   1207 		break;
   1208 
   1209 	case BPF_MISC|BPF_TAX:
   1210 		vstore(s, &val[X_ATOM], val[A_ATOM], alter);
   1211 		break;
   1212 
   1213 	case BPF_LDX|BPF_MEM:
   1214 		v = val[s->k];
   1215 		if (alter && opt_state->vmap[v].is_const) {
   1216 			s->code = BPF_LDX|BPF_IMM;
   1217 			s->k = opt_state->vmap[v].const_val;
   1218 			opt_state->done = 0;
   1219 		}
   1220 		vstore(s, &val[X_ATOM], v, alter);
   1221 		break;
   1222 
   1223 	case BPF_ST:
   1224 		vstore(s, &val[s->k], val[A_ATOM], alter);
   1225 		break;
   1226 
   1227 	case BPF_STX:
   1228 		vstore(s, &val[s->k], val[X_ATOM], alter);
   1229 		break;
   1230 	}
   1231 }
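        /*
         * Example (illustrative): with 'alter' set, the BPF_LD|BPF_IND case
         * above turns the pair
         *
         *	ldx #14
         *	ld [x+2]
         *
         * into the absolute load "ld [16]", because the X register's value
         * number refers to a known constant (14) that can be folded into the
         * load offset.
         */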
   1232 
   1233 static void
   1234 deadstmt(opt_state_t *opt_state, register struct stmt *s, register struct stmt *last[])
   1235 {
   1236 	register int atom;
   1237 
   1238 	atom = atomuse(s);
   1239 	if (atom >= 0) {
   1240 		if (atom == AX_ATOM) {
   1241 			last[X_ATOM] = 0;
   1242 			last[A_ATOM] = 0;
   1243 		}
   1244 		else
   1245 			last[atom] = 0;
   1246 	}
   1247 	atom = atomdef(s);
   1248 	if (atom >= 0) {
   1249 		if (last[atom]) {
   1250 			opt_state->done = 0;
   1251 			last[atom]->code = NOP;
   1252 		}
   1253 		last[atom] = s;
   1254 	}
   1255 }
   1256 
   1257 static void
   1258 opt_deadstores(opt_state_t *opt_state, register struct block *b)
   1259 {
   1260 	register struct slist *s;
   1261 	register int atom;
   1262 	struct stmt *last[N_ATOMS];
   1263 
   1264 	memset((char *)last, 0, sizeof last);
   1265 
   1266 	for (s = b->stmts; s != 0; s = s->next)
   1267 		deadstmt(opt_state, &s->s, last);
   1268 	deadstmt(opt_state, &b->s, last);
   1269 
   1270 	for (atom = 0; atom < N_ATOMS; ++atom)
   1271 		if (last[atom] && !ATOMELEM(b->out_use, atom)) {
   1272 			last[atom]->code = NOP;
   1273 			opt_state->done = 0;
   1274 		}
   1275 }
   1276 
   1277 static void
   1278 opt_blk(compiler_state_t *cstate, opt_state_t *opt_state,
   1279     struct block *b, int do_stmts)
   1280 {
   1281 	struct slist *s;
   1282 	struct edge *p;
   1283 	int i;
   1284 	bpf_int32 aval, xval;
   1285 
   1286 #if 0
   1287 	for (s = b->stmts; s && s->next; s = s->next)
   1288 		if (BPF_CLASS(s->s.code) == BPF_JMP) {
   1289 			do_stmts = 0;
   1290 			break;
   1291 		}
   1292 #endif
   1293 
   1294 	/*
   1295 	 * Initialize the atom values.
   1296 	 */
   1297 	p = b->in_edges;
   1298 	if (p == 0) {
   1299 		/*
   1300 		 * We have no predecessors, so everything is undefined
   1301 		 * upon entry to this block.
   1302 		 */
   1303 		memset((char *)b->val, 0, sizeof(b->val));
   1304 	} else {
   1305 		/*
   1306 		 * Inherit values from our predecessors.
   1307 		 *
   1308 		 * First, get the values from the predecessor along the
   1309 		 * first edge leading to this node.
   1310 		 */
   1311 		memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
   1312 		/*
   1313 		 * Now look at all the other nodes leading to this node.
   1314 		 * If, for the predecessor along that edge, a register
   1315 		 * has a different value from the one we have (i.e.,
   1316 		 * control paths are merging, and the merging paths
   1317 		 * assign different values to that register), give the
   1318 		 * register the undefined value of 0.
   1319 		 */
   1320 		while ((p = p->next) != NULL) {
   1321 			for (i = 0; i < N_ATOMS; ++i)
   1322 				if (b->val[i] != p->pred->val[i])
   1323 					b->val[i] = 0;
   1324 		}
   1325 	}
   1326 	aval = b->val[A_ATOM];
   1327 	xval = b->val[X_ATOM];
   1328 	for (s = b->stmts; s; s = s->next)
   1329 		opt_stmt(cstate, opt_state, &s->s, b->val, do_stmts);
   1330 
   1331 	/*
   1332 	 * This is a special case: if we don't use anything from this
   1333 	 * block, and we load the accumulator or index register with a
   1334 	 * value that is already there, or if this block is a return,
   1335 	 * eliminate all the statements.
   1336 	 *
   1337 	 * XXX - what if it does a store?
   1338 	 *
   1339 	 * XXX - why does it matter whether we use anything from this
   1340 	 * block?  If the accumulator or index register doesn't change
   1341 	 * its value, isn't that OK even if we use that value?
   1342 	 *
   1343 	 * XXX - if we load the accumulator with a different value,
   1344 	 * and the block ends with a conditional branch, we obviously
   1345 	 * can't eliminate it, as the branch depends on that value.
   1346 	 * For the index register, the conditional branch only depends
   1347 	 * on the index register value if the test is against the index
   1348 	 * register value rather than a constant; if nothing uses the
   1349 	 * value we put into the index register, and we're not testing
   1350 	 * against the index register's value, and there aren't any
   1351 	 * other problems that would keep us from eliminating this
   1352 	 * block, can we eliminate it?
   1353 	 */
   1354 	if (do_stmts &&
   1355 	    ((b->out_use == 0 &&
   1356 	      aval != VAL_UNKNOWN && b->val[A_ATOM] == aval &&
   1357 	      xval != VAL_UNKNOWN && b->val[X_ATOM] == xval) ||
   1358 	     BPF_CLASS(b->s.code) == BPF_RET)) {
   1359 		if (b->stmts != 0) {
   1360 			b->stmts = 0;
   1361 			opt_state->done = 0;
   1362 		}
   1363 	} else {
   1364 		opt_peep(opt_state, b);
   1365 		opt_deadstores(opt_state, b);
   1366 	}
   1367 	/*
   1368 	 * Set up values for branch optimizer.
   1369 	 */
   1370 	if (BPF_SRC(b->s.code) == BPF_K)
   1371 		b->oval = K(b->s.k);
   1372 	else
   1373 		b->oval = b->val[X_ATOM];
   1374 	b->et.code = b->s.code;
   1375 	b->ef.code = -b->s.code;
   1376 }
   1377 
   1378 /*
   1379  * Return true if any register that is used on exit from 'succ' has
   1380  * an exit value that is different from the corresponding exit value
   1381  * from 'b'.
   1382  */
   1383 static int
   1384 use_conflict(struct block *b, struct block *succ)
   1385 {
   1386 	int atom;
   1387 	atomset use = succ->out_use;
   1388 
   1389 	if (use == 0)
   1390 		return 0;
   1391 
   1392 	for (atom = 0; atom < N_ATOMS; ++atom)
   1393 		if (ATOMELEM(use, atom))
   1394 			if (b->val[atom] != succ->val[atom])
   1395 				return 1;
   1396 	return 0;
   1397 }
   1398 
   1399 static struct block *
   1400 fold_edge(struct block *child, struct edge *ep)
   1401 {
   1402 	int sense;
   1403 	int aval0, aval1, oval0, oval1;
   1404 	int code = ep->code;
   1405 
   1406 	if (code < 0) {
   1407 		code = -code;
   1408 		sense = 0;
   1409 	} else
   1410 		sense = 1;
   1411 
   1412 	if (child->s.code != code)
   1413 		return 0;
   1414 
   1415 	aval0 = child->val[A_ATOM];
   1416 	oval0 = child->oval;
   1417 	aval1 = ep->pred->val[A_ATOM];
   1418 	oval1 = ep->pred->oval;
   1419 
   1420 	if (aval0 != aval1)
   1421 		return 0;
   1422 
   1423 	if (oval0 == oval1)
   1424 		/*
   1425 		 * The operands of the branch instructions are
   1426 		 * identical, so the result is true if a true
   1427 		 * branch was taken to get here, otherwise false.
   1428 		 */
   1429 		return sense ? JT(child) : JF(child);
   1430 
   1431 	if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
   1432 		/*
   1433 		 * At this point, we only know the comparison if we
   1434 		 * came down the true branch, and it was an equality
   1435 		 * comparison with a constant.
   1436 		 *
   1437 		 * I.e., if we came down the true branch, and the branch
   1438 		 * was an equality comparison with a constant, we know the
   1439 		 * accumulator contains that constant.  If we came down
   1440 		 * the false branch, or the comparison wasn't with a
   1441 		 * constant, we don't know what was in the accumulator.
   1442 		 *
   1443 		 * We rely on the fact that distinct constants have distinct
   1444 		 * value numbers.
   1445 		 */
   1446 		return JF(child);
   1447 
   1448 	return 0;
   1449 }
   1450 
   1451 static void
   1452 opt_j(opt_state_t *opt_state, struct edge *ep)
   1453 {
   1454 	register int i, k;
   1455 	register struct block *target;
   1456 
   1457 	if (JT(ep->succ) == 0)
   1458 		return;
   1459 
   1460 	if (JT(ep->succ) == JF(ep->succ)) {
   1461 		/*
   1462 		 * Common branch targets can be eliminated, provided
   1463 		 * there is no data dependency.
   1464 		 */
   1465 		if (!use_conflict(ep->pred, ep->succ->et.succ)) {
   1466 			opt_state->done = 0;
   1467 			ep->succ = JT(ep->succ);
   1468 		}
   1469 	}
   1470 	/*
   1471 	 * For each edge dominator that matches the successor of this
   1472 	 * edge, promote the edge successor to its grandchild.
   1473 	 *
   1474 	 * XXX We violate the set abstraction here in favor of a reasonably
   1475 	 * efficient loop.
   1476 	 */
   1477  top:
   1478 	for (i = 0; i < opt_state->edgewords; ++i) {
   1479 		register bpf_u_int32 x = ep->edom[i];
   1480 
   1481 		while (x != 0) {
   1482 			k = lowest_set_bit(x);
   1483 			x &=~ (1 << k);
   1484 			k += i * BITS_PER_WORD;
   1485 
   1486 			target = fold_edge(ep->succ, opt_state->edges[k]);
   1487 			/*
   1488 			 * Check that there is no data dependency between
   1489 			 * nodes that will be violated if we move the edge.
   1490 			 */
   1491 			if (target != 0 && !use_conflict(ep->pred, target)) {
   1492 				opt_state->done = 0;
   1493 				ep->succ = target;
   1494 				if (JT(target) != 0)
   1495 					/*
   1496 					 * Start over unless we hit a leaf.
   1497 					 */
   1498 					goto top;
   1499 				return;
   1500 			}
   1501 		}
   1502 	}
   1503 }
   1504 
   1505 
   1506 static void
   1507 or_pullup(opt_state_t *opt_state, struct block *b)
   1508 {
   1509 	int val, at_top;
   1510 	struct block *pull;
   1511 	struct block **diffp, **samep;
   1512 	struct edge *ep;
   1513 
   1514 	ep = b->in_edges;
   1515 	if (ep == 0)
   1516 		return;
   1517 
   1518 	/*
   1519 	 * Make sure each predecessor loads the same value.
   1520 	 * XXX why?
   1521 	 */
   1522 	val = ep->pred->val[A_ATOM];
   1523 	for (ep = ep->next; ep != 0; ep = ep->next)
   1524 		if (val != ep->pred->val[A_ATOM])
   1525 			return;
   1526 
   1527 	if (JT(b->in_edges->pred) == b)
   1528 		diffp = &JT(b->in_edges->pred);
   1529 	else
   1530 		diffp = &JF(b->in_edges->pred);
   1531 
   1532 	at_top = 1;
   1533 	for (;;) {
   1534 		if (*diffp == 0)
   1535 			return;
   1536 
   1537 		if (JT(*diffp) != JT(b))
   1538 			return;
   1539 
   1540 		if (!SET_MEMBER((*diffp)->dom, b->id))
   1541 			return;
   1542 
   1543 		if ((*diffp)->val[A_ATOM] != val)
   1544 			break;
   1545 
   1546 		diffp = &JF(*diffp);
   1547 		at_top = 0;
   1548 	}
   1549 	samep = &JF(*diffp);
   1550 	for (;;) {
   1551 		if (*samep == 0)
   1552 			return;
   1553 
   1554 		if (JT(*samep) != JT(b))
   1555 			return;
   1556 
   1557 		if (!SET_MEMBER((*samep)->dom, b->id))
   1558 			return;
   1559 
   1560 		if ((*samep)->val[A_ATOM] == val)
   1561 			break;
   1562 
   1563 		/* XXX Need to check that there are no data dependencies
   1564 		   between diffp and samep.  Currently, the code generator
   1565 		   will not produce such dependencies. */
   1566 		samep = &JF(*samep);
   1567 	}
   1568 #ifdef notdef
   1569 	/* XXX This doesn't cover everything. */
   1570 	for (i = 0; i < N_ATOMS; ++i)
   1571 		if ((*samep)->val[i] != pred->val[i])
   1572 			return;
   1573 #endif
   1574 	/* Pull up the node. */
   1575 	pull = *samep;
   1576 	*samep = JF(pull);
   1577 	JF(pull) = *diffp;
   1578 
   1579 	/*
   1580 	 * At the top of the chain, each predecessor needs to point at the
   1581 	 * pulled up node.  Inside the chain, there is only one predecessor
   1582 	 * to worry about.
   1583 	 */
   1584 	if (at_top) {
   1585 		for (ep = b->in_edges; ep != 0; ep = ep->next) {
   1586 			if (JT(ep->pred) == b)
   1587 				JT(ep->pred) = pull;
   1588 			else
   1589 				JF(ep->pred) = pull;
   1590 		}
   1591 	}
   1592 	else
   1593 		*diffp = pull;
   1594 
   1595 	opt_state->done = 0;
   1596 }
   1597 
   1598 static void
   1599 and_pullup(opt_state_t *opt_state, struct block *b)
   1600 {
   1601 	int val, at_top;
   1602 	struct block *pull;
   1603 	struct block **diffp, **samep;
   1604 	struct edge *ep;
   1605 
   1606 	ep = b->in_edges;
   1607 	if (ep == 0)
   1608 		return;
   1609 
   1610 	/*
   1611 	 * Make sure each predecessor loads the same value.
   1612 	 */
   1613 	val = ep->pred->val[A_ATOM];
   1614 	for (ep = ep->next; ep != 0; ep = ep->next)
   1615 		if (val != ep->pred->val[A_ATOM])
   1616 			return;
   1617 
   1618 	if (JT(b->in_edges->pred) == b)
   1619 		diffp = &JT(b->in_edges->pred);
   1620 	else
   1621 		diffp = &JF(b->in_edges->pred);
   1622 
   1623 	at_top = 1;
   1624 	for (;;) {
   1625 		if (*diffp == 0)
   1626 			return;
   1627 
   1628 		if (JF(*diffp) != JF(b))
   1629 			return;
   1630 
   1631 		if (!SET_MEMBER((*diffp)->dom, b->id))
   1632 			return;
   1633 
   1634 		if ((*diffp)->val[A_ATOM] != val)
   1635 			break;
   1636 
   1637 		diffp = &JT(*diffp);
   1638 		at_top = 0;
   1639 	}
   1640 	samep = &JT(*diffp);
   1641 	for (;;) {
   1642 		if (*samep == 0)
   1643 			return;
   1644 
   1645 		if (JF(*samep) != JF(b))
   1646 			return;
   1647 
   1648 		if (!SET_MEMBER((*samep)->dom, b->id))
   1649 			return;
   1650 
   1651 		if ((*samep)->val[A_ATOM] == val)
   1652 			break;
   1653 
   1654 		/* XXX Need to check that there are no data dependencies
   1655 		   between diffp and samep.  Currently, the code generator
   1656 		   will not produce such dependencies. */
   1657 		samep = &JT(*samep);
   1658 	}
   1659 #ifdef notdef
   1660 	/* XXX This doesn't cover everything. */
   1661 	for (i = 0; i < N_ATOMS; ++i)
   1662 		if ((*samep)->val[i] != pred->val[i])
   1663 			return;
   1664 #endif
   1665 	/* Pull up the node. */
   1666 	pull = *samep;
   1667 	*samep = JT(pull);
   1668 	JT(pull) = *diffp;
   1669 
   1670 	/*
   1671 	 * At the top of the chain, each predecessor needs to point at the
   1672 	 * pulled up node.  Inside the chain, there is only one predecessor
   1673 	 * to worry about.
   1674 	 */
   1675 	if (at_top) {
   1676 		for (ep = b->in_edges; ep != 0; ep = ep->next) {
   1677 			if (JT(ep->pred) == b)
   1678 				JT(ep->pred) = pull;
   1679 			else
   1680 				JF(ep->pred) = pull;
   1681 		}
   1682 	}
   1683 	else
   1684 		*diffp = pull;
   1685 
   1686 	opt_state->done = 0;
   1687 }
   1688 
   1689 static void
   1690 opt_blks(compiler_state_t *cstate, opt_state_t *opt_state, struct icode *ic,
   1691     int do_stmts)
   1692 {
   1693 	int i, maxlevel;
   1694 	struct block *p;
   1695 
   1696 	init_val(opt_state);
   1697 	maxlevel = ic->root->level;
   1698 
   1699 	find_inedges(opt_state, ic->root);
   1700 	for (i = maxlevel; i >= 0; --i)
   1701 		for (p = opt_state->levels[i]; p; p = p->link)
   1702 			opt_blk(cstate, opt_state, p, do_stmts);
   1703 
   1704 	if (do_stmts)
   1705 		/*
   1706 		 * No point trying to move branches; it can't possibly
   1707 		 * make a difference at this point.
   1708 		 */
   1709 		return;
   1710 
   1711 	for (i = 1; i <= maxlevel; ++i) {
   1712 		for (p = opt_state->levels[i]; p; p = p->link) {
   1713 			opt_j(opt_state, &p->et);
   1714 			opt_j(opt_state, &p->ef);
   1715 		}
   1716 	}
   1717 
   1718 	find_inedges(opt_state, ic->root);
   1719 	for (i = 1; i <= maxlevel; ++i) {
   1720 		for (p = opt_state->levels[i]; p; p = p->link) {
   1721 			or_pullup(opt_state, p);
   1722 			and_pullup(opt_state, p);
   1723 		}
   1724 	}
   1725 }
   1726 
   1727 static inline void
   1728 link_inedge(struct edge *parent, struct block *child)
   1729 {
   1730 	parent->next = child->in_edges;
   1731 	child->in_edges = parent;
   1732 }
   1733 
   1734 static void
   1735 find_inedges(opt_state_t *opt_state, struct block *root)
   1736 {
   1737 	int i;
   1738 	struct block *b;
   1739 
   1740 	for (i = 0; i < opt_state->n_blocks; ++i)
   1741 		opt_state->blocks[i]->in_edges = 0;
   1742 
   1743 	/*
   1744 	 * Traverse the graph, adding each edge to the predecessor
   1745 	 * list of its successors.  Skip the leaves (i.e. level 0).
   1746 	 */
   1747 	for (i = root->level; i > 0; --i) {
   1748 		for (b = opt_state->levels[i]; b != 0; b = b->link) {
   1749 			link_inedge(&b->et, JT(b));
   1750 			link_inedge(&b->ef, JF(b));
   1751 		}
   1752 	}
   1753 }
   1754 
   1755 static void
   1756 opt_root(struct block **b)
   1757 {
   1758 	struct slist *tmp, *s;
   1759 
   1760 	s = (*b)->stmts;
   1761 	(*b)->stmts = 0;
   1762 	while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
   1763 		*b = JT(*b);
   1764 
   1765 	tmp = (*b)->stmts;
   1766 	if (tmp != 0)
   1767 		sappend(s, tmp);
   1768 	(*b)->stmts = s;
   1769 
   1770 	/*
   1771 	 * If the root node is a return, then there is no
   1772 	 * point executing any statements (since the bpf machine
   1773 	 * has no side effects).
   1774 	 */
   1775 	if (BPF_CLASS((*b)->s.code) == BPF_RET)
   1776 		(*b)->stmts = 0;
   1777 }
   1778 
   1779 static void
   1780 opt_loop(compiler_state_t *cstate, opt_state_t *opt_state, struct icode *ic,
   1781     int do_stmts)
   1782 {
   1783 
   1784 #ifdef BDEBUG
   1785 	if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
   1786 		printf("opt_loop(root, %d) begin\n", do_stmts);
   1787 		opt_dump(cstate, ic);
   1788 	}
   1789 #endif
   1790 	do {
   1791 		opt_state->done = 1;
   1792 		find_levels(opt_state, ic);
   1793 		find_dom(opt_state, ic->root);
   1794 		find_closure(opt_state, ic->root);
   1795 		find_ud(opt_state, ic->root);
   1796 		find_edom(opt_state, ic->root);
   1797 		opt_blks(cstate, opt_state, ic, do_stmts);
   1798 #ifdef BDEBUG
   1799 		if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
   1800 			printf("opt_loop(root, %d) bottom, done=%d\n", do_stmts, opt_state->done);
   1801 			opt_dump(cstate, ic);
   1802 		}
   1803 #endif
   1804 	} while (!opt_state->done);
   1805 }
   1806 
   1807 /*
   1808  * Optimize the filter code in its dag representation.
   1809  */
   1810 void
   1811 bpf_optimize(compiler_state_t *cstate, struct icode *ic)
   1812 {
   1813 	opt_state_t opt_state;
   1814 
   1815 	opt_init(cstate, &opt_state, ic);
   1816 	opt_loop(cstate, &opt_state, ic, 0);
   1817 	opt_loop(cstate, &opt_state, ic, 1);
   1818 	intern_blocks(&opt_state, ic);
   1819 #ifdef BDEBUG
   1820 	if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
   1821 		printf("after intern_blocks()\n");
   1822 		opt_dump(cstate, ic);
   1823 	}
   1824 #endif
   1825 	opt_root(&ic->root);
   1826 #ifdef BDEBUG
   1827 	if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
   1828 		printf("after opt_root()\n");
   1829 		opt_dump(cstate, ic);
   1830 	}
   1831 #endif
   1832 	opt_cleanup(&opt_state);
   1833 }
   1834 
   1835 static void
   1836 make_marks(struct icode *ic, struct block *p)
   1837 {
   1838 	if (!isMarked(ic, p)) {
   1839 		Mark(ic, p);
   1840 		if (BPF_CLASS(p->s.code) != BPF_RET) {
   1841 			make_marks(ic, JT(p));
   1842 			make_marks(ic, JF(p));
   1843 		}
   1844 	}
   1845 }
   1846 
   1847 /*
    1848  * Mark code array such that isMarked(ic, p) is true
   1849  * only for nodes that are alive.
   1850  */
   1851 static void
   1852 mark_code(struct icode *ic)
   1853 {
   1854 	ic->cur_mark += 1;
   1855 	make_marks(ic, ic->root);
   1856 }
   1857 
   1858 /*
    1859  * True iff the two stmt lists contain the same sequence of non-NOP
    1860  * statements (identical opcodes and operands).
   1861  */
   1862 static int
   1863 eq_slist(struct slist *x, struct slist *y)
   1864 {
   1865 	for (;;) {
   1866 		while (x && x->s.code == NOP)
   1867 			x = x->next;
   1868 		while (y && y->s.code == NOP)
   1869 			y = y->next;
   1870 		if (x == 0)
   1871 			return y == 0;
   1872 		if (y == 0)
   1873 			return x == 0;
   1874 		if (x->s.code != y->s.code || x->s.k != y->s.k)
   1875 			return 0;
   1876 		x = x->next;
   1877 		y = y->next;
   1878 	}
   1879 }
   1880 
   1881 static inline int
   1882 eq_blk(struct block *b0, struct block *b1)
   1883 {
   1884 	if (b0->s.code == b1->s.code &&
   1885 	    b0->s.k == b1->s.k &&
   1886 	    b0->et.succ == b1->et.succ &&
   1887 	    b0->ef.succ == b1->ef.succ)
   1888 		return eq_slist(b0->stmts, b1->stmts);
   1889 	return 0;
   1890 }
   1891 
   1892 static void
   1893 intern_blocks(opt_state_t *opt_state, struct icode *ic)
   1894 {
   1895 	struct block *p;
   1896 	int i, j;
   1897 	int done1; /* don't shadow global */
   1898  top:
   1899 	done1 = 1;
   1900 	for (i = 0; i < opt_state->n_blocks; ++i)
   1901 		opt_state->blocks[i]->link = 0;
   1902 
   1903 	mark_code(ic);
   1904 
   1905 	for (i = opt_state->n_blocks - 1; --i >= 0; ) {
   1906 		if (!isMarked(ic, opt_state->blocks[i]))
   1907 			continue;
   1908 		for (j = i + 1; j < opt_state->n_blocks; ++j) {
   1909 			if (!isMarked(ic, opt_state->blocks[j]))
   1910 				continue;
   1911 			if (eq_blk(opt_state->blocks[i], opt_state->blocks[j])) {
   1912 				opt_state->blocks[i]->link = opt_state->blocks[j]->link ?
   1913 					opt_state->blocks[j]->link : opt_state->blocks[j];
   1914 				break;
   1915 			}
   1916 		}
   1917 	}
   1918 	for (i = 0; i < opt_state->n_blocks; ++i) {
   1919 		p = opt_state->blocks[i];
   1920 		if (JT(p) == 0)
   1921 			continue;
   1922 		if (JT(p)->link) {
   1923 			done1 = 0;
   1924 			JT(p) = JT(p)->link;
   1925 		}
   1926 		if (JF(p)->link) {
   1927 			done1 = 0;
   1928 			JF(p) = JF(p)->link;
   1929 		}
   1930 	}
   1931 	if (!done1)
   1932 		goto top;
   1933 }
   1934 
   1935 static void
   1936 opt_cleanup(opt_state_t *opt_state)
   1937 {
   1938 	free((void *)opt_state->vnode_base);
   1939 	free((void *)opt_state->vmap);
   1940 	free((void *)opt_state->edges);
   1941 	free((void *)opt_state->space);
   1942 	free((void *)opt_state->levels);
   1943 	free((void *)opt_state->blocks);
   1944 }
   1945 
   1946 /*
   1947  * Return the number of stmts in 's'.
   1948  */
   1949 static u_int
   1950 slength(struct slist *s)
   1951 {
   1952 	u_int n = 0;
   1953 
   1954 	for (; s; s = s->next)
   1955 		if (s->s.code != NOP)
   1956 			++n;
   1957 	return n;
   1958 }
   1959 
   1960 /*
   1961  * Return the number of nodes reachable by 'p'.
   1962  * All nodes should be initially unmarked.
   1963  */
   1964 static int
   1965 count_blocks(struct icode *ic, struct block *p)
   1966 {
   1967 	if (p == 0 || isMarked(ic, p))
   1968 		return 0;
   1969 	Mark(ic, p);
   1970 	return count_blocks(ic, JT(p)) + count_blocks(ic, JF(p)) + 1;
   1971 }
   1972 
   1973 /*
    1974  * Do a depth-first search on the flow graph, numbering the
    1975  * basic blocks and entering them into the 'blocks' array.
   1976  */
   1977 static void
   1978 number_blks_r(opt_state_t *opt_state, struct icode *ic, struct block *p)
   1979 {
   1980 	int n;
   1981 
   1982 	if (p == 0 || isMarked(ic, p))
   1983 		return;
   1984 
   1985 	Mark(ic, p);
   1986 	n = opt_state->n_blocks++;
   1987 	p->id = n;
   1988 	opt_state->blocks[n] = p;
   1989 
   1990 	number_blks_r(opt_state, ic, JT(p));
   1991 	number_blks_r(opt_state, ic, JF(p));
   1992 }
   1993 
   1994 /*
   1995  * Return the number of stmts in the flowgraph reachable by 'p'.
   1996  * The nodes should be unmarked before calling.
   1997  *
   1998  * Note that "stmts" means "instructions", and that this includes
   1999  *
   2000  *	side-effect statements in 'p' (slength(p->stmts));
   2001  *
   2002  *	statements in the true branch from 'p' (count_stmts(JT(p)));
   2003  *
   2004  *	statements in the false branch from 'p' (count_stmts(JF(p)));
   2005  *
   2006  *	the conditional jump itself (1);
   2007  *
   2008  *	an extra long jump if the true branch requires it (p->longjt);
   2009  *
   2010  *	an extra long jump if the false branch requires it (p->longjf).
   2011  */
   2012 static u_int
   2013 count_stmts(struct icode *ic, struct block *p)
   2014 {
   2015 	u_int n;
   2016 
   2017 	if (p == 0 || isMarked(ic, p))
   2018 		return 0;
   2019 	Mark(ic, p);
   2020 	n = count_stmts(ic, JT(p)) + count_stmts(ic, JF(p));
   2021 	return slength(p->stmts) + n + 1 + p->longjt + p->longjf;
   2022 }
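
         /*
          * Worked example with hypothetical numbers: a conditional block with
          * two non-NOP statements, longjt == 1 and longjf == 0 contributes
          * 2 + 1 (the conditional jump itself) + 1 (the extra long jump) = 4
          * instructions, plus whatever count_stmts() finds through its true
          * and false branches.
          */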
   2023 
   2024 /*
   2025  * Allocate memory.  All allocation is done before optimization
   2026  * is begun.  A linear bound on the size of all data structures is computed
   2027  * from the total number of blocks and/or statements.
   2028  */
   2029 static void
   2030 opt_init(compiler_state_t *cstate, opt_state_t *opt_state, struct icode *ic)
   2031 {
   2032 	bpf_u_int32 *p;
   2033 	int i, n, max_stmts;
   2034 
   2035 	/*
   2036 	 * First, count the blocks, so we can malloc an array to map
   2037 	 * block number to block.  Then, put the blocks into the array.
   2038 	 */
   2039 	unMarkAll(ic);
   2040 	n = count_blocks(ic, ic->root);
   2041 	opt_state->blocks = (struct block **)calloc(n, sizeof(*opt_state->blocks));
   2042 	if (opt_state->blocks == NULL)
   2043 		bpf_error(cstate, "malloc");
   2044 	unMarkAll(ic);
   2045 	opt_state->n_blocks = 0;
   2046 	number_blks_r(opt_state, ic, ic->root);
   2047 
   2048 	opt_state->n_edges = 2 * opt_state->n_blocks;
   2049 	opt_state->edges = (struct edge **)calloc(opt_state->n_edges, sizeof(*opt_state->edges));
   2050 	if (opt_state->edges == NULL)
   2051 		bpf_error(cstate, "malloc");
   2052 
   2053 	/*
   2054 	 * The number of levels is bounded by the number of nodes.
   2055 	 */
   2056 	opt_state->levels = (struct block **)calloc(opt_state->n_blocks, sizeof(*opt_state->levels));
   2057 	if (opt_state->levels == NULL)
   2058 		bpf_error(cstate, "malloc");
   2059 
   2060 	opt_state->edgewords = opt_state->n_edges / (8 * sizeof(bpf_u_int32)) + 1;
   2061 	opt_state->nodewords = opt_state->n_blocks / (8 * sizeof(bpf_u_int32)) + 1;
   2062 
   2063 	/* XXX */
   2064 	opt_state->space = (bpf_u_int32 *)malloc(2 * opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->space)
   2065 				 + opt_state->n_edges * opt_state->edgewords * sizeof(*opt_state->space));
   2066 	if (opt_state->space == NULL)
   2067 		bpf_error(cstate, "malloc");
   2068 	p = opt_state->space;
   2069 	opt_state->all_dom_sets = p;
   2070 	for (i = 0; i < n; ++i) {
   2071 		opt_state->blocks[i]->dom = p;
   2072 		p += opt_state->nodewords;
   2073 	}
   2074 	opt_state->all_closure_sets = p;
   2075 	for (i = 0; i < n; ++i) {
   2076 		opt_state->blocks[i]->closure = p;
   2077 		p += opt_state->nodewords;
   2078 	}
   2079 	opt_state->all_edge_sets = p;
   2080 	for (i = 0; i < n; ++i) {
   2081 		register struct block *b = opt_state->blocks[i];
   2082 
   2083 		b->et.edom = p;
   2084 		p += opt_state->edgewords;
   2085 		b->ef.edom = p;
   2086 		p += opt_state->edgewords;
   2087 		b->et.id = i;
   2088 		opt_state->edges[i] = &b->et;
   2089 		b->ef.id = opt_state->n_blocks + i;
   2090 		opt_state->edges[opt_state->n_blocks + i] = &b->ef;
   2091 		b->et.pred = b;
   2092 		b->ef.pred = b;
   2093 	}
   2094 	max_stmts = 0;
   2095 	for (i = 0; i < n; ++i)
   2096 		max_stmts += slength(opt_state->blocks[i]->stmts) + 1;
   2097 	/*
   2098 	 * We allocate at most 3 value numbers per statement,
   2099 	 * so this is an upper bound on the number of valnodes
   2100 	 * we'll need.
   2101 	 */
   2102 	opt_state->maxval = 3 * max_stmts;
   2103 	opt_state->vmap = (struct vmapinfo *)calloc(opt_state->maxval, sizeof(*opt_state->vmap));
   2104 	opt_state->vnode_base = (struct valnode *)calloc(opt_state->maxval, sizeof(*opt_state->vnode_base));
   2105 	if (opt_state->vmap == NULL || opt_state->vnode_base == NULL)
   2106 		bpf_error(cstate, "malloc");
   2107 }
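
         /*
          * Sizing sketch, with purely illustrative numbers: 10 blocks give 20
          * edges, so with 32-bit set words nodewords = 10/32 + 1 = 1 and
          * edgewords = 20/32 + 1 = 1, and the single "space" allocation above
          * holds 2*10*1 + 20*1 = 40 words for the dominator, closure and
          * edge-dominator bit sets.
          */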
   2108 
   2109 /*
   2110  * This is only used when supporting optimizer debugging.  It is
   2111  * global state, so do *not* do more than one compile in parallel
   2112  * and expect it to provide meaningful information.
   2113  */
   2114 #ifdef BDEBUG
   2115 int bids[NBIDS];
   2116 #endif
   2117 
   2118 /*
   2119  * Returns true if successful.  Returns false if a branch has
   2120  * an offset that is too large.  If so, we have marked that
   2121  * branch so that on a subsequent iteration, it will be treated
   2122  * properly.
   2123  */
   2124 static int
   2125 convert_code_r(compiler_state_t *cstate, conv_state_t *conv_state,
   2126     struct icode *ic, struct block *p)
   2127 {
   2128 	struct bpf_insn *dst;
   2129 	struct slist *src;
   2130 	u_int slen;
   2131 	u_int off;
   2132 	u_int extrajmps;	/* number of extra jumps inserted */
   2133 	struct slist **offset = NULL;
   2134 
   2135 	if (p == 0 || isMarked(ic, p))
   2136 		return (1);
   2137 	Mark(ic, p);
   2138 
   2139 	if (convert_code_r(cstate, conv_state, ic, JF(p)) == 0)
   2140 		return (0);
   2141 	if (convert_code_r(cstate, conv_state, ic, JT(p)) == 0)
   2142 		return (0);
   2143 
   2144 	slen = slength(p->stmts);
   2145 	dst = conv_state->ftail -= (slen + 1 + p->longjt + p->longjf);
   2146 		/* inflate length by any extra jumps */
   2147 
   2148 	p->offset = (int)(dst - conv_state->fstart);
   2149 
   2150 	/* generate offset[] for convenience  */
   2151 	if (slen) {
   2152 		offset = (struct slist **)calloc(slen, sizeof(struct slist *));
   2153 		if (!offset) {
   2154 			bpf_error(cstate, "not enough core");
   2155 			/*NOTREACHED*/
   2156 		}
   2157 	}
   2158 	src = p->stmts;
   2159 	for (off = 0; off < slen && src; off++) {
   2160 #if 0
   2161 		printf("off=%d src=%x\n", off, src);
   2162 #endif
   2163 		offset[off] = src;
   2164 		src = src->next;
   2165 	}
   2166 
   2167 	off = 0;
   2168 	for (src = p->stmts; src; src = src->next) {
   2169 		if (src->s.code == NOP)
   2170 			continue;
   2171 		dst->code = (u_short)src->s.code;
   2172 		dst->k = src->s.k;
   2173 
   2174 		/* fill block-local relative jump */
   2175 		if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) {
   2176 #if 0
   2177 			if (src->s.jt || src->s.jf) {
   2178 				bpf_error(cstate, "illegal jmp destination");
   2179 				/*NOTREACHED*/
   2180 			}
   2181 #endif
   2182 			goto filled;
   2183 		}
   2184 		if (off == slen - 2)	/*???*/
   2185 			goto filled;
   2186 
   2187 	    {
   2188 		u_int i;
   2189 		int jt, jf;
   2190 		const char ljerr[] = "%s for block-local relative jump: off=%d";
   2191 
   2192 #if 0
   2193 		printf("code=%x off=%d %x %x\n", src->s.code,
   2194 			off, src->s.jt, src->s.jf);
   2195 #endif
   2196 
   2197 		if (!src->s.jt || !src->s.jf) {
   2198 			bpf_error(cstate, ljerr, "no jmp destination", off);
   2199 			/*NOTREACHED*/
   2200 		}
   2201 
   2202 		jt = jf = 0;
   2203 		for (i = 0; i < slen; i++) {
   2204 			if (offset[i] == src->s.jt) {
   2205 				if (jt) {
   2206 					bpf_error(cstate, ljerr, "multiple matches", off);
   2207 					/*NOTREACHED*/
   2208 				}
   2209 
   2210 				if (i - off - 1 >= 256) {
   2211 					bpf_error(cstate, ljerr, "out-of-range jump", off);
   2212 					/*NOTREACHED*/
   2213 				}
   2214 				dst->jt = (u_char)(i - off - 1);
   2215 				jt++;
   2216 			}
   2217 			if (offset[i] == src->s.jf) {
   2218 				if (jf) {
   2219 					bpf_error(cstate, ljerr, "multiple matches", off);
   2220 					/*NOTREACHED*/
   2221 				}
   2222 				if (i - off - 1 >= 256) {
   2223 					bpf_error(cstate, ljerr, "out-of-range jump", off);
   2224 					/*NOTREACHED*/
   2225 				}
   2226 				dst->jf = (u_char)(i - off - 1);
   2227 				jf++;
   2228 			}
   2229 		}
   2230 		if (!jt || !jf) {
   2231 			bpf_error(cstate, ljerr, "no destination found", off);
   2232 			/*NOTREACHED*/
   2233 		}
   2234 	    }
   2235 filled:
   2236 		++dst;
   2237 		++off;
   2238 	}
   2239 	if (offset)
   2240 		free(offset);
   2241 
   2242 #ifdef BDEBUG
   2243 	if (dst - conv_state->fstart < NBIDS)
   2244 		bids[dst - conv_state->fstart] = p->id + 1;
   2245 #endif
   2246 	dst->code = (u_short)p->s.code;
   2247 	dst->k = p->s.k;
   2248 	if (JT(p)) {
   2249 		extrajmps = 0;
   2250 		off = JT(p)->offset - (p->offset + slen) - 1;
   2251 		if (off >= 256) {
   2252 		    /* offset too large for branch, must add a jump */
   2253 		    if (p->longjt == 0) {
   2254 		    	/* mark this instruction and retry */
   2255 			p->longjt++;
   2256 			return(0);
   2257 		    }
   2258 		    /* branch if T to following jump */
   2259 		    if (extrajmps >= 256) {
   2260 			bpf_error(cstate, "too many extra jumps");
   2261 			/*NOTREACHED*/
   2262 		    }
   2263 		    dst->jt = (u_char)extrajmps;
   2264 		    extrajmps++;
   2265 		    dst[extrajmps].code = BPF_JMP|BPF_JA;
   2266 		    dst[extrajmps].k = off - extrajmps;
   2267 		}
   2268 		else
   2269 		    dst->jt = (u_char)off;
   2270 		off = JF(p)->offset - (p->offset + slen) - 1;
   2271 		if (off >= 256) {
   2272 		    /* offset too large for branch, must add a jump */
   2273 		    if (p->longjf == 0) {
   2274 		    	/* mark this instruction and retry */
   2275 			p->longjf++;
   2276 			return(0);
   2277 		    }
   2278 		    /* branch if F to following jump */
   2279 		    /* if two jumps are inserted, F goes to second one */
   2280 		    if (extrajmps >= 256) {
   2281 			bpf_error(cstate, "too many extra jumps");
   2282 			/*NOTREACHED*/
   2283 		    }
   2284 		    dst->jf = (u_char)extrajmps;
   2285 		    extrajmps++;
   2286 		    dst[extrajmps].code = BPF_JMP|BPF_JA;
   2287 		    dst[extrajmps].k = off - extrajmps;
   2288 		}
   2289 		else
   2290 		    dst->jf = (u_char)off;
   2291 	}
   2292 	return (1);
   2293 }
   2294 
   2295 
   2296 /*
   2297  * Convert flowgraph intermediate representation to the
   2298  * BPF array representation.  Set *lenp to the number of instructions.
   2299  *
   2300  * This routine does *NOT* leak the memory pointed to by fp.  It *must
   2301  * not* do free(fp) before returning fp; doing so would make no sense,
   2302  * as the BPF array pointed to by the return value of icode_to_fcode()
   2303  * must be valid - it's being returned for use in a bpf_program structure.
   2304  *
   2305  * If it appears that icode_to_fcode() is leaking, the problem is that
   2306  * the program using pcap_compile() is failing to free the memory in
   2307  * the BPF program when it's done - the leak is in the program, not in
   2308  * the routine that happens to be allocating the memory.  (By analogy, if
   2309  * a program calls fopen() without ever calling fclose() on the FILE *,
   2310  * it will leak the FILE structure; the leak is not in fopen(), it's in
   2311  * the program.)  Change the program to use pcap_freecode() when it's
   2312  * done with the filter program.  See the pcap man page.
   2313  */
   2314 struct bpf_insn *
   2315 icode_to_fcode(compiler_state_t *cstate, struct icode *ic,
   2316     struct block *root, u_int *lenp)
   2317 {
   2318 	u_int n;
   2319 	struct bpf_insn *fp;
   2320 	conv_state_t conv_state;
   2321 
   2322 	/*
   2323 	 * Loop doing convert_code_r() until no branches remain
   2324 	 * with too-large offsets.
   2325 	 */
   2326 	for (;;) {
   2327 	    unMarkAll(ic);
   2328 	    n = *lenp = count_stmts(ic, root);
   2329 
   2330 	    fp = (struct bpf_insn *)malloc(sizeof(*fp) * n);
   2331 	    if (fp == NULL)
   2332 		    bpf_error(cstate, "malloc");
   2333 	    memset((char *)fp, 0, sizeof(*fp) * n);
   2334 	    conv_state.fstart = fp;
   2335 	    conv_state.ftail = fp + n;
   2336 
   2337 	    unMarkAll(ic);
   2338 	    if (convert_code_r(cstate, &conv_state, ic, root))
   2339 		break;
   2340 	    free(fp);
   2341 	}
   2342 
   2343 	return fp;
   2344 }
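
         /*
          * Ownership sketch from the application's point of view (illustrative
          * only; error handling abbreviated).  pcap_compile() ends up calling
          * icode_to_fcode(), and pcap_freecode() is what releases the array
          * that the comment above refers to:
          *
          *	struct bpf_program fprog;
          *
          *	if (pcap_compile(p, &fprog, "ip src host 1.1.1.1", 1,
          *	    PCAP_NETMASK_UNKNOWN) == -1)
          *		return (-1);
          *	if (pcap_setfilter(p, &fprog) == -1)
          *		return (-1);
          *	pcap_freecode(&fprog);		releases fprog.bf_insns
          */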
   2345 
   2346 /*
   2347  * Make a copy of a BPF program and put it in the "fcode" member of
   2348  * a "pcap_t".
   2349  *
   2350  * If we fail to allocate memory for the copy, fill in the "errbuf"
   2351  * member of the "pcap_t" with an error message, and return -1;
   2352  * otherwise, return 0.
   2353  */
   2354 int
   2355 install_bpf_program(pcap_t *p, struct bpf_program *fp)
   2356 {
   2357 	size_t prog_size;
   2358 
   2359 	/*
   2360 	 * Validate the program.
   2361 	 */
   2362 	if (!bpf_validate(fp->bf_insns, fp->bf_len)) {
   2363 		pcap_snprintf(p->errbuf, sizeof(p->errbuf),
   2364 			"BPF program is not valid");
   2365 		return (-1);
   2366 	}
   2367 
   2368 	/*
   2369 	 * Free up any already installed program.
   2370 	 */
   2371 	pcap_freecode(&p->fcode);
   2372 
   2373 	prog_size = sizeof(*fp->bf_insns) * fp->bf_len;
   2374 	p->fcode.bf_len = fp->bf_len;
   2375 	p->fcode.bf_insns = (struct bpf_insn *)malloc(prog_size);
   2376 	if (p->fcode.bf_insns == NULL) {
   2377 		pcap_fmt_errmsg_for_errno(p->errbuf, sizeof(p->errbuf),
   2378 		    errno, "malloc");
   2379 		return (-1);
   2380 	}
   2381 	memcpy(p->fcode.bf_insns, fp->bf_insns, prog_size);
   2382 	return (0);
   2383 }
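
         /*
          * Caller sketch (hypothetical, not part of the library): a capture
          * module's setfilter routine can simply propagate the result, since
          * on failure this routine has already formatted a message into the
          * pcap_t's error buffer, retrievable with pcap_geterr():
          *
          *	static int
          *	mymodule_setfilter(pcap_t *p, struct bpf_program *fp)
          *	{
          *		return (install_bpf_program(p, fp));
          *	}
          */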
   2384 
   2385 #ifdef BDEBUG
   2386 static void
   2387 dot_dump_node(struct icode *ic, struct block *block, struct bpf_program *prog,
   2388     FILE *out)
   2389 {
   2390 	int icount, noffset;
   2391 	int i;
   2392 
   2393 	if (block == NULL || isMarked(ic, block))
   2394 		return;
   2395 	Mark(ic, block);
   2396 
   2397 	icount = slength(block->stmts) + 1 + block->longjt + block->longjf;
   2398 	noffset = min(block->offset + icount, (int)prog->bf_len);
   2399 
   2400 	fprintf(out, "\tblock%d [shape=ellipse, id=\"block-%d\" label=\"BLOCK%d\\n", block->id, block->id, block->id);
   2401 	for (i = block->offset; i < noffset; i++) {
   2402 		fprintf(out, "\\n%s", bpf_image(prog->bf_insns + i, i));
   2403 	}
   2404 	fprintf(out, "\" tooltip=\"");
   2405 	for (i = 0; i < BPF_MEMWORDS; i++)
   2406 		if (block->val[i] != VAL_UNKNOWN)
   2407 			fprintf(out, "val[%d]=%d ", i, block->val[i]);
   2408 	fprintf(out, "val[A]=%d ", block->val[A_ATOM]);
   2409 	fprintf(out, "val[X]=%d", block->val[X_ATOM]);
   2410 	fprintf(out, "\"");
   2411 	if (JT(block) == NULL)
   2412 		fprintf(out, ", peripheries=2");
   2413 	fprintf(out, "];\n");
   2414 
   2415 	dot_dump_node(ic, JT(block), prog, out);
   2416 	dot_dump_node(ic, JF(block), prog, out);
   2417 }
   2418 
   2419 static void
   2420 dot_dump_edge(struct icode *ic, struct block *block, FILE *out)
   2421 {
   2422 	if (block == NULL || isMarked(ic, block))
   2423 		return;
   2424 	Mark(ic, block);
   2425 
   2426 	if (JT(block)) {
   2427 		fprintf(out, "\t\"block%d\":se -> \"block%d\":n [label=\"T\"]; \n",
   2428 				block->id, JT(block)->id);
   2429 		fprintf(out, "\t\"block%d\":sw -> \"block%d\":n [label=\"F\"]; \n",
   2430 			   block->id, JF(block)->id);
   2431 	}
   2432 	dot_dump_edge(ic, JT(block), out);
   2433 	dot_dump_edge(ic, JF(block), out);
   2434 }
   2435 
    2436 /* Output the block CFG in the graphviz/DOT language.
    2437  * For each block, the CFG shows its code, the value index of each register
    2438  * at exit, and the true/false jump relationships between blocks.
   2439  *
   2440  * example DOT for BPF `ip src host 1.1.1.1' is:
   2441     digraph BPF {
   2442     	block0 [shape=ellipse, id="block-0" label="BLOCK0\n\n(000) ldh      [12]\n(001) jeq      #0x800           jt 2	jf 5" tooltip="val[A]=0 val[X]=0"];
   2443     	block1 [shape=ellipse, id="block-1" label="BLOCK1\n\n(002) ld       [26]\n(003) jeq      #0x1010101       jt 4	jf 5" tooltip="val[A]=0 val[X]=0"];
   2444     	block2 [shape=ellipse, id="block-2" label="BLOCK2\n\n(004) ret      #68" tooltip="val[A]=0 val[X]=0", peripheries=2];
   2445     	block3 [shape=ellipse, id="block-3" label="BLOCK3\n\n(005) ret      #0" tooltip="val[A]=0 val[X]=0", peripheries=2];
   2446     	"block0":se -> "block1":n [label="T"];
   2447     	"block0":sw -> "block3":n [label="F"];
   2448     	"block1":se -> "block2":n [label="T"];
   2449     	"block1":sw -> "block3":n [label="F"];
   2450     }
   2451  *
    2452  *  After installing graphviz from http://www.graphviz.org/, save the output as bpf.dot
   2453  *  and run `dot -Tpng -O bpf.dot' to draw the graph.
   2454  */
   2455 static void
   2456 dot_dump(compiler_state_t *cstate, struct icode *ic)
   2457 {
   2458 	struct bpf_program f;
   2459 	FILE *out = stdout;
   2460 
   2461 	memset(bids, 0, sizeof bids);
   2462 	f.bf_insns = icode_to_fcode(cstate, ic, ic->root, &f.bf_len);
   2463 
   2464 	fprintf(out, "digraph BPF {\n");
   2465 	unMarkAll(ic);
   2466 	dot_dump_node(ic, ic->root, &f, out);
   2467 	unMarkAll(ic);
   2468 	dot_dump_edge(ic, ic->root, out);
   2469 	fprintf(out, "}\n");
   2470 
   2471 	free((char *)f.bf_insns);
   2472 }
   2473 
   2474 static void
   2475 plain_dump(compiler_state_t *cstate, struct icode *ic)
   2476 {
   2477 	struct bpf_program f;
   2478 
   2479 	memset(bids, 0, sizeof bids);
   2480 	f.bf_insns = icode_to_fcode(cstate, ic, ic->root, &f.bf_len);
   2481 	bpf_dump(&f, 1);
   2482 	putchar('\n');
   2483 	free((char *)f.bf_insns);
   2484 }
   2485 
   2486 static void
   2487 opt_dump(compiler_state_t *cstate, struct icode *ic)
   2488 {
   2489 	/*
   2490 	 * If the CFG, in DOT format, is requested, output it rather than
   2491 	 * the code that would be generated from that graph.
   2492 	 */
   2493 	if (pcap_print_dot_graph)
   2494 		dot_dump(cstate, ic);
   2495 	else
   2496 		plain_dump(cstate, ic);
   2497 }
   2498 #endif
   2499