/*
 * eBPF kernel space program part
 *
 * Toy eBPF program for demonstration purposes, some parts derived from
 * kernel tree's samples/bpf/sockex2_kern.c example.
 *
 * More background on eBPF, kernel tree: Documentation/networking/filter.txt
 *
 * Note, this file is rather large; most classifiers and actions are
 * likely much smaller, accomplish one specific use-case and are tailored
 * for high performance. For performance reasons, you might also merge
 * the action logic directly into the classifier.
 *
 * In order to show various features it serves as a bigger programming
 * example, which you should feel free to rip apart and experiment with.
 *
 * Compilation, configuration example:
 *
 *  Note: as long as the BPF backend in LLVM is still experimental,
 *  you need to build LLVM with --enable-experimental-targets=BPF
 *  Also, make sure your 4.1+ kernel is compiled with CONFIG_BPF_SYSCALL=y,
 *  and you have libelf.h and gelf.h headers and can link tc against -lelf.
 *
 *  In case you need to sync kernel headers, go to your kernel source tree:
 *  # make headers_install INSTALL_HDR_PATH=/usr/
 *
 *  $ export PATH=/home/<...>/llvm/Debug+Asserts/bin/:$PATH
 *  $ clang -O2 -emit-llvm -c bpf_prog.c -o - | llc -march=bpf -filetype=obj -o bpf.o
 *  $ objdump -h bpf.o
 *  [...]
 *  3 classifier    000007f8  0000000000000000  0000000000000000  00000040  2**3
 *                  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
 *  4 action-mark   00000088  0000000000000000  0000000000000000  00000838  2**3
 *                  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
 *  5 action-rand   00000098  0000000000000000  0000000000000000  000008c0  2**3
 *                  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
 *  6 maps          00000030  0000000000000000  0000000000000000  00000958  2**2
 *                  CONTENTS, ALLOC, LOAD, DATA
 *  7 license       00000004  0000000000000000  0000000000000000  00000988  2**0
 *                  CONTENTS, ALLOC, LOAD, DATA
 *  [...]
 *  # echo 1 > /proc/sys/net/core/bpf_jit_enable
 *  $ gcc bpf_agent.c -o bpf_agent -Wall -O2
 *  # ./bpf_agent /tmp/bpf-uds      (e.g. on a different terminal)
 *  # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
 *                             action bpf obj bpf.o sec action-mark            \
 *                             action bpf obj bpf.o sec action-rand ok
 *  # tc filter show dev em1
 *  filter parent 1: protocol all pref 49152 bpf
 *  filter parent 1: protocol all pref 49152 bpf handle 0x1 flowid 1:1 bpf.o:[classifier]
 *    action order 1: bpf bpf.o:[action-mark] default-action pipe
 *    index 52 ref 1 bind 1
 *
 *    action order 2: bpf bpf.o:[action-rand] default-action pipe
 *    index 53 ref 1 bind 1
 *
 *    action order 3: gact action pass
 *    random type none pass val 0
 *    index 38 ref 1 bind 1
 *
 * The same program can also be installed on the ingress side (as opposed
 * to the above egress configuration), e.g.:
 *
 * # tc qdisc add dev em1 handle ffff: ingress
 * # tc filter add dev em1 parent ffff: bpf obj ...
 *
 * Notes on BPF agent:
 *
 * In the above example, the bpf_agent creates the unix domain socket
 * natively. "tc exec" can also spawn a shell and hold the sockets there:
 *
 *  # tc exec bpf imp /tmp/bpf-uds
 *  # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
 *                             action bpf obj bpf.o sec action-mark            \
 *                             action bpf obj bpf.o sec action-rand ok
 *  sh-4.2# (shell spawned from tc exec)
 *  sh-4.2# bpf_agent
 *  [...]
 *
 * This will read out the fds passed via the environment and produce the
 * same data dump as below. This has the advantage that the spawned shell
 * owns the fds, so if the agent is restarted, it can reattach to the same
 * fds; also, various programs can easily read/modify the data from user
 * space simultaneously.
 *
 * If the shell is unnecessary, the agent can also just be spawned directly
 * via tc exec:
 *
 *  # tc exec bpf imp /tmp/bpf-uds run bpf_agent
 *  # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
 *                             action bpf obj bpf.o sec action-mark            \
 *                             action bpf obj bpf.o sec action-rand ok
 *
 * BPF agent example output:
 *
 * ver: 1
 * obj: bpf.o
 * dev: 64770
 * ino: 6045133
 * maps: 3
 * map0:
 *  `- fd: 4
 *   | serial: 1
 *   | type: 1
 *   | max elem: 256
 *   | size key: 1
 *   ` size val: 16
 * map1:
 *  `- fd: 5
 *   | serial: 2
 *   | type: 1
 *   | max elem: 1024
 *   | size key: 4
 *   ` size val: 16
 * map2:
 *  `- fd: 6
 *   | serial: 3
 *   | type: 2
 *   | max elem: 64
 *   | size key: 4
 *   ` size val: 8
 * data, period: 5sec
 *  `- number of drops:	cpu0:     0	cpu1:     0	cpu2:     0	cpu3:     0
 *   | nic queues:	q0:[pkts: 0, mis: 0]	q1:[pkts: 0, mis: 0]	q2:[pkts: 0, mis: 0]	q3:[pkts: 0, mis: 0]
 *   ` protos:	tcp:[pkts: 0, bytes: 0]	udp:[pkts: 0, bytes: 0]	icmp:[pkts: 0, bytes: 0]
 * data, period: 5sec
 *  `- number of drops:	cpu0:     5	cpu1:     0	cpu2:     0	cpu3:     1
 *   | nic queues:	q0:[pkts: 0, mis: 0]	q1:[pkts: 0, mis: 0]	q2:[pkts: 24, mis: 14]	q3:[pkts: 0, mis: 0]
 *   ` protos:	tcp:[pkts: 13, bytes: 1989]	udp:[pkts: 10, bytes: 710]	icmp:[pkts: 0, bytes: 0]
 * data, period: 5sec
 *  `- number of drops:	cpu0:     5	cpu1:     0	cpu2:     3	cpu3:     3
 *   | nic queues:	q0:[pkts: 0, mis: 0]	q1:[pkts: 0, mis: 0]	q2:[pkts: 39, mis: 21]	q3:[pkts: 0, mis: 0]
 *   ` protos:	tcp:[pkts: 20, bytes: 3549]	udp:[pkts: 18, bytes: 1278]	icmp:[pkts: 0, bytes: 0]
 * [...]
 *
 * This now means that the classifier and action pipeline below has been
 * loaded as eBPF bytecode into the kernel, the kernel has verified that
 * executing the bytecode is "safe", and it has JITed the programs
 * afterwards, so that upon invocation they run at native speed. tc has
 * transferred all map file descriptors to the bpf_agent via IPC, and even
 * after tc exits, the agent can still read out or modify all map data.
 *
 * Note that the export to the uds is done only once in the classifier and
 * not in the action. It's enough to export the (here) shared descriptors
 * once.
 *
 * If you need to disassemble the generated JIT image (write 2 into
 * bpf_jit_enable to get a debug dump), the kernel tree has a small helper
 * under tools/net/; you can invoke e.g. `bpf_jit_disasm -o`.
 *
 * Please find in the code below further comments.
 *
 *   -- Happy eBPF hacking! ;)
 */
#include <stdint.h>
#include <stdbool.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <asm/types.h>
#include <linux/in.h>
#include <linux/if.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/if_tunnel.h>
#include <linux/filter.h>
#include <linux/bpf.h>

/* Common, shared definitions with bpf_agent.c. */
#include "bpf_shared.h"
/* BPF helper functions for our example. */
#include "../../include/bpf_api.h"

/* Could be defined here as well, or included from <linux/pkt_cls.h>. */
#define TC_ACT_UNSPEC		(-1)
#define TC_ACT_OK		0
#define TC_ACT_RECLASSIFY	1
#define TC_ACT_SHOT		2
#define TC_ACT_PIPE		3
#define TC_ACT_STOLEN		4
#define TC_ACT_QUEUED		5
#define TC_ACT_REPEAT		6

/* Other, misc stuff. */
#define IP_MF			0x2000
#define IP_OFFSET		0x1FFF

/* eBPF map definitions, all placed in section "maps". */
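/*
 * map_proto: hash map keyed by IP protocol, counting packets and bytes
 *            per protocol (struct count_tuple).
 * map_queue: hash map keyed by skb->queue_mapping, counting packets and
 *            queue vs. CPU mismatches per queue (struct count_queue).
 * map_drops: array map with one drop counter slot per CPU, incremented
 *            by the actions further below.
 */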
struct bpf_elf_map __section("maps") map_proto = {
	.type		=	BPF_MAP_TYPE_HASH,
	.id		=	BPF_MAP_ID_PROTO,
	.size_key	=	sizeof(uint8_t),
	.size_value	=	sizeof(struct count_tuple),
	.max_elem	=	256,
};

struct bpf_elf_map __section("maps") map_queue = {
	.type		=	BPF_MAP_TYPE_HASH,
	.id		=	BPF_MAP_ID_QUEUE,
	.size_key	=	sizeof(uint32_t),
	.size_value	=	sizeof(struct count_queue),
	.max_elem	=	1024,
};

struct bpf_elf_map __section("maps") map_drops = {
	.type		=	BPF_MAP_TYPE_ARRAY,
	.id		=	BPF_MAP_ID_DROPS,
	.size_key	=	sizeof(uint32_t),
	.size_value	=	sizeof(long),
	.max_elem	=	64,
};

/* Helper functions and definitions for the flow dissector used by the
 * example classifier. This resembles the kernel's flow dissector to
 * some extent and is just used as an example to show what's possible
 * with eBPF.
 */
struct sockaddr;

struct vlan_hdr {
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

struct flow_keys {
	__u32 src;
	__u32 dst;
	union {
		__u32 ports;
		__u16 port16[2];
	};
	__s32 th_off;
	__u8 ip_proto;
};

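/* Returns the offset of the port numbers (or the SPI in case of AH)
 * relative to the start of the transport header for the given protocol.
 */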
static inline int flow_ports_offset(__u8 ip_proto)
{
	switch (ip_proto) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_DCCP:
	case IPPROTO_ESP:
	case IPPROTO_SCTP:
	case IPPROTO_UDPLITE:
	default:
		return 0;
	case IPPROTO_AH:
		return 4;
	}
}

static inline bool flow_is_frag(struct __sk_buff *skb, int nh_off)
{
	return !!(load_half(skb, nh_off + offsetof(struct iphdr, frag_off)) &
		  (IP_MF | IP_OFFSET));
}

static inline int flow_parse_ipv4(struct __sk_buff *skb, int nh_off,
				  __u8 *ip_proto, struct flow_keys *flow)
{
	__u8 ip_ver_len;

	if (unlikely(flow_is_frag(skb, nh_off)))
		*ip_proto = 0;
	else
		*ip_proto = load_byte(skb, nh_off + offsetof(struct iphdr,
							     protocol));
	if (*ip_proto != IPPROTO_GRE) {
		flow->src = load_word(skb, nh_off + offsetof(struct iphdr, saddr));
		flow->dst = load_word(skb, nh_off + offsetof(struct iphdr, daddr));
	}

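	/* Fast path: version 4 with header length 5 (0x45) means a plain
	 * 20 byte IPv4 header; otherwise derive the length from IHL.
	 */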
	ip_ver_len = load_byte(skb, nh_off + 0 /* offsetof(struct iphdr, ihl) */);
	if (likely(ip_ver_len == 0x45))
		nh_off += 20;
	else
		nh_off += (ip_ver_len & 0xF) << 2;

	return nh_off;
}

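/* Fold the 128 bit IPv6 address at the given offset into a 32 bit value
 * by XORing its four 32 bit words.
 */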
static inline __u32 flow_addr_hash_ipv6(struct __sk_buff *skb, int off)
{
	__u32 w0 = load_word(skb, off);
	__u32 w1 = load_word(skb, off + sizeof(w0));
	__u32 w2 = load_word(skb, off + sizeof(w0) * 2);
	__u32 w3 = load_word(skb, off + sizeof(w0) * 3);

	return w0 ^ w1 ^ w2 ^ w3;
}

static inline int flow_parse_ipv6(struct __sk_buff *skb, int nh_off,
				  __u8 *ip_proto, struct flow_keys *flow)
{
	*ip_proto = load_byte(skb, nh_off + offsetof(struct ipv6hdr, nexthdr));

	flow->src = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, saddr));
	flow->dst = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, daddr));

	return nh_off + sizeof(struct ipv6hdr);
}

static inline bool flow_dissector(struct __sk_buff *skb,
				  struct flow_keys *flow)
{
	int nh_off = BPF_LL_OFF + ETH_HLEN;
	__be16 proto = skb->protocol;
	__u8 ip_proto;

	/* TODO: check for skb->vlan_tci, skb->vlan_proto first */
	if (proto == htons(ETH_P_8021AD)) {
		proto = load_half(skb, nh_off +
				  offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
		nh_off += sizeof(struct vlan_hdr);
	}
	if (proto == htons(ETH_P_8021Q)) {
		proto = load_half(skb, nh_off +
				  offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
		nh_off += sizeof(struct vlan_hdr);
	}

	if (likely(proto == htons(ETH_P_IP)))
		nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
	else if (proto == htons(ETH_P_IPV6))
		nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
	else
		return false;

	switch (ip_proto) {
	case IPPROTO_GRE: {
		struct gre_hdr {
			__be16 flags;
			__be16 proto;
		};

		__u16 gre_flags = load_half(skb, nh_off +
					    offsetof(struct gre_hdr, flags));
		__u16 gre_proto = load_half(skb, nh_off +
					    offsetof(struct gre_hdr, proto));

		if (gre_flags & (GRE_VERSION | GRE_ROUTING))
			break;

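		/* Skip the 4 byte base GRE header plus any optional
		 * checksum, key and sequence number fields that the
		 * flags announce.
		 */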
		nh_off += 4;
		if (gre_flags & GRE_CSUM)
			nh_off += 4;
		if (gre_flags & GRE_KEY)
			nh_off += 4;
		if (gre_flags & GRE_SEQ)
			nh_off += 4;

		if (gre_proto == ETH_P_8021Q) {
			gre_proto = load_half(skb, nh_off +
					      offsetof(struct vlan_hdr,
						       h_vlan_encapsulated_proto));
			nh_off += sizeof(struct vlan_hdr);
		}
		if (gre_proto == ETH_P_IP)
			nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
		else if (gre_proto == ETH_P_IPV6)
			nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
		else
			return false;
		break;
	}
	case IPPROTO_IPIP:
		nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
		break;
	case IPPROTO_IPV6:
		nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
		break;
	default:
		break;
	}

	nh_off += flow_ports_offset(ip_proto);

	flow->ports = load_word(skb, nh_off);
	flow->th_off = nh_off;
	flow->ip_proto = ip_proto;

	return true;
}

static inline void cls_update_proto_map(const struct __sk_buff *skb,
					const struct flow_keys *flow)
{
	uint8_t proto = flow->ip_proto;
	struct count_tuple *ct, _ct;

	ct = map_lookup_elem(&map_proto, &proto);
	if (likely(ct)) {
		lock_xadd(&ct->packets, 1);
		lock_xadd(&ct->bytes, skb->len);
		return;
	}

	/* No hit yet, we need to create a new entry. */
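	/* Another CPU might race us here and insert the entry first; with
	 * BPF_ANY the update below then simply overwrites it, which is
	 * acceptable for this demo's statistics.
	 */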
	_ct.packets = 1;
	_ct.bytes = skb->len;

	map_update_elem(&map_proto, &proto, &_ct, BPF_ANY);
}

static inline void cls_update_queue_map(const struct __sk_buff *skb)
{
	uint32_t queue = skb->queue_mapping;
	struct count_queue *cq, _cq;
	bool mismatch;

	mismatch = skb->queue_mapping != get_smp_processor_id();

	cq = map_lookup_elem(&map_queue, &queue);
	if (likely(cq)) {
		lock_xadd(&cq->total, 1);
		if (mismatch)
			lock_xadd(&cq->mismatch, 1);
		return;
	}

	/* No hit yet, we need to create a new entry. */
	_cq.total = 1;
	_cq.mismatch = mismatch ? 1 : 0;

	map_update_elem(&map_queue, &queue, &_cq, BPF_ANY);
}

/* eBPF program definitions, placed in various sections, which can
 * have custom section names. If custom names are in use, it's
 * required to point tc to the correct section, e.g.
 *
 *     tc filter add [...] bpf obj cls.o sec cls-tos [...]
 *
 * in case the program resides in __section("cls-tos").
 *
 * Default section for cls_bpf is: "classifier", for act_bpf is:
 * "action". Naturally, if for example multiple actions are present
 * in the same file, they need to have distinct section names.
 *
 * It is however not required to have multiple programs sharing
 * a file.
 */
__section("classifier")
int cls_main(struct __sk_buff *skb)
{
	struct flow_keys flow;

	if (!flow_dissector(skb, &flow))
		return 0; /* No match in cls_bpf. */

	cls_update_proto_map(skb, &flow);
	cls_update_queue_map(skb);

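	/* Any non-zero return value signals a match to cls_bpf; here we
	 * simply hand back the dissected IP protocol number.
	 */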
	return flow.ip_proto;
}

static inline void act_update_drop_map(void)
{
	uint32_t cpu = get_smp_processor_id();
	long *count;

	count = map_lookup_elem(&map_drops, &cpu);
	if (count)
		/* Only this cpu is accessing this element. */
		(*count)++;
}

__section("action-mark")
int act_mark_main(struct __sk_buff *skb)
{
	/* You could also mangle skb data here with the helper function
	 * BPF_FUNC_skb_store_bytes, etc. Or, alternatively you could
	 * do that already in the classifier itself as a merged combination
	 * of classifier'n'action model.
	 */

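	/* The mark could have been set elsewhere, e.g. via netfilter's MARK
	 * target (iptables -t mangle ... -j MARK --set-mark 0xcafe) or by
	 * another eBPF program writing to skb->mark.
	 */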
	if (skb->mark == 0xcafe) {
		act_update_drop_map();
		return TC_ACT_SHOT;
	}

	/* Default configured tc opcode. */
	return TC_ACT_UNSPEC;
}

__section("action-rand")
int act_rand_main(struct __sk_buff *skb)
{
	/* Sorry, we're near event horizon ... */
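	/* (get_prandom_u32() & 3) == 0 holds for roughly every fourth
	 * packet, so about 25% of the traffic is dropped at random.
	 */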
	if ((get_prandom_u32() & 3) == 0) {
		act_update_drop_map();
		return TC_ACT_SHOT;
	}

	return TC_ACT_UNSPEC;
}

/* Last but not least, the file contains a license. Some future helper
 * functions may only be available with a GPL license.
 */
BPF_LICENSE("GPL");