/*
 * eBPF kernel space program part
 *
 * Toy eBPF program for demonstration purposes, some parts derived from
 * kernel tree's samples/bpf/sockex2_kern.c example.
 *
 * More background on eBPF, kernel tree: Documentation/networking/filter.txt
 *
 * Note, this file is rather large, and most classifier and actions are
 * likely smaller to accomplish one specific use-case and are tailored
 * for high performance. For performance reasons, you might also have the
 * classifier and action already merged inside the classifier.
 *
 * In order to show various features it serves as a bigger programming
 * example, which you should feel free to rip apart and experiment with.
 *
 * Compilation, configuration example:
 *
 *  Note: as long as the BPF backend in LLVM is still experimental,
 *  you need to build LLVM with --enable-experimental-targets=BPF
 *  Also, make sure your 4.1+ kernel is compiled with CONFIG_BPF_SYSCALL=y,
 *  and you have libelf.h and gelf.h headers and can link tc against -lelf.
 *
 *  In case you need to sync kernel headers, go to your kernel source tree:
 *  # make headers_install INSTALL_HDR_PATH=/usr/
 *
 *  $ export PATH=/home/<...>/llvm/Debug+Asserts/bin/:$PATH
 *  $ clang -O2 -emit-llvm -c bpf_prog.c -o - | llc -march=bpf -filetype=obj -o bpf.o
 *  $ objdump -h bpf.o
 *  [...]
 *  3 classifier    000007f8  0000000000000000  0000000000000000  00000040  2**3
 *                  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
 *  4 action-mark   00000088  0000000000000000  0000000000000000  00000838  2**3
 *                  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
 *  5 action-rand   00000098  0000000000000000  0000000000000000  000008c0  2**3
 *                  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
 *  6 maps          00000030  0000000000000000  0000000000000000  00000958  2**2
 *                  CONTENTS, ALLOC, LOAD, DATA
 *  7 license       00000004  0000000000000000  0000000000000000  00000988  2**0
 *                  CONTENTS, ALLOC, LOAD, DATA
 *  [...]
 * # echo 1 > /proc/sys/net/core/bpf_jit_enable
 * $ gcc bpf_agent.c -o bpf_agent -Wall -O2
 * # ./bpf_agent /tmp/bpf-uds (e.g. on a different terminal)
 * # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
 *      action bpf obj bpf.o sec action-mark \
 *      action bpf obj bpf.o sec action-rand ok
 * # tc filter show dev em1
 * filter parent 1: protocol all pref 49152 bpf
 * filter parent 1: protocol all pref 49152 bpf handle 0x1 flowid 1:1 bpf.o:[classifier]
 *   action order 1: bpf bpf.o:[action-mark] default-action pipe
 *   index 52 ref 1 bind 1
 *
 *   action order 2: bpf bpf.o:[action-rand] default-action pipe
 *   index 53 ref 1 bind 1
 *
 *   action order 3: gact action pass
 *   random type none pass val 0
 *   index 38 ref 1 bind 1
 *
 * The same program can also be installed on ingress side (as opposed to above
 * egress configuration), e.g.:
 *
 * # tc qdisc add dev em1 handle ffff: ingress
 * # tc filter add dev em1 parent ffff: bpf obj ...
 *
 * Notes on BPF agent:
 *
 * In the above example, the bpf_agent creates the unix domain socket
 * natively. "tc exec" can also spawn a shell and hold the sockets there:
 *
 * # tc exec bpf imp /tmp/bpf-uds
 * # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
 *      action bpf obj bpf.o sec action-mark \
 *      action bpf obj bpf.o sec action-rand ok
 * sh-4.2# (shell spawned from tc exec)
 * sh-4.2# bpf_agent
 * [...]
 *
 * This will read out fds over environment and produce the same data dump
 * as below. This has the advantage that the spawned shell owns the fds
 * and thus if the agent is restarted, it can reattach to the same fds, also
 * various programs can easily read/modify the data simultaneously from user
 * space side.
85 * 86 * If the shell is unnecessary, the agent can also just be spawned directly 87 * via tc exec: 88 * 89 * # tc exec bpf imp /tmp/bpf-uds run bpf_agent 90 * # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \ 91 * action bpf obj bpf.o sec action-mark \ 92 * action bpf obj bpf.o sec action-rand ok 93 * 94 * BPF agent example output: 95 * 96 * ver: 1 97 * obj: bpf.o 98 * dev: 64770 99 * ino: 6045133 100 * maps: 3 101 * map0: 102 * `- fd: 4 103 * | serial: 1 104 * | type: 1 105 * | max elem: 256 106 * | size key: 1 107 * ` size val: 16 108 * map1: 109 * `- fd: 5 110 * | serial: 2 111 * | type: 1 112 * | max elem: 1024 113 * | size key: 4 114 * ` size val: 16 115 * map2: 116 * `- fd: 6 117 * | serial: 3 118 * | type: 2 119 * | max elem: 64 120 * | size key: 4 121 * ` size val: 8 122 * data, period: 5sec 123 * `- number of drops: cpu0: 0 cpu1: 0 cpu2: 0 cpu3: 0 124 * | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 0, mis: 0] q3:[pkts: 0, mis: 0] 125 * ` protos: tcp:[pkts: 0, bytes: 0] udp:[pkts: 0, bytes: 0] icmp:[pkts: 0, bytes: 0] 126 * data, period: 5sec 127 * `- number of drops: cpu0: 5 cpu1: 0 cpu2: 0 cpu3: 1 128 * | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 24, mis: 14] q3:[pkts: 0, mis: 0] 129 * ` protos: tcp:[pkts: 13, bytes: 1989] udp:[pkts: 10, bytes: 710] icmp:[pkts: 0, bytes: 0] 130 * data, period: 5sec 131 * `- number of drops: cpu0: 5 cpu1: 0 cpu2: 3 cpu3: 3 132 * | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 39, mis: 21] q3:[pkts: 0, mis: 0] 133 * ` protos: tcp:[pkts: 20, bytes: 3549] udp:[pkts: 18, bytes: 1278] icmp:[pkts: 0, bytes: 0] 134 * [...] 135 * 136 * This now means, the below classifier and action pipeline has been loaded 137 * as eBPF bytecode into the kernel, the kernel has verified that the 138 * execution of the bytecode is "safe", and it has JITed the programs 139 * afterwards, so that upon invocation they're running on native speed. 
tc 140 * has transferred all map file descriptors to the bpf_agent via IPC and 141 * even after tc exits, the agent can read out or modify all map data. 142 * 143 * Note that the export to the uds is done only once in the classifier and 144 * not in the action. It's enough to export the (here) shared descriptors 145 * once. 146 * 147 * If you need to disassemble the generated JIT image (echo with 2), the 148 * kernel tree has under tools/net/ a small helper, you can invoke e.g. 149 * `bpf_jit_disasm -o`. 150 * 151 * Please find in the code below further comments. 152 * 153 * -- Happy eBPF hacking! ;) 154 */ 155 #include <stdint.h> 156 #include <stdbool.h> 157 #include <sys/types.h> 158 #include <sys/socket.h> 159 #include <asm/types.h> 160 #include <linux/in.h> 161 #include <linux/if.h> 162 #include <linux/if_ether.h> 163 #include <linux/ip.h> 164 #include <linux/ipv6.h> 165 #include <linux/if_tunnel.h> 166 #include <linux/filter.h> 167 #include <linux/bpf.h> 168 169 /* Common, shared definitions with ebpf_agent.c. */ 170 #include "bpf_shared.h" 171 /* BPF helper functions for our example. */ 172 #include "../../include/bpf_api.h" 173 174 /* Could be defined here as well, or included from the header. */ 175 #define TC_ACT_UNSPEC (-1) 176 #define TC_ACT_OK 0 177 #define TC_ACT_RECLASSIFY 1 178 #define TC_ACT_SHOT 2 179 #define TC_ACT_PIPE 3 180 #define TC_ACT_STOLEN 4 181 #define TC_ACT_QUEUED 5 182 #define TC_ACT_REPEAT 6 183 184 /* Other, misc stuff. */ 185 #define IP_MF 0x2000 186 #define IP_OFFSET 0x1FFF 187 188 /* eBPF map definitions, all placed in section "maps". 
*/ 189 struct bpf_elf_map __section("maps") map_proto = { 190 .type = BPF_MAP_TYPE_HASH, 191 .id = BPF_MAP_ID_PROTO, 192 .size_key = sizeof(uint8_t), 193 .size_value = sizeof(struct count_tuple), 194 .max_elem = 256, 195 }; 196 197 struct bpf_elf_map __section("maps") map_queue = { 198 .type = BPF_MAP_TYPE_HASH, 199 .id = BPF_MAP_ID_QUEUE, 200 .size_key = sizeof(uint32_t), 201 .size_value = sizeof(struct count_queue), 202 .max_elem = 1024, 203 }; 204 205 struct bpf_elf_map __section("maps") map_drops = { 206 .type = BPF_MAP_TYPE_ARRAY, 207 .id = BPF_MAP_ID_DROPS, 208 .size_key = sizeof(uint32_t), 209 .size_value = sizeof(long), 210 .max_elem = 64, 211 }; 212 213 /* Helper functions and definitions for the flow dissector used by the 214 * example classifier. This resembles the kernel's flow dissector to 215 * some extend and is just used as an example to show what's possible 216 * with eBPF. 217 */ 218 struct sockaddr; 219 220 struct vlan_hdr { 221 __be16 h_vlan_TCI; 222 __be16 h_vlan_encapsulated_proto; 223 }; 224 225 struct flow_keys { 226 __u32 src; 227 __u32 dst; 228 union { 229 __u32 ports; 230 __u16 port16[2]; 231 }; 232 __s32 th_off; 233 __u8 ip_proto; 234 }; 235 236 static inline int flow_ports_offset(__u8 ip_proto) 237 { 238 switch (ip_proto) { 239 case IPPROTO_TCP: 240 case IPPROTO_UDP: 241 case IPPROTO_DCCP: 242 case IPPROTO_ESP: 243 case IPPROTO_SCTP: 244 case IPPROTO_UDPLITE: 245 default: 246 return 0; 247 case IPPROTO_AH: 248 return 4; 249 } 250 } 251 252 static inline bool flow_is_frag(struct __sk_buff *skb, int nh_off) 253 { 254 return !!(load_half(skb, nh_off + offsetof(struct iphdr, frag_off)) & 255 (IP_MF | IP_OFFSET)); 256 } 257 258 static inline int flow_parse_ipv4(struct __sk_buff *skb, int nh_off, 259 __u8 *ip_proto, struct flow_keys *flow) 260 { 261 __u8 ip_ver_len; 262 263 if (unlikely(flow_is_frag(skb, nh_off))) 264 *ip_proto = 0; 265 else 266 *ip_proto = load_byte(skb, nh_off + offsetof(struct iphdr, 267 protocol)); 268 if (*ip_proto != 
IPPROTO_GRE) { 269 flow->src = load_word(skb, nh_off + offsetof(struct iphdr, saddr)); 270 flow->dst = load_word(skb, nh_off + offsetof(struct iphdr, daddr)); 271 } 272 273 ip_ver_len = load_byte(skb, nh_off + 0 /* offsetof(struct iphdr, ihl) */); 274 if (likely(ip_ver_len == 0x45)) 275 nh_off += 20; 276 else 277 nh_off += (ip_ver_len & 0xF) << 2; 278 279 return nh_off; 280 } 281 282 static inline __u32 flow_addr_hash_ipv6(struct __sk_buff *skb, int off) 283 { 284 __u32 w0 = load_word(skb, off); 285 __u32 w1 = load_word(skb, off + sizeof(w0)); 286 __u32 w2 = load_word(skb, off + sizeof(w0) * 2); 287 __u32 w3 = load_word(skb, off + sizeof(w0) * 3); 288 289 return w0 ^ w1 ^ w2 ^ w3; 290 } 291 292 static inline int flow_parse_ipv6(struct __sk_buff *skb, int nh_off, 293 __u8 *ip_proto, struct flow_keys *flow) 294 { 295 *ip_proto = load_byte(skb, nh_off + offsetof(struct ipv6hdr, nexthdr)); 296 297 flow->src = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, saddr)); 298 flow->dst = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, daddr)); 299 300 return nh_off + sizeof(struct ipv6hdr); 301 } 302 303 static inline bool flow_dissector(struct __sk_buff *skb, 304 struct flow_keys *flow) 305 { 306 int poff, nh_off = BPF_LL_OFF + ETH_HLEN; 307 __be16 proto = skb->protocol; 308 __u8 ip_proto; 309 310 /* TODO: check for skb->vlan_tci, skb->vlan_proto first */ 311 if (proto == htons(ETH_P_8021AD)) { 312 proto = load_half(skb, nh_off + 313 offsetof(struct vlan_hdr, h_vlan_encapsulated_proto)); 314 nh_off += sizeof(struct vlan_hdr); 315 } 316 if (proto == htons(ETH_P_8021Q)) { 317 proto = load_half(skb, nh_off + 318 offsetof(struct vlan_hdr, h_vlan_encapsulated_proto)); 319 nh_off += sizeof(struct vlan_hdr); 320 } 321 322 if (likely(proto == htons(ETH_P_IP))) 323 nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow); 324 else if (proto == htons(ETH_P_IPV6)) 325 nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow); 326 else 327 return false; 328 329 switch 
(ip_proto) { 330 case IPPROTO_GRE: { 331 struct gre_hdr { 332 __be16 flags; 333 __be16 proto; 334 }; 335 336 __u16 gre_flags = load_half(skb, nh_off + 337 offsetof(struct gre_hdr, flags)); 338 __u16 gre_proto = load_half(skb, nh_off + 339 offsetof(struct gre_hdr, proto)); 340 341 if (gre_flags & (GRE_VERSION | GRE_ROUTING)) 342 break; 343 344 nh_off += 4; 345 if (gre_flags & GRE_CSUM) 346 nh_off += 4; 347 if (gre_flags & GRE_KEY) 348 nh_off += 4; 349 if (gre_flags & GRE_SEQ) 350 nh_off += 4; 351 352 if (gre_proto == ETH_P_8021Q) { 353 gre_proto = load_half(skb, nh_off + 354 offsetof(struct vlan_hdr, 355 h_vlan_encapsulated_proto)); 356 nh_off += sizeof(struct vlan_hdr); 357 } 358 if (gre_proto == ETH_P_IP) 359 nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow); 360 else if (gre_proto == ETH_P_IPV6) 361 nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow); 362 else 363 return false; 364 break; 365 } 366 case IPPROTO_IPIP: 367 nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow); 368 break; 369 case IPPROTO_IPV6: 370 nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow); 371 default: 372 break; 373 } 374 375 nh_off += flow_ports_offset(ip_proto); 376 377 flow->ports = load_word(skb, nh_off); 378 flow->th_off = nh_off; 379 flow->ip_proto = ip_proto; 380 381 return true; 382 } 383 384 static inline void cls_update_proto_map(const struct __sk_buff *skb, 385 const struct flow_keys *flow) 386 { 387 uint8_t proto = flow->ip_proto; 388 struct count_tuple *ct, _ct; 389 390 ct = map_lookup_elem(&map_proto, &proto); 391 if (likely(ct)) { 392 lock_xadd(&ct->packets, 1); 393 lock_xadd(&ct->bytes, skb->len); 394 return; 395 } 396 397 /* No hit yet, we need to create a new entry. 
*/ 398 _ct.packets = 1; 399 _ct.bytes = skb->len; 400 401 map_update_elem(&map_proto, &proto, &_ct, BPF_ANY); 402 } 403 404 static inline void cls_update_queue_map(const struct __sk_buff *skb) 405 { 406 uint32_t queue = skb->queue_mapping; 407 struct count_queue *cq, _cq; 408 bool mismatch; 409 410 mismatch = skb->queue_mapping != get_smp_processor_id(); 411 412 cq = map_lookup_elem(&map_queue, &queue); 413 if (likely(cq)) { 414 lock_xadd(&cq->total, 1); 415 if (mismatch) 416 lock_xadd(&cq->mismatch, 1); 417 return; 418 } 419 420 /* No hit yet, we need to create a new entry. */ 421 _cq.total = 1; 422 _cq.mismatch = mismatch ? 1 : 0; 423 424 map_update_elem(&map_queue, &queue, &_cq, BPF_ANY); 425 } 426 427 /* eBPF program definitions, placed in various sections, which can 428 * have custom section names. If custom names are in use, it's 429 * required to point tc to the correct section, e.g. 430 * 431 * tc filter add [...] bpf obj cls.o sec cls-tos [...] 432 * 433 * in case the program resides in __section("cls-tos"). 434 * 435 * Default section for cls_bpf is: "classifier", for act_bpf is: 436 * "action". Naturally, if for example multiple actions are present 437 * in the same file, they need to have distinct section names. 438 * 439 * It is however not required to have multiple programs sharing 440 * a file. 441 */ 442 __section("classifier") 443 int cls_main(struct __sk_buff *skb) 444 { 445 struct flow_keys flow; 446 447 if (!flow_dissector(skb, &flow)) 448 return 0; /* No match in cls_bpf. */ 449 450 cls_update_proto_map(skb, &flow); 451 cls_update_queue_map(skb); 452 453 return flow.ip_proto; 454 } 455 456 static inline void act_update_drop_map(void) 457 { 458 uint32_t *count, cpu = get_smp_processor_id(); 459 460 count = map_lookup_elem(&map_drops, &cpu); 461 if (count) 462 /* Only this cpu is accessing this element. 
*/ 463 (*count)++; 464 } 465 466 __section("action-mark") 467 int act_mark_main(struct __sk_buff *skb) 468 { 469 /* You could also mangle skb data here with the helper function 470 * BPF_FUNC_skb_store_bytes, etc. Or, alternatively you could 471 * do that already in the classifier itself as a merged combination 472 * of classifier'n'action model. 473 */ 474 475 if (skb->mark == 0xcafe) { 476 act_update_drop_map(); 477 return TC_ACT_SHOT; 478 } 479 480 /* Default configured tc opcode. */ 481 return TC_ACT_UNSPEC; 482 } 483 484 __section("action-rand") 485 int act_rand_main(struct __sk_buff *skb) 486 { 487 /* Sorry, we're near event horizon ... */ 488 if ((get_prandom_u32() & 3) == 0) { 489 act_update_drop_map(); 490 return TC_ACT_SHOT; 491 } 492 493 return TC_ACT_UNSPEC; 494 } 495 496 /* Last but not least, the file contains a license. Some future helper 497 * functions may only be available with a GPL license. 498 */ 499 BPF_LICENSE("GPL"); 500