/*
 * Copyright (c) 2015 PLUMgrid, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <limits.h>
#include <linux/bpf.h>
#include <linux/bpf_common.h>
#include <linux/if_packet.h>
#include <linux/perf_event.h>
#include <linux/pkt_cls.h>
#include <linux/rtnetlink.h>
#include <linux/sched.h>
#include <linux/unistd.h>
#include <linux/version.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <linux/if_alg.h>

#include "libbpf.h"
#include "perf_reader.h"

// TODO: Remove this when CentOS 6 support is not needed anymore
#include "setns.h"

// TODO: remove these defines when linux-libc-dev exports them properly

#ifndef __NR_bpf
#if defined(__powerpc64__)
#define __NR_bpf 361
#elif defined(__s390x__)
#define __NR_bpf 351
#elif defined(__aarch64__)
#define __NR_bpf 280
#else
#define __NR_bpf 321
#endif
#endif

#ifndef SO_ATTACH_BPF
#define SO_ATTACH_BPF 50
#endif

#ifndef PERF_EVENT_IOC_SET_BPF
#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32)
#endif

#ifndef PERF_FLAG_FD_CLOEXEC
#define PERF_FLAG_FD_CLOEXEC (1UL << 3)
#endif

// TODO: Remove this when CentOS 6 support is not needed anymore
#ifndef AF_ALG
#define AF_ALG 38
#endif

#define min(x, y) ((x) < (y) ? (x) : (y))
struct bpf_helper {
  char *name;
  char *required_version;
};

static struct bpf_helper helpers[] = {
  {"map_lookup_elem", "3.19"},
  {"map_update_elem", "3.19"},
  {"map_delete_elem", "3.19"},
  {"probe_read", "4.1"},
  {"ktime_get_ns", "4.1"},
  {"trace_printk", "4.1"},
  {"get_prandom_u32", "4.1"},
  {"get_smp_processor_id", "4.1"},
  {"skb_store_bytes", "4.1"},
  {"l3_csum_replace", "4.1"},
  {"l4_csum_replace", "4.1"},
  {"tail_call", "4.2"},
  {"clone_redirect", "4.2"},
  {"get_current_pid_tgid", "4.2"},
  {"get_current_uid_gid", "4.2"},
  {"get_current_comm", "4.2"},
  {"get_cgroup_classid", "4.3"},
  {"skb_vlan_push", "4.3"},
  {"skb_vlan_pop", "4.3"},
  {"skb_get_tunnel_key", "4.3"},
  {"skb_set_tunnel_key", "4.3"},
  {"perf_event_read", "4.3"},
  {"redirect", "4.4"},
  {"get_route_realm", "4.4"},
  {"perf_event_output", "4.4"},
  {"skb_load_bytes", "4.5"},
  {"get_stackid", "4.6"},
  {"csum_diff", "4.6"},
  {"skb_get_tunnel_opt", "4.6"},
  {"skb_set_tunnel_opt", "4.6"},
  {"skb_change_proto", "4.8"},
  {"skb_change_type", "4.8"},
  {"skb_under_cgroup", "4.8"},
  {"get_hash_recalc", "4.8"},
  {"get_current_task", "4.8"},
  {"probe_write_user", "4.8"},
  {"current_task_under_cgroup", "4.9"},
  {"skb_change_tail", "4.9"},
  {"skb_pull_data", "4.9"},
  {"csum_update", "4.9"},
  {"set_hash_invalid", "4.9"},
  {"get_numa_node_id", "4.10"},
  {"skb_change_head", "4.10"},
  {"xdp_adjust_head", "4.10"},
  {"probe_read_str", "4.11"},
  {"get_socket_cookie", "4.12"},
  {"get_socket_uid", "4.12"},
  {"set_hash", "4.13"},
  {"setsockopt", "4.13"},
  {"skb_adjust_room", "4.13"},
  {"redirect_map", "4.14"},
  {"sk_redirect_map", "4.14"},
  {"sock_map_update", "4.14"},
  {"xdp_adjust_meta", "4.15"},
  {"perf_event_read_value", "4.15"},
  {"perf_prog_read_value", "4.15"},
  {"getsockopt", "4.15"},
  {"override_return", "4.16"},
  {"sock_ops_cb_flags_set", "4.16"},
  {"msg_redirect_map", "4.17"},
  {"msg_apply_bytes", "4.17"},
  {"msg_cork_bytes", "4.17"},
  {"msg_pull_data", "4.17"},
  {"bind", "4.17"},
  {"xdp_adjust_tail", "4.18"},
  {"skb_get_xfrm_state", "4.18"},
  {"get_stack", "4.18"},
  {"skb_load_bytes_relative", "4.18"},
  {"fib_lookup", "4.18"},
  {"sock_hash_update", "4.18"},
  {"msg_redirect_hash", "4.18"},
  {"sk_redirect_hash", "4.18"},
  {"lwt_push_encap", "4.18"},
  {"lwt_seg6_store_bytes", "4.18"},
  {"lwt_seg6_adjust_srh", "4.18"},
  {"lwt_seg6_action", "4.18"},
  {"rc_repeat", "4.18"},
  {"rc_keydown", "4.18"},
  {"skb_cgroup_id", "4.18"},
  {"get_current_cgroup_id", "4.18"},
  {"get_local_storage", "4.19"},
  {"sk_select_reuseport", "4.19"},
  {"skb_ancestor_cgroup_id", "4.19"},
  {"sk_lookup_tcp", "4.20"},
  {"sk_lookup_udp", "4.20"},
  {"sk_release", "4.20"},
  {"map_push_elem", "4.20"},
  {"map_pop_elem", "4.20"},
  {"map_peek_elem", "4.20"},
  {"msg_push_data", "4.20"},
};

static uint64_t ptr_to_u64(void *ptr)
{
  return (uint64_t) (unsigned long) ptr;
}
int bpf_create_map(enum bpf_map_type map_type, const char *name,
                   int key_size, int value_size,
                   int max_entries, int map_flags)
{
  size_t name_len = name ? strlen(name) : 0;
  union bpf_attr attr;
  memset(&attr, 0, sizeof(attr));
  attr.map_type = map_type;
  attr.key_size = key_size;
  attr.value_size = value_size;
  attr.max_entries = max_entries;
  attr.map_flags = map_flags;
  if (name)
    memcpy(attr.map_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1));

  int ret = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));

  // Map names are not supported on older kernels. If the call failed because
  // of that, clear the name and try again.
  if (ret < 0 && name_len && (errno == E2BIG || errno == EINVAL)) {
    memset(attr.map_name, 0, BPF_OBJ_NAME_LEN);
    ret = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
  }

  if (ret < 0 && errno == EPERM) {
    // see the note in bpf_prog_load about the rationale for this retry
    struct rlimit rl = {};
    if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0) {
      rl.rlim_max = RLIM_INFINITY;
      rl.rlim_cur = rl.rlim_max;
      if (setrlimit(RLIMIT_MEMLOCK, &rl) == 0)
        ret = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
    }
  }
  return ret;
}

int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags)
{
  union bpf_attr attr;
  memset(&attr, 0, sizeof(attr));
  attr.map_fd = fd;
  attr.key = ptr_to_u64(key);
  attr.value = ptr_to_u64(value);
  attr.flags = flags;

  return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}

int bpf_lookup_elem(int fd, void *key, void *value)
{
  union bpf_attr attr;
  memset(&attr, 0, sizeof(attr));
  attr.map_fd = fd;
  attr.key = ptr_to_u64(key);
  attr.value = ptr_to_u64(value);

  return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}

int bpf_delete_elem(int fd, void *key)
{
  union bpf_attr attr;
  memset(&attr, 0, sizeof(attr));
  attr.map_fd = fd;
  attr.key = ptr_to_u64(key);

  return syscall(__NR_bpf, BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
}
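/*
 * Example (an illustrative sketch, not part of the library): create a small
 * hash map and exercise the element accessors above. Error handling is
 * omitted and the map name "example_map" is arbitrary.
 *
 *   int map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, "example_map",
 *                               sizeof(int), sizeof(long), 1024, 0);
 *   int key = 1;
 *   long value = 42, out = 0;
 *   bpf_update_elem(map_fd, &key, &value, BPF_ANY);
 *   bpf_lookup_elem(map_fd, &key, &out);   // out is now 42
 *   bpf_delete_elem(map_fd, &key);
 */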
int bpf_get_first_key(int fd, void *key, size_t key_size)
{
  union bpf_attr attr;
  int i, res;

  memset(&attr, 0, sizeof(attr));
  attr.map_fd = fd;
  attr.key = 0;
  attr.next_key = ptr_to_u64(key);

  // Kernels 4.12 and above support passing NULL to BPF_MAP_GET_NEXT_KEY
  // to get the first key of the map. On older kernels, the call fails.
  res = syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
  if (res < 0 && errno == EFAULT) {
    // Fall back to trying to find a non-existing key.
    static unsigned char try_values[3] = {0, 0xff, 0x55};
    attr.key = ptr_to_u64(key);
    for (i = 0; i < 3; i++) {
      memset(key, try_values[i], key_size);
      // We want to check the existence of the key but we don't know the size
      // of the map's value. So we pass an invalid pointer for value, expect
      // the call to fail, and check if the error is ENOENT, indicating the
      // key doesn't exist. If we used NULL for the invalid pointer, it might
      // trigger a page fault in the kernel and affect performance. Hence we
      // use ~0, which will fail and return fast.
      if (bpf_lookup_elem(fd, key, (void *)~0) >= 0)
        return -1;
      // ENOENT means this key doesn't exist, so it can serve as the "prior"
      // key for BPF_MAP_GET_NEXT_KEY, which then returns the first key.
      if (errno == ENOENT)
        return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
    }
    return -1;
  } else {
    return res;
  }
}

int bpf_get_next_key(int fd, void *key, void *next_key)
{
  union bpf_attr attr;
  memset(&attr, 0, sizeof(attr));
  attr.map_fd = fd;
  attr.key = ptr_to_u64(key);
  attr.next_key = ptr_to_u64(next_key);

  return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
}
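/*
 * Example (an illustrative sketch): walking all keys of a map with the two
 * helpers above. "map_fd" is assumed to be a map with int keys.
 *
 *   int key, next_key;
 *   int res = bpf_get_first_key(map_fd, &key, sizeof(key));
 *   while (res == 0) {
 *     // ... use key ...
 *     res = bpf_get_next_key(map_fd, &key, &next_key);
 *     key = next_key;
 *   }
 */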
static void bpf_print_hints(int ret, char *log)
{
  if (ret < 0)
    fprintf(stderr, "bpf: Failed to load program: %s\n", strerror(errno));
  if (log == NULL)
    return;
  else
    fprintf(stderr, "%s\n", log);

  if (ret >= 0)
    return;

  // The following error strings will need maintenance to match LLVM.

  // stack busting
  if (strstr(log, "invalid stack off=-") != NULL) {
    fprintf(stderr, "HINT: Looks like you exceeded the BPF stack limit. "
      "This can happen if you allocate too much local variable storage. "
      "For example, if you allocated a 1 Kbyte struct (maybe for "
      "BPF_PERF_OUTPUT), busting a max stack of 512 bytes.\n\n");
  }

  // didn't check NULL on map lookup
  if (strstr(log, "invalid mem access 'map_value_or_null'") != NULL) {
    fprintf(stderr, "HINT: The 'map_value_or_null' error can happen if "
      "you dereference a pointer value from a map lookup without first "
      "checking if that pointer is NULL.\n\n");
  }

  // lacking a bpf_probe_read
  if (strstr(log, "invalid mem access 'inv'") != NULL) {
    fprintf(stderr, "HINT: The invalid mem access 'inv' error can happen "
      "if you try to dereference memory without first using "
      "bpf_probe_read() to copy it to the BPF stack. Sometimes the "
      "bpf_probe_read is automatic by the bcc rewriter, other times "
      "you'll need to be explicit.\n\n");
  }

  // helper function not found in kernel
  char *helper_str = strstr(log, "invalid func ");
  if (helper_str != NULL) {
    helper_str += strlen("invalid func ");
    char *str = strchr(helper_str, '#');
    if (str != NULL) {
      helper_str = str + 1;
    }
    // Helper IDs are 1-based indexes into the helpers table above.
    unsigned int helper_id = atoi(helper_str);
    if (helper_id && helper_id <= sizeof(helpers) / sizeof(struct bpf_helper)) {
      struct bpf_helper helper = helpers[helper_id - 1];
      fprintf(stderr, "HINT: bpf_%s missing (added in Linux %s).\n\n",
              helper.name, helper.required_version);
    }
  }
}

#define ROUND_UP(x, n) (((x) + (n) - 1u) & ~((n) - 1u))

int bpf_obj_get_info(int prog_map_fd, void *info, uint32_t *info_len)
{
  union bpf_attr attr;
  int err;

  memset(&attr, 0, sizeof(attr));
  attr.info.bpf_fd = prog_map_fd;
  attr.info.info_len = *info_len;
  attr.info.info = ptr_to_u64(info);

  err = syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
  if (!err)
    *info_len = attr.info.info_len;

  return err;
}

int bpf_prog_compute_tag(const struct bpf_insn *insns, int prog_len,
                         unsigned long long *ptag)
{
  struct sockaddr_alg alg = {
    .salg_family = AF_ALG,
    .salg_type   = "hash",
    .salg_name   = "sha1",
  };
  int shafd = socket(AF_ALG, SOCK_SEQPACKET | SOCK_CLOEXEC, 0);
  if (shafd < 0) {
    fprintf(stderr, "sha1 socket not available %s\n", strerror(errno));
    return -1;
  }
  int ret = bind(shafd, (struct sockaddr *)&alg, sizeof(alg));
  if (ret < 0) {
    fprintf(stderr, "sha1 bind fail %s\n", strerror(errno));
    close(shafd);
    return ret;
  }
  int shafd2 = accept4(shafd, NULL, 0, SOCK_CLOEXEC);
  if (shafd2 < 0) {
    fprintf(stderr, "sha1 accept fail %s\n", strerror(errno));
    close(shafd);
    return -1;
  }
  // Zero out map FD immediates, as the kernel does when computing the tag.
  // prog_len is in bytes; each struct bpf_insn is 8 bytes.
  struct bpf_insn prog[prog_len / 8];
  bool map_ld_seen = false;
  int i;
  for (i = 0; i < prog_len / 8; i++) {
    prog[i] = insns[i];
    if (insns[i].code == (BPF_LD | BPF_DW | BPF_IMM) &&
        insns[i].src_reg == BPF_PSEUDO_MAP_FD &&
        !map_ld_seen) {
      prog[i].imm = 0;
      map_ld_seen = true;
    } else if (insns[i].code == 0 && map_ld_seen) {
      // Second half of the 16-byte BPF_LD_IMM64 instruction.
      prog[i].imm = 0;
      map_ld_seen = false;
    } else {
      map_ld_seen = false;
    }
  }
  ret = write(shafd2, prog, prog_len);
  if (ret != prog_len) {
    fprintf(stderr, "sha1 write fail %s\n", strerror(errno));
    close(shafd2);
    close(shafd);
    return -1;
  }

  union {
    unsigned char sha[20];
    unsigned long long tag;
  } u = {};
  ret = read(shafd2, u.sha, 20);
  if (ret != 20) {
    fprintf(stderr, "sha1 read fail %s\n", strerror(errno));
    close(shafd2);
    close(shafd);
    return -1;
  }
  *ptag = __builtin_bswap64(u.tag);
  close(shafd2);
  close(shafd);
  return 0;
}

int bpf_prog_get_tag(int fd, unsigned long long *ptag)
{
  char fmt[64];
  snprintf(fmt, sizeof(fmt), "/proc/self/fdinfo/%d", fd);
  FILE *f = fopen(fmt, "re");
  if (!f) {
/*  fprintf(stderr, "failed to open fdinfo %s\n", strerror(errno));*/
    return -1;
  }
  fgets(fmt, sizeof(fmt), f); // pos
  fgets(fmt, sizeof(fmt), f); // flags
  fgets(fmt, sizeof(fmt), f); // mnt_id
  fgets(fmt, sizeof(fmt), f); // prog_type
  fgets(fmt, sizeof(fmt), f); // prog_jited
  fgets(fmt, sizeof(fmt), f); // prog_tag
  fclose(f);
  char *p = strchr(fmt, ':');
  if (!p) {
/*  fprintf(stderr, "broken fdinfo %s\n", fmt);*/
    return -2;
  }
  unsigned long long tag = 0;
  sscanf(p + 1, "%llx", &tag);
  *ptag = tag;
  return 0;
}
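/*
 * For reference, bpf_prog_get_tag above assumes a /proc/self/fdinfo/<fd>
 * layout like the following (values are illustrative):
 *
 *   pos:          0
 *   flags:        02000002
 *   mnt_id:       13
 *   prog_type:    5
 *   prog_jited:   1
 *   prog_tag:     bcf7977d3b93787c
 *
 * i.e. prog_tag is the sixth line and the tag is parsed as hex after the
 * colon.
 */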
int bpf_prog_load(enum bpf_prog_type prog_type, const char *name,
                  const struct bpf_insn *insns, int prog_len,
                  const char *license, unsigned kern_version,
                  int log_level, char *log_buf, unsigned log_buf_size)
{
  size_t name_len = name ? strlen(name) : 0;
  union bpf_attr attr;
  char *tmp_log_buf = NULL;
  unsigned tmp_log_buf_size = 0;
  int ret = 0, name_offset = 0;

  memset(&attr, 0, sizeof(attr));

  attr.prog_type = prog_type;
  attr.kern_version = kern_version;
  attr.license = ptr_to_u64((void *)license);

  attr.insns = ptr_to_u64((void *)insns);
  attr.insn_cnt = prog_len / sizeof(struct bpf_insn);
  if (attr.insn_cnt > BPF_MAXINSNS) {
    errno = EINVAL;
    fprintf(stderr,
            "bpf: %s. Program %s too large (%u insns), at most %d insns\n\n",
            strerror(errno), name, attr.insn_cnt, BPF_MAXINSNS);
    return -1;
  }

  attr.log_level = log_level;
  if (attr.log_level > 0) {
    if (log_buf_size > 0) {
      // Use the user-provided log buffer if available.
      log_buf[0] = 0;
      attr.log_buf = ptr_to_u64(log_buf);
      attr.log_size = log_buf_size;
    } else {
      // Create and use a temporary log buffer if the user didn't provide one.
      tmp_log_buf_size = LOG_BUF_SIZE;
      tmp_log_buf = malloc(tmp_log_buf_size);
      if (!tmp_log_buf) {
        fprintf(stderr, "bpf: Failed to allocate temporary log buffer: %s\n\n",
                strerror(errno));
        attr.log_level = 0;
      } else {
        tmp_log_buf[0] = 0;
        attr.log_buf = ptr_to_u64(tmp_log_buf);
        attr.log_size = tmp_log_buf_size;
      }
    }
  }

  if (name_len) {
    if (strncmp(name, "kprobe__", 8) == 0)
      name_offset = 8;
    else if (strncmp(name, "tracepoint__", 12) == 0)
      name_offset = 12;
    else if (strncmp(name, "raw_tracepoint__", 16) == 0)
      name_offset = 16;
    memcpy(attr.prog_name, name + name_offset,
           min(name_len - name_offset, BPF_OBJ_NAME_LEN - 1));
  }

  ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
  // BPF object names are not supported on older kernels.
  // If we failed due to this, clear the name and try again.
  if (ret < 0 && name_len && (errno == E2BIG || errno == EINVAL)) {
    memset(attr.prog_name, 0, BPF_OBJ_NAME_LEN);
    ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
  }

  if (ret < 0 && errno == EPERM) {
    // When EPERM is returned, two reasons are possible:
    //  1. user has no permissions for bpf()
    //  2. user has insufficient rlimit for locked memory
    // Unfortunately, there is no api to inspect the current usage of locked
    // mem for the user, so an accurate calculation of how much memory to lock
    // for this new program is difficult. As a hack, bump the limit to
    // unlimited. If the program load fails again, return the error.
    struct rlimit rl = {};
    if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0) {
      rl.rlim_max = RLIM_INFINITY;
      rl.rlim_cur = rl.rlim_max;
      if (setrlimit(RLIMIT_MEMLOCK, &rl) == 0)
        ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
    }
  }

  // The load has failed. Handle the log message.
  if (ret < 0) {
    // The user provided a log buffer.
    if (log_buf_size) {
      // If logging is not already enabled, enable it and do the syscall again.
      if (attr.log_level == 0) {
        attr.log_level = 1;
        attr.log_buf = ptr_to_u64(log_buf);
        attr.log_size = log_buf_size;
        ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
      }
      // Print the log message and return.
      bpf_print_hints(ret, log_buf);
      if (errno == ENOSPC)
        fprintf(stderr, "bpf: log_buf size may be insufficient\n");
      goto return_result;
    }

    // The user did not provide a log buffer. Keep growing our temporary log
    // buffer until the full error message fits.
    if (tmp_log_buf)
      free(tmp_log_buf);
    tmp_log_buf_size = LOG_BUF_SIZE;
    if (attr.log_level == 0)
      attr.log_level = 1;
    for (;;) {
      tmp_log_buf = malloc(tmp_log_buf_size);
      if (!tmp_log_buf) {
        fprintf(stderr, "bpf: Failed to allocate temporary log buffer: %s\n\n",
                strerror(errno));
        goto return_result;
      }
      tmp_log_buf[0] = 0;
      attr.log_buf = ptr_to_u64(tmp_log_buf);
      attr.log_size = tmp_log_buf_size;

      ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
      if (ret < 0 && errno == ENOSPC) {
        // Temporary buffer size is not enough. Double it and try again.
        free(tmp_log_buf);
        tmp_log_buf = NULL;
        tmp_log_buf_size <<= 1;
      } else {
        break;
      }
    }
  }

  // Print the log message if log_level is not 0, whether it was specified by
  // the user or enabled here due to an error.
  if (attr.log_level > 0) {
    // Don't print if the user enabled logging and provided a log buffer,
    // but there was no error.
    if (log_buf && ret < 0)
      bpf_print_hints(ret, log_buf);
    else if (tmp_log_buf)
      bpf_print_hints(ret, tmp_log_buf);
  }

return_result:
  if (tmp_log_buf)
    free(tmp_log_buf);
  return ret;
}
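/*
 * Example (an illustrative sketch): loading a trivial "return 0" program.
 * The two instructions below are the standard "mov r0, 0; exit" sequence;
 * the program name and type are arbitrary.
 *
 *   struct bpf_insn insns[] = {
 *     { .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 0 },
 *     { .code = BPF_JMP | BPF_EXIT },
 *   };
 *   int prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "noop",
 *                               insns, sizeof(insns), "GPL",
 *                               LINUX_VERSION_CODE, 0, NULL, 0);
 */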
int bpf_open_raw_sock(const char *name)
{
  struct sockaddr_ll sll;
  int sock;

  sock = socket(PF_PACKET, SOCK_RAW | SOCK_NONBLOCK | SOCK_CLOEXEC, htons(ETH_P_ALL));
  if (sock < 0) {
    fprintf(stderr, "cannot create raw socket\n");
    return -1;
  }

  /* Do not bind on empty interface names */
  if (!name || *name == '\0')
    return sock;

  memset(&sll, 0, sizeof(sll));
  sll.sll_family = AF_PACKET;
  sll.sll_ifindex = if_nametoindex(name);
  if (sll.sll_ifindex == 0) {
    fprintf(stderr, "bpf: Resolving device name to index: %s\n", strerror(errno));
    close(sock);
    return -1;
  }
  sll.sll_protocol = htons(ETH_P_ALL);
  if (bind(sock, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
    fprintf(stderr, "bind to %s: %s\n", name, strerror(errno));
    close(sock);
    return -1;
  }

  return sock;
}

int bpf_attach_socket(int sock, int prog) {
  return setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog, sizeof(prog));
}

#define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type"
static int bpf_find_probe_type(const char *event_type)
{
  int fd;
  int ret;
  char buf[PATH_MAX];

  ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type);
  if (ret < 0 || ret >= (int)sizeof(buf))
    return -1;

  fd = open(buf, O_RDONLY | O_CLOEXEC);
  if (fd < 0)
    return -1;
  ret = read(fd, buf, sizeof(buf));
  close(fd);
  if (ret < 0 || ret >= (int)sizeof(buf))
    return -1;
  errno = 0;
  ret = (int)strtol(buf, NULL, 10);
  return errno ? -1 : ret;
}

#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe"
static int bpf_get_retprobe_bit(const char *event_type)
{
  int fd;
  int ret;
  char buf[PATH_MAX];

  ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type);
  if (ret < 0 || ret >= (int)sizeof(buf))
    return -1;

  fd = open(buf, O_RDONLY | O_CLOEXEC);
  if (fd < 0)
    return -1;
  ret = read(fd, buf, sizeof(buf));
  close(fd);
  if (ret < 0 || ret >= (int)sizeof(buf))
    return -1;
  if (strlen(buf) < strlen("config:"))
    return -1;
  errno = 0;
  ret = (int)strtol(buf + strlen("config:"), NULL, 10);
  return errno ? -1 : ret;
}
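/*
 * For reference, on kernels that expose the kprobe PMU, the two sysfs files
 * parsed above typically look like this (values are illustrative):
 *
 *   $ cat /sys/bus/event_source/devices/kprobe/type
 *   6
 *   $ cat /sys/bus/event_source/devices/kprobe/format/retprobe
 *   config:0
 *
 * i.e. bpf_find_probe_type returns the dynamic PMU type id, and
 * bpf_get_retprobe_bit returns the bit position after "config:".
 */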
/*
 * The newer kernel API allows creating a [k,u]probe with perf_event_open,
 * which makes it easier to clean up the [k,u]probe. This function tries to
 * create a pfd with the new API.
 */
static int bpf_try_perf_event_open_with_probe(const char *name, uint64_t offs,
                                              int pid, char *event_type, int is_return)
{
  struct perf_event_attr attr = {};
  int type = bpf_find_probe_type(event_type);
  int is_return_bit = bpf_get_retprobe_bit(event_type);
  int cpu = 0;

  if (type < 0 || is_return_bit < 0)
    return -1;
  attr.sample_period = 1;
  attr.wakeup_events = 1;
  if (is_return)
    attr.config |= 1 << is_return_bit;

  /*
   * struct perf_event_attr in the latest perf_event.h has the following
   * extension to config1 and config2. To keep bcc compatible with
   * older perf_event.h, we use config1 and config2 here instead of
   * kprobe_func, uprobe_path, kprobe_addr, and probe_offset.
   *
   * union {
   *   __u64 bp_addr;
   *   __u64 kprobe_func;
   *   __u64 uprobe_path;
   *   __u64 config1;
   * };
   * union {
   *   __u64 bp_len;
   *   __u64 kprobe_addr;
   *   __u64 probe_offset;
   *   __u64 config2;
   * };
   */
  attr.config2 = offs;  /* config2 here is kprobe_addr or probe_offset */
  attr.size = sizeof(attr);
  attr.type = type;
  /* config1 here is kprobe_func or uprobe_path */
  attr.config1 = ptr_to_u64((void *)name);
  // PID filtering is only possible for uprobe events.
  if (pid < 0)
    pid = -1;
  // The perf_event_open API doesn't allow both pid and cpu to be -1,
  // so only set cpu to -1 when PID is not -1.
  // Tracing events do not do CPU filtering in any case.
  if (pid != -1)
    cpu = -1;
  return syscall(__NR_perf_event_open, &attr, pid, cpu, -1 /* group_fd */,
                 PERF_FLAG_FD_CLOEXEC);
}
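/*
 * Example (an illustrative sketch): opening a kprobe on do_sys_open at
 * offset 0, entry rather than return, with no PID filter:
 *
 *   int pfd = bpf_try_perf_event_open_with_probe("do_sys_open", 0, -1,
 *                                                "kprobe", 0);
 *   // pfd < 0 on kernels without the kprobe PMU; callers fall back to
 *   // the debugfs interface in that case.
 */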
// When a valid perf event FD is provided through pfd, it will be used to
// enable and attach the BPF program to the event, and event_path will be
// ignored. Otherwise, event_path is expected to contain the path to the
// event in debugfs, and it will be used to open the perf event FD.
// In either case, if the attach partially fails (such as an issue with the
// ioctl operations), the *caller* needs to clean up the perf event FD,
// whether it was provided by the caller or opened here.
static int bpf_attach_tracing_event(int progfd, const char *event_path, int pid,
                                    int *pfd)
{
  int efd, cpu = 0;
  ssize_t bytes;
  char buf[PATH_MAX];
  struct perf_event_attr attr = {};
  // The caller did not provide a valid perf event FD. Create one with the
  // debugfs event path provided.
  if (*pfd < 0) {
    snprintf(buf, sizeof(buf), "%s/id", event_path);
    efd = open(buf, O_RDONLY | O_CLOEXEC, 0);
    if (efd < 0) {
      fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
      return -1;
    }

    bytes = read(efd, buf, sizeof(buf));
    if (bytes <= 0 || bytes >= (int)sizeof(buf)) {
      fprintf(stderr, "read(%s): %s\n", buf, strerror(errno));
      close(efd);
      return -1;
    }
    close(efd);
    buf[bytes] = '\0';
    attr.config = strtol(buf, NULL, 0);
    attr.type = PERF_TYPE_TRACEPOINT;
    attr.sample_period = 1;
    attr.wakeup_events = 1;
    // PID filtering is only possible for uprobe events.
    if (pid < 0)
      pid = -1;
    // The perf_event_open API doesn't allow both pid and cpu to be -1,
    // so only set cpu to -1 when PID is not -1.
    // Tracing events do not do CPU filtering in any case.
    if (pid != -1)
      cpu = -1;
    *pfd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC);
    if (*pfd < 0) {
      fprintf(stderr, "perf_event_open(%s/id): %s\n", event_path, strerror(errno));
      return -1;
    }
  }

  if (ioctl(*pfd, PERF_EVENT_IOC_SET_BPF, progfd) < 0) {
    perror("ioctl(PERF_EVENT_IOC_SET_BPF)");
    return -1;
  }
  if (ioctl(*pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE)");
    return -1;
  }

  return 0;
}
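/*
 * For reference, the debugfs id file read above contains a single decimal
 * tracepoint id, e.g. (value illustrative):
 *
 *   $ cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_openat/id
 *   598
 */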
int bpf_attach_kprobe(int progfd, enum bpf_probe_attach_type attach_type,
                      const char *ev_name, const char *fn_name, uint64_t fn_offset)
{
  int kfd, pfd = -1;
  char buf[256];
  char event_alias[128];
  static char *event_type = "kprobe";

  // Try to create the kprobe perf event with the perf_event_open API.
  pfd = bpf_try_perf_event_open_with_probe(fn_name, fn_offset, -1, event_type,
                                           attach_type != BPF_PROBE_ENTRY);
  // If that fails, the kernel most likely doesn't support the new
  // perf_event_open API yet. Try to create the event using debugfs.
  if (pfd < 0) {
    snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
    kfd = open(buf, O_WRONLY | O_APPEND | O_CLOEXEC, 0);
    if (kfd < 0) {
      fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
      goto error;
    }

    snprintf(event_alias, sizeof(event_alias), "%s_bcc_%d", ev_name, getpid());

    if (fn_offset > 0 && attach_type == BPF_PROBE_ENTRY)
      snprintf(buf, sizeof(buf), "p:%ss/%s %s+%"PRIu64,
               event_type, event_alias, fn_name, fn_offset);
    else
      snprintf(buf, sizeof(buf), "%c:%ss/%s %s",
               attach_type == BPF_PROBE_ENTRY ? 'p' : 'r',
               event_type, event_alias, fn_name);

    if (write(kfd, buf, strlen(buf)) < 0) {
      if (errno == ENOENT)
        fprintf(stderr, "cannot attach kprobe, probe entry may not exist\n");
      else
        fprintf(stderr, "cannot attach kprobe, %s\n", strerror(errno));
      close(kfd);
      goto error;
    }
    close(kfd);
    snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type, event_alias);
  }
  // If perf_event_open succeeded, bpf_attach_tracing_event will use the
  // created perf event FD directly, and buf will be empty and unused.
  // Otherwise it will read the event ID from the path in buf, create the
  // perf event using that ID, and update the value of pfd.
  if (bpf_attach_tracing_event(progfd, buf, -1 /* PID */, &pfd) == 0)
    return pfd;

error:
  bpf_close_perf_event_fd(pfd);
  return -1;
}

static int enter_mount_ns(int pid) {
  struct stat self_stat, target_stat;
  int self_fd = -1, target_fd = -1;
  char buf[64];

  if (pid < 0)
    return -1;

  if ((size_t)snprintf(buf, sizeof(buf), "/proc/%d/ns/mnt", pid) >= sizeof(buf))
    return -1;

  self_fd = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC);
  if (self_fd < 0) {
    perror("open(/proc/self/ns/mnt)");
    return -1;
  }

  target_fd = open(buf, O_RDONLY | O_CLOEXEC);
  if (target_fd < 0) {
    perror("open(/proc/<pid>/ns/mnt)");
    goto error;
  }

  if (fstat(self_fd, &self_stat)) {
    perror("fstat(self_fd)");
    goto error;
  }

  if (fstat(target_fd, &target_stat)) {
    perror("fstat(target_fd)");
    goto error;
  }

  // The target and current mount namespaces are the same; avoid setns and
  // close all fds.
  if (self_stat.st_ino == target_stat.st_ino)
    goto error;

  if (setns(target_fd, CLONE_NEWNS)) {
    perror("setns(target)");
    goto error;
  }

  close(target_fd);
  return self_fd;

error:
  if (self_fd >= 0)
    close(self_fd);
  if (target_fd >= 0)
    close(target_fd);
  return -1;
}

static void exit_mount_ns(int fd) {
  if (fd < 0)
    return;

  if (setns(fd, CLONE_NEWNS))
    perror("setns");
  close(fd);
}
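/*
 * Example (an illustrative sketch): attach a loaded program to the entry of
 * do_sys_open, then detach it. "prog_fd" is assumed to be a loaded
 * BPF_PROG_TYPE_KPROBE program and "p_do_sys_open" an arbitrary event name.
 *
 *   int pfd = bpf_attach_kprobe(prog_fd, BPF_PROBE_ENTRY,
 *                               "p_do_sys_open", "do_sys_open", 0);
 *   // ... trace ...
 *   bpf_close_perf_event_fd(pfd);
 *   bpf_detach_kprobe("p_do_sys_open");
 */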
int bpf_attach_uprobe(int progfd, enum bpf_probe_attach_type attach_type,
                      const char *ev_name, const char *binary_path,
                      uint64_t offset, pid_t pid)
{
  char buf[PATH_MAX];
  char event_alias[PATH_MAX];
  static char *event_type = "uprobe";
  int res, kfd = -1, pfd = -1, ns_fd = -1;
  // Try to create the uprobe perf event with the perf_event_open API.
  pfd = bpf_try_perf_event_open_with_probe(binary_path, offset, pid, event_type,
                                           attach_type != BPF_PROBE_ENTRY);
  // If that fails, the kernel most likely doesn't support the new
  // perf_event_open API yet. Try to create the event using debugfs.
  if (pfd < 0) {
    snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
    kfd = open(buf, O_WRONLY | O_APPEND | O_CLOEXEC, 0);
    if (kfd < 0) {
      fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
      goto error;
    }

    res = snprintf(event_alias, sizeof(event_alias), "%s_bcc_%d", ev_name, getpid());
    if (res < 0 || res >= (int)sizeof(event_alias)) {
      fprintf(stderr, "Event name (%s) is too long for buffer\n", ev_name);
      goto error;
    }
    res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx",
                   attach_type == BPF_PROBE_ENTRY ? 'p' : 'r',
                   event_type, event_alias, binary_path, (unsigned long)offset);
    if (res < 0 || res >= (int)sizeof(buf)) {
      fprintf(stderr, "Event alias (%s) too long for buffer\n", event_alias);
      goto error;
    }

    ns_fd = enter_mount_ns(pid);
    if (write(kfd, buf, strlen(buf)) < 0) {
      if (errno == EINVAL)
        fprintf(stderr, "check dmesg output for possible cause\n");
      goto error;
    }
    close(kfd);
    kfd = -1;
    exit_mount_ns(ns_fd);
    ns_fd = -1;

    snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type, event_alias);
  }
  // If perf_event_open succeeded, bpf_attach_tracing_event will use the
  // created perf event FD directly, and buf will be empty and unused.
  // Otherwise it will read the event ID from the path in buf, create the
  // perf event using that ID, and update the value of pfd.
  if (bpf_attach_tracing_event(progfd, buf, pid, &pfd) == 0)
    return pfd;

error:
  if (kfd >= 0)
    close(kfd);
  exit_mount_ns(ns_fd);
  bpf_close_perf_event_fd(pfd);
  return -1;
}

static int bpf_detach_probe(const char *ev_name, const char *event_type)
{
  int kfd = -1, res;
  char buf[PATH_MAX];
  int found_event = 0;
  size_t bufsize = 0;
  char *cptr = NULL;
  FILE *fp;

  /*
   * For a [k,u]probe created with perf_event_open (on a newer kernel), it is
   * not necessary to clean it up in [k,u]probe_events. We first look up
   * the %s_bcc_%d line in [k,u]probe_events. If the event is not found,
   * it is safe to skip the cleanup process (writing -:... to the file).
   */
  snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
  fp = fopen(buf, "re");
  if (!fp) {
    fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
    goto error;
  }

  res = snprintf(buf, sizeof(buf), "%ss/%s_bcc_%d", event_type, ev_name, getpid());
  if (res < 0 || res >= (int)sizeof(buf)) {
    fprintf(stderr, "snprintf(%s): %d\n", ev_name, res);
    goto error;
  }

  while (getline(&cptr, &bufsize, fp) != -1)
    if (strstr(cptr, buf) != NULL) {
      found_event = 1;
      break;
    }
  free(cptr);
  fclose(fp);
  fp = NULL;

  if (!found_event)
    return 0;

  snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
  kfd = open(buf, O_WRONLY | O_APPEND | O_CLOEXEC, 0);
  if (kfd < 0) {
    fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
    goto error;
  }

  res = snprintf(buf, sizeof(buf), "-:%ss/%s_bcc_%d", event_type, ev_name, getpid());
  if (res < 0 || res >= (int)sizeof(buf)) {
    fprintf(stderr, "snprintf(%s): %d\n", ev_name, res);
    goto error;
  }
  if (write(kfd, buf, strlen(buf)) < 0) {
    fprintf(stderr, "write(%s): %s\n", buf, strerror(errno));
    goto error;
  }

  close(kfd);
  return 0;

error:
  if (kfd >= 0)
    close(kfd);
  if (fp)
    fclose(fp);
  return -1;
}

int bpf_detach_kprobe(const char *ev_name)
{
  return bpf_detach_probe(ev_name, "kprobe");
}

int bpf_detach_uprobe(const char *ev_name)
{
  return bpf_detach_probe(ev_name, "uprobe");
}

int bpf_attach_tracepoint(int progfd, const char *tp_category,
                          const char *tp_name)
{
  char buf[256];
  int pfd = -1;

  snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%s/%s",
           tp_category, tp_name);
  if (bpf_attach_tracing_event(progfd, buf, -1 /* PID */, &pfd) == 0)
    return pfd;

  bpf_close_perf_event_fd(pfd);
  return -1;
}
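/*
 * Example (an illustrative sketch): attach a loaded
 * BPF_PROG_TYPE_TRACEPOINT program to the sched:sched_switch tracepoint:
 *
 *   int pfd = bpf_attach_tracepoint(prog_fd, "sched", "sched_switch");
 *   // ... trace ...
 *   bpf_close_perf_event_fd(pfd);
 *   bpf_detach_tracepoint("sched", "sched_switch");
 */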
int bpf_detach_tracepoint(const char *tp_category, const char *tp_name) {
  // Silence unused-parameter warnings.
  tp_category = NULL;
  tp_name = NULL;
  // Right now, there is nothing to do, but it's a good idea to encourage
  // callers to detach anything they attach.
  return 0;
}

int bpf_attach_raw_tracepoint(int progfd, char *tp_name)
{
  union bpf_attr attr;
  int ret;

  bzero(&attr, sizeof(attr));
  attr.raw_tracepoint.name = ptr_to_u64(tp_name);
  attr.raw_tracepoint.prog_fd = progfd;

  ret = syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
  if (ret < 0)
    fprintf(stderr, "bpf_attach_raw_tracepoint (%s): %s\n", tp_name, strerror(errno));
  return ret;
}

void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb,
                            perf_reader_lost_cb lost_cb, void *cb_cookie,
                            int pid, int cpu, int page_cnt) {
  int pfd;
  struct perf_event_attr attr = {};
  struct perf_reader *reader = NULL;

  reader = perf_reader_new(raw_cb, lost_cb, cb_cookie, page_cnt);
  if (!reader)
    goto error;

  attr.config = 10; /* PERF_COUNT_SW_BPF_OUTPUT */
  attr.type = PERF_TYPE_SOFTWARE;
  attr.sample_type = PERF_SAMPLE_RAW;
  attr.sample_period = 1;
  attr.wakeup_events = 1;
  pfd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1, PERF_FLAG_FD_CLOEXEC);
  if (pfd < 0) {
    fprintf(stderr, "perf_event_open: %s\n", strerror(errno));
    fprintf(stderr, "   (check your kernel for PERF_COUNT_SW_BPF_OUTPUT support, 4.4 or newer)\n");
    goto error;
  }
  perf_reader_set_fd(reader, pfd);

  if (perf_reader_mmap(reader) < 0)
    goto error;

  if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE)");
    goto error;
  }

  return reader;

error:
  if (reader)
    perf_reader_free(reader);

  return NULL;
}

static int invalid_perf_config(uint32_t type, uint64_t config) {
  switch (type) {
  case PERF_TYPE_HARDWARE:
    if (config >= PERF_COUNT_HW_MAX) {
      fprintf(stderr, "HARDWARE perf event config out of range\n");
      goto is_invalid;
    }
    return 0;
  case PERF_TYPE_SOFTWARE:
    if (config >= PERF_COUNT_SW_MAX) {
      fprintf(stderr, "SOFTWARE perf event config out of range\n");
      goto is_invalid;
    } else if (config == 10 /* PERF_COUNT_SW_BPF_OUTPUT */) {
      fprintf(stderr, "Unable to open or attach perf event for BPF_OUTPUT\n");
      goto is_invalid;
    }
    return 0;
  case PERF_TYPE_HW_CACHE:
    if (((config >> 16) >= PERF_COUNT_HW_CACHE_RESULT_MAX) ||
        (((config >> 8) & 0xff) >= PERF_COUNT_HW_CACHE_OP_MAX) ||
        ((config & 0xff) >= PERF_COUNT_HW_CACHE_MAX)) {
      fprintf(stderr, "HW_CACHE perf event config out of range\n");
      goto is_invalid;
    }
    return 0;
  case PERF_TYPE_TRACEPOINT:
  case PERF_TYPE_BREAKPOINT:
    fprintf(stderr,
            "Unable to open or attach TRACEPOINT or BREAKPOINT events\n");
    goto is_invalid;
  default:
    return 0;
  }
is_invalid:
  fprintf(stderr, "Invalid perf event type %" PRIu32 " config %" PRIu64 "\n",
          type, config);
  return 1;
}
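/*
 * For reference, the PERF_TYPE_HW_CACHE configs checked above follow the
 * perf_event.h encoding:
 *
 *   config = cache_id | (op_id << 8) | (result_id << 16)
 *
 * e.g. L1 data cache read misses:
 *
 *   uint64_t config = PERF_COUNT_HW_CACHE_L1D |
 *                     (PERF_COUNT_HW_CACHE_OP_READ << 8) |
 *                     (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
 */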
int bpf_open_perf_event(uint32_t type, uint64_t config, int pid, int cpu) {
  int fd;
  struct perf_event_attr attr = {};

  if (invalid_perf_config(type, config)) {
    return -1;
  }

  attr.sample_period = LONG_MAX;
  attr.type = type;
  attr.config = config;

  fd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1, PERF_FLAG_FD_CLOEXEC);
  if (fd < 0) {
    fprintf(stderr, "perf_event_open: %s\n", strerror(errno));
    return -1;
  }

  if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE)");
    close(fd);
    return -1;
  }

  return fd;
}
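/*
 * Example (an illustrative sketch): open a hardware cycles counter on CPU 0
 * for all processes (pid = -1); opening one counter per CPU is typical when
 * populating a BPF_MAP_TYPE_PERF_EVENT_ARRAY:
 *
 *   int fd = bpf_open_perf_event(PERF_TYPE_HARDWARE,
 *                                PERF_COUNT_HW_CPU_CYCLES, -1, 0);
 *   // ... store fd into a perf-event-array map slot ...
 *   bpf_close_perf_event_fd(fd);
 */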
int bpf_attach_xdp(const char *dev_name, int progfd, uint32_t flags) {
  struct sockaddr_nl sa;
  int sock, seq = 0, len, ret = -1;
  char buf[4096];
  struct nlattr *nla, *nla_xdp;
  struct {
    struct nlmsghdr nh;
    struct ifinfomsg ifinfo;
    char attrbuf[64];
  } req;
  struct nlmsghdr *nh;
  struct nlmsgerr *err;
  socklen_t addrlen;

  memset(&sa, 0, sizeof(sa));
  sa.nl_family = AF_NETLINK;

  sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
  if (sock < 0) {
    fprintf(stderr, "bpf: opening a netlink socket: %s\n", strerror(errno));
    return -1;
  }

  if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
    fprintf(stderr, "bpf: bind to netlink: %s\n", strerror(errno));
    goto cleanup;
  }

  addrlen = sizeof(sa);
  if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) {
    fprintf(stderr, "bpf: get sock name of netlink: %s\n", strerror(errno));
    goto cleanup;
  }

  if (addrlen != sizeof(sa)) {
    fprintf(stderr, "bpf: wrong netlink address length: %d\n", addrlen);
    goto cleanup;
  }

  memset(&req, 0, sizeof(req));
  req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
  req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
  req.nh.nlmsg_type = RTM_SETLINK;
  req.nh.nlmsg_pid = 0;
  req.nh.nlmsg_seq = ++seq;
  req.ifinfo.ifi_family = AF_UNSPEC;
  req.ifinfo.ifi_index = if_nametoindex(dev_name);
  if (req.ifinfo.ifi_index == 0) {
    fprintf(stderr, "bpf: Resolving device name to index: %s\n", strerror(errno));
    goto cleanup;
  }

  nla = (struct nlattr *)(((char *)&req)
                          + NLMSG_ALIGN(req.nh.nlmsg_len));
  nla->nla_type = NLA_F_NESTED | 43 /*IFLA_XDP*/;

  nla_xdp = (struct nlattr *)((char *)nla + NLA_HDRLEN);
  nla->nla_len = NLA_HDRLEN;

  // we specify the FD passed over by the user
  nla_xdp->nla_type = 1 /*IFLA_XDP_FD*/;
  nla_xdp->nla_len = NLA_HDRLEN + sizeof(progfd);
  memcpy((char *)nla_xdp + NLA_HDRLEN, &progfd, sizeof(progfd));
  nla->nla_len += nla_xdp->nla_len;

  // parse flags as passed by the user
  if (flags) {
    nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
    nla_xdp->nla_type = 3 /*IFLA_XDP_FLAGS*/;
    nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
    memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
    nla->nla_len += nla_xdp->nla_len;
  }

  req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);

  if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
    fprintf(stderr, "bpf: send to netlink: %s\n", strerror(errno));
    goto cleanup;
  }

  len = recv(sock, buf, sizeof(buf), 0);
  if (len < 0) {
    fprintf(stderr, "bpf: recv from netlink: %s\n", strerror(errno));
    goto cleanup;
  }

  for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, (unsigned int)len);
       nh = NLMSG_NEXT(nh, len)) {
    if (nh->nlmsg_pid != sa.nl_pid) {
      fprintf(stderr, "bpf: Wrong pid %u, expected %u\n",
              nh->nlmsg_pid, sa.nl_pid);
      errno = EBADMSG;
      goto cleanup;
    }
    if (nh->nlmsg_seq != (unsigned int)seq) {
      fprintf(stderr, "bpf: Wrong seq %d, expected %d\n",
              nh->nlmsg_seq, seq);
      errno = EBADMSG;
      goto cleanup;
    }
    switch (nh->nlmsg_type) {
    case NLMSG_ERROR:
      err = (struct nlmsgerr *)NLMSG_DATA(nh);
      if (!err->error)
        continue;
      fprintf(stderr, "bpf: nlmsg error %s\n", strerror(-err->error));
      errno = -err->error;
      goto cleanup;
    case NLMSG_DONE:
      break;
    }
  }

  ret = 0;

cleanup:
  close(sock);
  return ret;
}

int bpf_attach_perf_event_raw(int progfd, void *perf_event_attr, pid_t pid,
                              int cpu, int group_fd, unsigned long extra_flags) {
  int fd = syscall(__NR_perf_event_open, perf_event_attr, pid, cpu, group_fd,
                   PERF_FLAG_FD_CLOEXEC | extra_flags);
  if (fd < 0) {
    perror("perf_event_open failed");
    return -1;
  }
  if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, progfd) != 0) {
    perror("ioctl(PERF_EVENT_IOC_SET_BPF) failed");
    close(fd);
    return -1;
  }
  if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) != 0) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE) failed");
    close(fd);
    return -1;
  }

  return fd;
}

int bpf_attach_perf_event(int progfd, uint32_t ev_type, uint32_t ev_config,
                          uint64_t sample_period, uint64_t sample_freq,
                          pid_t pid, int cpu, int group_fd) {
  if (invalid_perf_config(ev_type, ev_config)) {
    return -1;
  }
  if (!((sample_period > 0) ^ (sample_freq > 0))) {
    fprintf(
      stderr, "Exactly one of sample_period / sample_freq should be set\n"
    );
    return -1;
  }

  struct perf_event_attr attr = {};
  attr.type = ev_type;
  attr.config = ev_config;
  if (pid > 0)
    attr.inherit = 1;
  if (sample_freq > 0) {
    attr.freq = 1;
    attr.sample_freq = sample_freq;
  } else {
    attr.sample_period = sample_period;
  }

  return bpf_attach_perf_event_raw(progfd, &attr, pid, cpu, group_fd, 0);
}

int bpf_close_perf_event_fd(int fd) {
  int res, error = 0;
  if (fd >= 0) {
    res = ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
    if (res != 0) {
      perror("ioctl(PERF_EVENT_IOC_DISABLE) failed");
      error = res;
    }
    res = close(fd);
    if (res != 0) {
      perror("close perf event FD failed");
      error = (res && !error) ? res : error;
    }
  }
  return error;
}

int bpf_obj_pin(int fd, const char *pathname)
{
  union bpf_attr attr;

  memset(&attr, 0, sizeof(attr));
  attr.pathname = ptr_to_u64((void *)pathname);
  attr.bpf_fd = fd;

  return syscall(__NR_bpf, BPF_OBJ_PIN, &attr, sizeof(attr));
}

int bpf_obj_get(const char *pathname)
{
  union bpf_attr attr;

  memset(&attr, 0, sizeof(attr));
  attr.pathname = ptr_to_u64((void *)pathname);

  return syscall(__NR_bpf, BPF_OBJ_GET, &attr, sizeof(attr));
}

int bpf_prog_get_next_id(uint32_t start_id, uint32_t *next_id)
{
  union bpf_attr attr;
  int err;

  memset(&attr, 0, sizeof(attr));
  attr.start_id = start_id;

  err = syscall(__NR_bpf, BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr));
  if (!err)
    *next_id = attr.next_id;

  return err;
}

int bpf_prog_get_fd_by_id(uint32_t id)
{
  union bpf_attr attr;

  memset(&attr, 0, sizeof(attr));
  attr.prog_id = id;

  return syscall(__NR_bpf, BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr));
}

int bpf_map_get_fd_by_id(uint32_t id)
{
  union bpf_attr attr;

  memset(&attr, 0, sizeof(attr));
  attr.map_id = id;

  return syscall(__NR_bpf, BPF_MAP_GET_FD_BY_ID, &attr, sizeof(attr));
}
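/*
 * Example (an illustrative sketch): enumerate all loaded BPF programs using
 * the id-based accessors above (typically requires CAP_SYS_ADMIN):
 *
 *   uint32_t id = 0;
 *   while (bpf_prog_get_next_id(id, &id) == 0) {
 *     int fd = bpf_prog_get_fd_by_id(id);
 *     if (fd < 0)
 *       continue;
 *     // ... inspect with bpf_obj_get_info(fd, ...) ...
 *     close(fd);
 *   }
 */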