/*
 * Copyright (c) 2015 PLUMgrid, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <limits.h>
#include <linux/bpf.h>
#include <linux/bpf_common.h>
#include <linux/if_packet.h>
#include <linux/perf_event.h>
#include <linux/pkt_cls.h>
#include <linux/rtnetlink.h>
#include <linux/sched.h>
#include <linux/unistd.h>
#include <linux/version.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <linux/if_alg.h>

#include "libbpf.h"
#include "perf_reader.h"

// TODO: Remove this when CentOS 6 support is not needed anymore
#include "setns.h"

// TODO: remove these defines when linux-libc-dev exports them properly

#ifndef __NR_bpf
#if defined(__powerpc64__)
#define __NR_bpf 361
#elif defined(__s390x__)
#define __NR_bpf 351
#elif defined(__aarch64__)
#define __NR_bpf 280
#else
#define __NR_bpf 321
#endif
#endif

#ifndef SO_ATTACH_BPF
#define SO_ATTACH_BPF 50
#endif

#ifndef PERF_EVENT_IOC_SET_BPF
#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32)
#endif

#ifndef PERF_FLAG_FD_CLOEXEC
#define PERF_FLAG_FD_CLOEXEC (1UL << 3)
#endif

// TODO: Remove this when CentOS 6 support is not needed anymore
#ifndef AF_ALG
#define AF_ALG 38
#endif

#define min(x, y) ((x) < (y) ? (x) : (y))

// Maps a BPF helper name to the kernel version that introduced it.
// Entry i describes the helper with id i + 1 (see bpf_print_hints()).
struct bpf_helper {
  char *name;
  char *required_version;
};

static struct bpf_helper helpers[] = {
  {"map_lookup_elem", "3.19"},
  {"map_update_elem", "3.19"},
  {"map_delete_elem", "3.19"},
  {"probe_read", "4.1"},
  {"ktime_get_ns", "4.1"},
  {"trace_printk", "4.1"},
  {"get_prandom_u32", "4.1"},
  {"get_smp_processor_id", "4.1"},
  {"skb_store_bytes", "4.1"},
  {"l3_csum_replace", "4.1"},
  {"l4_csum_replace", "4.1"},
  {"tail_call", "4.2"},
  {"clone_redirect", "4.2"},
  {"get_current_pid_tgid", "4.2"},
  {"get_current_uid_gid", "4.2"},
  {"get_current_comm", "4.2"},
  {"get_cgroup_classid", "4.3"},
  {"skb_vlan_push", "4.3"},
  {"skb_vlan_pop", "4.3"},
  {"skb_get_tunnel_key", "4.3"},
  {"skb_set_tunnel_key", "4.3"},
  {"perf_event_read", "4.3"},
  {"redirect", "4.4"},
  {"get_route_realm", "4.4"},
  {"perf_event_output", "4.4"},
  {"skb_load_bytes", "4.5"},
  {"get_stackid", "4.6"},
  {"csum_diff", "4.6"},
  {"skb_get_tunnel_opt", "4.6"},
  {"skb_set_tunnel_opt", "4.6"},
  {"skb_change_proto", "4.8"},
  {"skb_change_type", "4.8"},
  {"skb_under_cgroup", "4.8"},
  {"get_hash_recalc", "4.8"},
  {"get_current_task", "4.8"},
  {"probe_write_user", "4.8"},
  {"current_task_under_cgroup", "4.9"},
  {"skb_change_tail", "4.9"},
  {"skb_pull_data", "4.9"},
  {"csum_update", "4.9"},
  {"set_hash_invalid", "4.9"},
  {"get_numa_node_id", "4.10"},
  {"skb_change_head", "4.10"},
  {"xdp_adjust_head", "4.10"},
  {"probe_read_str", "4.11"},
  {"get_socket_cookie", "4.12"},
  {"get_socket_uid", "4.12"},
  {"set_hash", "4.13"},
  {"setsockopt", "4.13"},
  {"skb_adjust_room", "4.13"},
  {"redirect_map", "4.14"},
  {"sk_redirect_map", "4.14"},
  {"sock_map_update", "4.14"},
  {"xdp_adjust_meta", "4.15"},
  {"perf_event_read_value", "4.15"},
  {"perf_prog_read_value", "4.15"},
  {"getsockopt", "4.15"},
  {"override_return", "4.16"},
  {"sock_ops_cb_flags_set", "4.16"},
  {"msg_redirect_map", "4.17"},
  {"msg_apply_bytes", "4.17"},
  {"msg_cork_bytes", "4.17"},
  {"msg_pull_data", "4.17"},
  {"bind", "4.17"},
  {"xdp_adjust_tail", "4.18"},
  {"skb_get_xfrm_state", "4.18"},
  {"get_stack", "4.18"},
  {"skb_load_bytes_relative", "4.18"},
  {"fib_lookup", "4.18"},
  {"sock_hash_update", "4.18"},
  {"msg_redirect_hash", "4.18"},
  {"sk_redirect_hash", "4.18"},
  {"lwt_push_encap", "4.18"},
  {"lwt_seg6_store_bytes", "4.18"},
  {"lwt_seg6_adjust_srh", "4.18"},
  {"lwt_seg6_action", "4.18"},
  {"rc_repeat", "4.18"},
  {"rc_keydown", "4.18"},
  {"skb_cgroup_id", "4.18"},
  {"get_current_cgroup_id", "4.18"},
  {"get_local_storage", "4.19"},
  {"sk_select_reuseport", "4.19"},
  {"skb_ancestor_cgroup_id", "4.19"},
  {"sk_lookup_tcp", "4.20"},
  {"sk_lookup_udp", "4.20"},
  {"sk_release", "4.20"},
  {"map_push_elem", "4.20"},
  {"map_pop_elem", "4.20"},
  {"map_peek_elem", "4.20"},
  {"msg_push_data", "4.20"},
};

static uint64_t ptr_to_u64(void *ptr)
{
  return (uint64_t) (unsigned long) ptr;
}

int bpf_create_map(enum bpf_map_type map_type, const char *name,
                   int key_size, int value_size,
                   int max_entries, int map_flags)
{
  size_t name_len = name ? strlen(name) : 0;
  union bpf_attr attr;
  memset(&attr, 0, sizeof(attr));
  attr.map_type = map_type;
  attr.key_size = key_size;
  attr.value_size = value_size;
  attr.max_entries = max_entries;
  attr.map_flags = map_flags;
  if (name_len)
    memcpy(attr.map_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1));

  int ret = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));

  // BPF object names are not supported on older kernels.
  // If the create failed due to the name, clear it and retry.
  if (ret < 0 && name_len && (errno == E2BIG || errno == EINVAL)) {
    memset(attr.map_name, 0, BPF_OBJ_NAME_LEN);
    ret = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
  }

  if (ret < 0 && errno == EPERM) {
    // see the note in bpf_prog_load below about the rationale for this retry

    struct rlimit rl = {};
    if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0) {
      rl.rlim_max = RLIM_INFINITY;
      rl.rlim_cur = rl.rlim_max;
      if (setrlimit(RLIMIT_MEMLOCK, &rl) == 0)
        ret = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
    }
  }
  return ret;
}

int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags)
{
  union bpf_attr attr;
  memset(&attr, 0, sizeof(attr));
  attr.map_fd = fd;
  attr.key = ptr_to_u64(key);
  attr.value = ptr_to_u64(value);
  attr.flags = flags;

  return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}

int bpf_lookup_elem(int fd, void *key, void *value)
{
  union bpf_attr attr;
  memset(&attr, 0, sizeof(attr));
  attr.map_fd = fd;
  attr.key = ptr_to_u64(key);
  attr.value = ptr_to_u64(value);

  return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}

int bpf_delete_elem(int fd, void *key)
{
  union bpf_attr attr;
  memset(&attr, 0, sizeof(attr));
  attr.map_fd = fd;
  attr.key = ptr_to_u64(key);

  return syscall(__NR_bpf, BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
}
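
/*
 * Illustrative usage sketch, not part of the library: create a small hash
 * map and exercise the element wrappers above. The map name, sizes, and
 * the LIBBPF_USAGE_EXAMPLES guard are assumptions made for the example.
 */
#ifdef LIBBPF_USAGE_EXAMPLES
static int example_map_usage(void)
{
  uint32_t key = 1;
  uint64_t value = 42, out = 0;
  // 1024-entry hash map mapping u32 keys to u64 values
  int fd = bpf_create_map(BPF_MAP_TYPE_HASH, "example_map",
                          sizeof(key), sizeof(value), 1024, 0);
  if (fd < 0)
    return -1;
  if (bpf_update_elem(fd, &key, &value, BPF_ANY) < 0)
    return -1;
  if (bpf_lookup_elem(fd, &key, &out) < 0)
    return -1;
  if (bpf_delete_elem(fd, &key) < 0)
    return -1;
  return out == value ? 0 : -1;
}
#endif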

int bpf_get_first_key(int fd, void *key, size_t key_size)
{
  union bpf_attr attr;
  int i, res;

  memset(&attr, 0, sizeof(attr));
  attr.map_fd = fd;
  attr.key = 0;
  attr.next_key = ptr_to_u64(key);

  // Kernels 4.12 and above support passing NULL to BPF_MAP_GET_NEXT_KEY
  // to get the first key of the map. For older kernels, the call will fail.
  res = syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
  if (res < 0 && errno == EFAULT) {
    // Fall back to trying to find a non-existing key.
    static unsigned char try_values[3] = {0, 0xff, 0x55};
    attr.key = ptr_to_u64(key);
    for (i = 0; i < 3; i++) {
      memset(key, try_values[i], key_size);
      // We want to check the existence of the key but we don't know the size
      // of the map's value. So we pass an invalid pointer for value and expect
      // the call to fail with ENOENT if the key doesn't exist. If we used NULL
      // for the invalid pointer, it might trigger a page fault in the kernel
      // and affect performance. Hence we use ~0, which will fail and return
      // fast.
      // This should fail since we pass an invalid pointer for value.
      if (bpf_lookup_elem(fd, key, (void *)~0) >= 0)
        return -1;
      // This means the key doesn't exist.
      if (errno == ENOENT)
        return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
    }
    return -1;
  } else {
    return res;
  }
}

int bpf_get_next_key(int fd, void *key, void *next_key)
{
  union bpf_attr attr;
  memset(&attr, 0, sizeof(attr));
  attr.map_fd = fd;
  attr.key = ptr_to_u64(key);
  attr.next_key = ptr_to_u64(next_key);

  return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
}
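
/*
 * Illustrative sketch, not part of the library: iterate over every key of
 * a map with u32 keys using bpf_get_first_key()/bpf_get_next_key(). The
 * LIBBPF_USAGE_EXAMPLES guard is an assumption made for the example.
 */
#ifdef LIBBPF_USAGE_EXAMPLES
static void example_walk_map(int fd)
{
  uint32_t key, next_key;

  if (bpf_get_first_key(fd, &key, sizeof(key)) < 0)
    return;  // empty map or error
  for (;;) {
    // ... process `key` here ...
    if (bpf_get_next_key(fd, &key, &next_key) < 0)
      break;  // no more keys
    key = next_key;
  }
}
#endif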

static void bpf_print_hints(int ret, char *log)
{
  if (ret < 0)
    fprintf(stderr, "bpf: Failed to load program: %s\n", strerror(errno));
  if (log == NULL)
    return;
  fprintf(stderr, "%s\n", log);

  if (ret >= 0)
    return;

  // The following error strings will need maintenance to match the kernel
  // verifier.

  // stack busting
  if (strstr(log, "invalid stack off=-") != NULL) {
    fprintf(stderr, "HINT: Looks like you exceeded the BPF stack limit. "
      "This can happen if you allocate too much local variable storage. "
      "For example, if you allocated a 1 Kbyte struct (maybe for "
      "BPF_PERF_OUTPUT), busting a max stack of 512 bytes.\n\n");
  }

  // didn't check NULL on map lookup
  if (strstr(log, "invalid mem access 'map_value_or_null'") != NULL) {
    fprintf(stderr, "HINT: The 'map_value_or_null' error can happen if "
      "you dereference a pointer value from a map lookup without first "
      "checking if that pointer is NULL.\n\n");
  }

  // lacking a bpf_probe_read
  if (strstr(log, "invalid mem access 'inv'") != NULL) {
    fprintf(stderr, "HINT: The invalid mem access 'inv' error can happen "
      "if you try to dereference memory without first using "
      "bpf_probe_read() to copy it to the BPF stack. Sometimes the "
      "bpf_probe_read is automatic by the bcc rewriter, other times "
      "you'll need to be explicit.\n\n");
  }

  // helper function not found in kernel
  char *helper_str = strstr(log, "invalid func ");
  if (helper_str != NULL) {
    helper_str += strlen("invalid func ");
    char *str = strchr(helper_str, '#');
    if (str != NULL) {
      helper_str = str + 1;
    }
    // Helper ids are 1-based; entry i of helpers[] describes helper id i + 1.
    unsigned int helper_id = atoi(helper_str);
    if (helper_id && helper_id <= sizeof(helpers) / sizeof(struct bpf_helper)) {
      struct bpf_helper helper = helpers[helper_id - 1];
      fprintf(stderr, "HINT: bpf_%s missing (added in Linux %s).\n\n",
              helper.name, helper.required_version);
    }
  }
}
#define ROUND_UP(x, n) (((x) + (n) - 1u) & ~((n) - 1u))

int bpf_obj_get_info(int prog_map_fd, void *info, uint32_t *info_len)
{
  union bpf_attr attr;
  int err;

  memset(&attr, 0, sizeof(attr));
  attr.info.bpf_fd = prog_map_fd;
  attr.info.info_len = *info_len;
  attr.info.info = ptr_to_u64(info);

  err = syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
  if (!err)
    *info_len = attr.info.info_len;

  return err;
}
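
/*
 * Illustrative sketch, not part of the library: query the kernel's info
 * record for a program fd. struct bpf_prog_info requires 4.13+ kernel
 * headers; the guard macro is an assumption made for the example.
 */
#ifdef LIBBPF_USAGE_EXAMPLES
static void example_print_prog_info(int prog_fd)
{
  struct bpf_prog_info info = {};
  uint32_t info_len = sizeof(info);

  // On success the kernel reports how many bytes of info it filled in.
  if (bpf_obj_get_info(prog_fd, &info, &info_len) == 0)
    printf("prog type %u id %u (%u info bytes)\n", info.type, info.id,
           info_len);
}
#endif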

int bpf_prog_compute_tag(const struct bpf_insn *insns, int prog_len,
                         unsigned long long *ptag)
{
  // Hash the instructions with the kernel crypto API (AF_ALG) so no
  // user-space SHA1 implementation is needed.
  struct sockaddr_alg alg = {
    .salg_family    = AF_ALG,
    .salg_type      = "hash",
    .salg_name      = "sha1",
  };
  int shafd = socket(AF_ALG, SOCK_SEQPACKET | SOCK_CLOEXEC, 0);
  if (shafd < 0) {
    fprintf(stderr, "sha1 socket not available %s\n", strerror(errno));
    return -1;
  }
  int ret = bind(shafd, (struct sockaddr *)&alg, sizeof(alg));
  if (ret < 0) {
    fprintf(stderr, "sha1 bind fail %s\n", strerror(errno));
    close(shafd);
    return ret;
  }
  int shafd2 = accept4(shafd, NULL, 0, SOCK_CLOEXEC);
  if (shafd2 < 0) {
    fprintf(stderr, "sha1 accept fail %s\n", strerror(errno));
    close(shafd);
    return -1;
  }
  struct bpf_insn prog[prog_len / 8];  /* 8 == sizeof(struct bpf_insn) */
  bool map_ld_seen = false;
  int i;
  for (i = 0; i < prog_len / 8; i++) {
    prog[i] = insns[i];
    // Map fds are load-time values; zero the imm fields of the
    // two-instruction ld_imm64 map-fd pseudo insn so the tag matches
    // what the kernel computes.
    if (insns[i].code == (BPF_LD | BPF_DW | BPF_IMM) &&
        insns[i].src_reg == BPF_PSEUDO_MAP_FD &&
        !map_ld_seen) {
      prog[i].imm = 0;
      map_ld_seen = true;
    } else if (insns[i].code == 0 && map_ld_seen) {
      prog[i].imm = 0;
      map_ld_seen = false;
    } else {
      map_ld_seen = false;
    }
  }
  ret = write(shafd2, prog, prog_len);
  if (ret != prog_len) {
    fprintf(stderr, "sha1 write fail %s\n", strerror(errno));
    close(shafd2);
    close(shafd);
    return -1;
  }

  union {
    unsigned char sha[20];
    unsigned long long tag;
  } u = {};
  ret = read(shafd2, u.sha, 20);
  if (ret != 20) {
    fprintf(stderr, "sha1 read fail %s\n", strerror(errno));
    close(shafd2);
    close(shafd);
    return -1;
  }
  // The tag is the first 8 bytes of the SHA1, byte-swapped so that printing
  // it with %llx matches the kernel's fdinfo prog_tag.
  *ptag = __builtin_bswap64(u.tag);
  close(shafd2);
  close(shafd);
  return 0;
}

int bpf_prog_get_tag(int fd, unsigned long long *ptag)
{
  char fmt[64];
  snprintf(fmt, sizeof(fmt), "/proc/self/fdinfo/%d", fd);
  FILE *f = fopen(fmt, "re");
  if (!f) {
/*    fprintf(stderr, "failed to open fdinfo %s\n", strerror(errno));*/
    return -1;
  }
  // The prog_tag line is the sixth line of fdinfo for a BPF program fd:
  // pos, flags, mnt_id, prog_type, prog_jited, prog_tag.
  int i;
  for (i = 0; i < 6; i++) {
    if (!fgets(fmt, sizeof(fmt), f)) {
      fclose(f);
      return -2;
    }
  }
  fclose(f);
  char *p = strchr(fmt, ':');
  if (!p) {
/*    fprintf(stderr, "broken fdinfo %s\n", fmt);*/
    return -2;
  }
  unsigned long long tag = 0;
  sscanf(p + 1, "%llx", &tag);
  *ptag = tag;
  return 0;
}
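
/*
 * Illustrative sketch, not part of the library: the two tag helpers above
 * should agree for a loaded program. bpf_prog_compute_tag() hashes the
 * instructions in user space; bpf_prog_get_tag() reads the kernel-computed
 * tag from fdinfo. The guard macro is an assumption made for the example.
 */
#ifdef LIBBPF_USAGE_EXAMPLES
static int example_check_tag(int prog_fd, const struct bpf_insn *insns,
                             int prog_len)
{
  unsigned long long computed = 0, loaded = 0;

  if (bpf_prog_compute_tag(insns, prog_len, &computed) < 0)
    return -1;
  if (bpf_prog_get_tag(prog_fd, &loaded) < 0)
    return -1;
  return computed == loaded ? 0 : -1;
}
#endif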

int bpf_prog_load(enum bpf_prog_type prog_type, const char *name,
                  const struct bpf_insn *insns, int prog_len,
                  const char *license, unsigned kern_version,
                  int log_level, char *log_buf, unsigned log_buf_size)
{
  size_t name_len = name ? strlen(name) : 0;
  union bpf_attr attr;
  char *tmp_log_buf = NULL;
  unsigned tmp_log_buf_size = 0;
  int ret = 0, name_offset = 0;

  memset(&attr, 0, sizeof(attr));

  attr.prog_type = prog_type;
  attr.kern_version = kern_version;
  attr.license = ptr_to_u64((void *)license);

  attr.insns = ptr_to_u64((void *)insns);
  attr.insn_cnt = prog_len / sizeof(struct bpf_insn);
  if (attr.insn_cnt > BPF_MAXINSNS) {
    errno = EINVAL;
    fprintf(stderr,
            "bpf: %s. Program %s too large (%u insns), at most %d insns\n\n",
            strerror(errno), name, attr.insn_cnt, BPF_MAXINSNS);
    return -1;
  }

  attr.log_level = log_level;
  if (attr.log_level > 0) {
    if (log_buf_size > 0) {
      // Use the user-provided log buffer if available.
      log_buf[0] = 0;
      attr.log_buf = ptr_to_u64(log_buf);
      attr.log_size = log_buf_size;
    } else {
      // Create and use a temporary log buffer if the user didn't provide one.
      tmp_log_buf_size = LOG_BUF_SIZE;
      tmp_log_buf = malloc(tmp_log_buf_size);
      if (!tmp_log_buf) {
        fprintf(stderr, "bpf: Failed to allocate temporary log buffer: %s\n\n",
                strerror(errno));
        attr.log_level = 0;
      } else {
        tmp_log_buf[0] = 0;
        attr.log_buf = ptr_to_u64(tmp_log_buf);
        attr.log_size = tmp_log_buf_size;
      }
    }
  }

  if (name_len) {
    if (strncmp(name, "kprobe__", 8) == 0)
      name_offset = 8;
    else if (strncmp(name, "tracepoint__", 12) == 0)
      name_offset = 12;
    else if (strncmp(name, "raw_tracepoint__", 16) == 0)
      name_offset = 16;
    memcpy(attr.prog_name, name + name_offset,
           min(name_len - name_offset, BPF_OBJ_NAME_LEN - 1));
  }

  ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
  // BPF object names are not supported on older kernels.
  // If we failed due to this, clear the name and try again.
  if (ret < 0 && name_len && (errno == E2BIG || errno == EINVAL)) {
    memset(attr.prog_name, 0, BPF_OBJ_NAME_LEN);
    ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
  }

  if (ret < 0 && errno == EPERM) {
    // When EPERM is returned, two reasons are possible:
    //  1. user has no permissions for bpf()
    //  2. user has insufficient rlimit for locked memory
    // Unfortunately, there is no api to inspect the current usage of locked
    // mem for the user, so an accurate calculation of how much memory to lock
    // for this new program is difficult. As a workaround, bump the limit to
    // unlimited. If the program load fails again, return the error.
    struct rlimit rl = {};
    if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0) {
      rl.rlim_max = RLIM_INFINITY;
      rl.rlim_cur = rl.rlim_max;
      if (setrlimit(RLIMIT_MEMLOCK, &rl) == 0)
        ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
    }
  }

  // The load has failed. Handle the log message.
  if (ret < 0) {
    // User has provided a log buffer.
    if (log_buf_size) {
      // If logging is not already enabled, enable it and do the syscall again.
      if (attr.log_level == 0) {
        attr.log_level = 1;
        attr.log_buf = ptr_to_u64(log_buf);
        attr.log_size = log_buf_size;
        ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
      }
      // Print the log message and return.
      bpf_print_hints(ret, log_buf);
      if (errno == ENOSPC)
        fprintf(stderr, "bpf: log_buf size may be insufficient\n");
      goto return_result;
    }

    // User did not provide a log buffer. We will try to increase the size of
    // our temporary log buffer to get the full error message.
    if (tmp_log_buf)
      free(tmp_log_buf);
    tmp_log_buf_size = LOG_BUF_SIZE;
    if (attr.log_level == 0)
      attr.log_level = 1;
    for (;;) {
      tmp_log_buf = malloc(tmp_log_buf_size);
      if (!tmp_log_buf) {
        fprintf(stderr, "bpf: Failed to allocate temporary log buffer: %s\n\n",
                strerror(errno));
        goto return_result;
      }
      tmp_log_buf[0] = 0;
      attr.log_buf = ptr_to_u64(tmp_log_buf);
      attr.log_size = tmp_log_buf_size;

      ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
      if (ret < 0 && errno == ENOSPC) {
        // Temporary buffer size is not enough. Double it and try again.
        free(tmp_log_buf);
        tmp_log_buf = NULL;
        tmp_log_buf_size <<= 1;
      } else {
        break;
      }
    }
  }

  // Print the log message if log_level is not 0, either because the user
  // specified it or because we set it due to an error.
  if (attr.log_level > 0) {
    // Don't print if the user enabled logging and provided a log buffer,
    // but there is no error.
    if (log_buf && ret < 0)
      bpf_print_hints(ret, log_buf);
    else if (tmp_log_buf)
      bpf_print_hints(ret, tmp_log_buf);
  }

return_result:
  if (tmp_log_buf)
    free(tmp_log_buf);
  return ret;
}
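
/*
 * Illustrative sketch, not part of the library: load a minimal socket
 * filter that just returns 0. The instructions are written with designated
 * initializers to avoid depending on the kernel samples' macro headers;
 * the guard macro is an assumption made for the example.
 */
#ifdef LIBBPF_USAGE_EXAMPLES
static int example_load_trivial_prog(void)
{
  struct bpf_insn prog[] = {
    { .code = BPF_ALU64 | BPF_MOV | BPF_K,        // r0 = 0
      .dst_reg = BPF_REG_0, .imm = 0 },
    { .code = BPF_JMP | BPF_EXIT },               // return r0
  };
  char log[4096];

  return bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "example_prog",
                       prog, sizeof(prog), "GPL", 0 /* kern_version */,
                       0 /* log_level */, log, sizeof(log));
}
#endif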

int bpf_open_raw_sock(const char *name)
{
  struct sockaddr_ll sll;
  int sock;

  sock = socket(PF_PACKET, SOCK_RAW | SOCK_NONBLOCK | SOCK_CLOEXEC, htons(ETH_P_ALL));
  if (sock < 0) {
    fprintf(stderr, "cannot create raw socket\n");
    return -1;
  }

  /* Do not bind on empty interface names */
  if (!name || *name == '\0')
    return sock;

  memset(&sll, 0, sizeof(sll));
  sll.sll_family = AF_PACKET;
  sll.sll_ifindex = if_nametoindex(name);
  if (sll.sll_ifindex == 0) {
    fprintf(stderr, "bpf: Resolving device name to index: %s\n", strerror(errno));
    close(sock);
    return -1;
  }
  sll.sll_protocol = htons(ETH_P_ALL);
  if (bind(sock, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
    fprintf(stderr, "bind to %s: %s\n", name, strerror(errno));
    close(sock);
    return -1;
  }

  return sock;
}

int bpf_attach_socket(int sock, int prog) {
  return setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog, sizeof(prog));
}
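
/*
 * Illustrative sketch, not part of the library: pair bpf_open_raw_sock()
 * with bpf_attach_socket() to run a socket filter on one interface. The
 * "lo" device and the guard macro are assumptions made for the example.
 */
#ifdef LIBBPF_USAGE_EXAMPLES
static int example_filter_lo(int prog_fd)
{
  int sock = bpf_open_raw_sock("lo");

  if (sock < 0)
    return -1;
  if (bpf_attach_socket(sock, prog_fd) < 0) {
    close(sock);
    return -1;
  }
  return sock;  // caller keeps the socket open while filtering
}
#endif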

#define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type"
static int bpf_find_probe_type(const char *event_type)
{
  int fd;
  int ret;
  char buf[PATH_MAX];

  ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type);
  if (ret < 0 || ret >= (int)sizeof(buf))
    return -1;

  fd = open(buf, O_RDONLY | O_CLOEXEC);
  if (fd < 0)
    return -1;
  ret = read(fd, buf, sizeof(buf));
  close(fd);
  if (ret < 0 || ret >= (int)sizeof(buf))
    return -1;
  buf[ret] = '\0';
  errno = 0;
  ret = (int)strtol(buf, NULL, 10);
  return errno ? -1 : ret;
}

#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe"
static int bpf_get_retprobe_bit(const char *event_type)
{
  int fd;
  int ret;
  char buf[PATH_MAX];

  ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type);
  if (ret < 0 || ret >= (int)sizeof(buf))
    return -1;

  fd = open(buf, O_RDONLY | O_CLOEXEC);
  if (fd < 0)
    return -1;
  ret = read(fd, buf, sizeof(buf));
  close(fd);
  if (ret < 0 || ret >= (int)sizeof(buf))
    return -1;
  buf[ret] = '\0';
  // The file contains a string like "config:<bit>".
  if (strlen(buf) < strlen("config:"))
    return -1;
  errno = 0;
  ret = (int)strtol(buf + strlen("config:"), NULL, 10);
  return errno ? -1 : ret;
}

/*
 * The new kernel API allows creating a [k,u]probe with perf_event_open,
 * which makes it easier to clean up the [k,u]probe. This function tries to
 * create the pfd with the new API.
 */
static int bpf_try_perf_event_open_with_probe(const char *name, uint64_t offs,
             int pid, char *event_type, int is_return)
{
  struct perf_event_attr attr = {};
  int type = bpf_find_probe_type(event_type);
  int is_return_bit = bpf_get_retprobe_bit(event_type);
  int cpu = 0;

  if (type < 0 || is_return_bit < 0)
    return -1;
  attr.sample_period = 1;
  attr.wakeup_events = 1;
  if (is_return)
    attr.config |= 1 << is_return_bit;

  /*
   * struct perf_event_attr in the latest perf_event.h has the following
   * extension to config1 and config2. To keep bcc compatible with
   * older perf_event.h, we use config1 and config2 here instead of
   * kprobe_func, uprobe_path, kprobe_addr, and probe_offset.
   *
   * union {
   *  __u64 bp_addr;
   *  __u64 kprobe_func;
   *  __u64 uprobe_path;
   *  __u64 config1;
   * };
   * union {
   *   __u64 bp_len;
   *   __u64 kprobe_addr;
   *   __u64 probe_offset;
   *   __u64 config2;
   * };
   */
  attr.config2 = offs;  /* config2 here is kprobe_addr or probe_offset */
  attr.size = sizeof(attr);
  attr.type = type;
  /* config1 here is kprobe_func or uprobe_path */
  attr.config1 = ptr_to_u64((void *)name);
  // PID filter is only possible for uprobe events.
  if (pid < 0)
    pid = -1;
  // The perf_event_open API doesn't allow both pid and cpu to be -1,
  // so only set cpu to -1 when PID is not -1.
  // Tracing events do not do CPU filtering in any case.
  if (pid != -1)
    cpu = -1;
  return syscall(__NR_perf_event_open, &attr, pid, cpu, -1 /* group_fd */,
                 PERF_FLAG_FD_CLOEXEC);
}

// When a valid Perf Event FD is provided through pfd, it will be used to
// enable and attach the BPF program to the event, and event_path will be
// ignored.
// Otherwise, event_path is expected to contain the path to the event in
// debugfs and it will be used to open the Perf Event FD.
// In either case, if the attach partially failed (such as an issue with the
// ioctl operations), the **caller** needs to clean up the Perf Event FD,
// whether provided by the caller or opened here.
static int bpf_attach_tracing_event(int progfd, const char *event_path, int pid,
                                    int *pfd)
{
  int efd, cpu = 0;
  ssize_t bytes;
  char buf[PATH_MAX];
  struct perf_event_attr attr = {};
  // Caller did not provide a valid Perf Event FD. Create one with the debugfs
  // event path provided.
  if (*pfd < 0) {
    snprintf(buf, sizeof(buf), "%s/id", event_path);
    efd = open(buf, O_RDONLY | O_CLOEXEC, 0);
    if (efd < 0) {
      fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
      return -1;
    }

    bytes = read(efd, buf, sizeof(buf));
    if (bytes <= 0 || bytes >= (int)sizeof(buf)) {
      fprintf(stderr, "read(%s): %s\n", buf, strerror(errno));
      close(efd);
      return -1;
    }
    close(efd);
    buf[bytes] = '\0';
    attr.config = strtol(buf, NULL, 0);
    attr.type = PERF_TYPE_TRACEPOINT;
    attr.sample_period = 1;
    attr.wakeup_events = 1;
    // PID filter is only possible for uprobe events.
    if (pid < 0)
      pid = -1;
    // The perf_event_open API doesn't allow both pid and cpu to be -1,
    // so only set cpu to -1 when PID is not -1.
    // Tracing events do not do CPU filtering in any case.
    if (pid != -1)
      cpu = -1;
    *pfd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC);
    if (*pfd < 0) {
      fprintf(stderr, "perf_event_open(%s/id): %s\n", event_path, strerror(errno));
      return -1;
    }
  }

  if (ioctl(*pfd, PERF_EVENT_IOC_SET_BPF, progfd) < 0) {
    perror("ioctl(PERF_EVENT_IOC_SET_BPF)");
    return -1;
  }
  if (ioctl(*pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE)");
    return -1;
  }

  return 0;
}

int bpf_attach_kprobe(int progfd, enum bpf_probe_attach_type attach_type,
                      const char *ev_name, const char *fn_name, uint64_t fn_offset)
{
  int kfd, pfd = -1;
  char buf[256];
  char event_alias[128];
  static char *event_type = "kprobe";

  // Try to create the kprobe Perf Event with the perf_event_open API.
  pfd = bpf_try_perf_event_open_with_probe(fn_name, fn_offset, -1, event_type,
                                           attach_type != BPF_PROBE_ENTRY);
  // If that failed, the kernel most likely doesn't support the new
  // perf_event_open API yet, so try to create the event using debugfs.
  if (pfd < 0) {
    snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
    kfd = open(buf, O_WRONLY | O_APPEND | O_CLOEXEC, 0);
    if (kfd < 0) {
      fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
      goto error;
    }

    snprintf(event_alias, sizeof(event_alias), "%s_bcc_%d", ev_name, getpid());

    if (fn_offset > 0 && attach_type == BPF_PROBE_ENTRY)
      snprintf(buf, sizeof(buf), "p:%ss/%s %s+%"PRIu64,
               event_type, event_alias, fn_name, fn_offset);
    else
      snprintf(buf, sizeof(buf), "%c:%ss/%s %s",
               attach_type == BPF_PROBE_ENTRY ? 'p' : 'r',
               event_type, event_alias, fn_name);

    if (write(kfd, buf, strlen(buf)) < 0) {
      if (errno == ENOENT)
         fprintf(stderr, "cannot attach kprobe, probe entry may not exist\n");
      else
         fprintf(stderr, "cannot attach kprobe, %s\n", strerror(errno));
      close(kfd);
      goto error;
    }
    close(kfd);
    snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type, event_alias);
  }
  // If perf_event_open succeeded, bpf_attach_tracing_event will use the
  // created Perf Event FD directly and buf will be empty and unused.
  // Otherwise it will read the event ID from the path in buf, create the
  // Perf Event using that ID, and update the value of pfd.
  if (bpf_attach_tracing_event(progfd, buf, -1 /* PID */, &pfd) == 0)
    return pfd;

error:
  bpf_close_perf_event_fd(pfd);
  return -1;
}
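
/*
 * Illustrative sketch, not part of the library: attach a loaded kprobe
 * program to a kernel function entry. The event name only labels the
 * debugfs fallback event; "do_sys_open" and the guard macro are
 * assumptions made for the example.
 */
#ifdef LIBBPF_USAGE_EXAMPLES
static int example_attach_kprobe(int prog_fd)
{
  return bpf_attach_kprobe(prog_fd, BPF_PROBE_ENTRY, "p_do_sys_open",
                           "do_sys_open", 0 /* fn_offset */);
}
#endif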

static int enter_mount_ns(int pid) {
  struct stat self_stat, target_stat;
  int self_fd = -1, target_fd = -1;
  char buf[64];

  if (pid < 0)
    return -1;

  if ((size_t)snprintf(buf, sizeof(buf), "/proc/%d/ns/mnt", pid) >= sizeof(buf))
    return -1;

  self_fd = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC);
  if (self_fd < 0) {
    perror("open(/proc/self/ns/mnt)");
    return -1;
  }

  target_fd = open(buf, O_RDONLY | O_CLOEXEC);
  if (target_fd < 0) {
    perror("open(/proc/<pid>/ns/mnt)");
    goto error;
  }

  if (fstat(self_fd, &self_stat)) {
    perror("fstat(self_fd)");
    goto error;
  }

  if (fstat(target_fd, &target_stat)) {
    perror("fstat(target_fd)");
    goto error;
  }

  // The target and current ns are the same; skip the setns and close all fds.
  if (self_stat.st_ino == target_stat.st_ino)
    goto error;

  if (setns(target_fd, CLONE_NEWNS)) {
    perror("setns(target)");
    goto error;
  }

  close(target_fd);
  return self_fd;

error:
  if (self_fd >= 0)
    close(self_fd);
  if (target_fd >= 0)
    close(target_fd);
  return -1;
}

static void exit_mount_ns(int fd) {
  if (fd < 0)
    return;

  if (setns(fd, CLONE_NEWNS))
    perror("setns");
  close(fd);
}

int bpf_attach_uprobe(int progfd, enum bpf_probe_attach_type attach_type,
                      const char *ev_name, const char *binary_path,
                      uint64_t offset, pid_t pid)
{
  char buf[PATH_MAX];
  char event_alias[PATH_MAX];
  static char *event_type = "uprobe";
  int res, kfd = -1, pfd = -1, ns_fd = -1;
  // Try to create the uprobe Perf Event with the perf_event_open API.
  pfd = bpf_try_perf_event_open_with_probe(binary_path, offset, pid, event_type,
                                           attach_type != BPF_PROBE_ENTRY);
  // If that failed, the kernel most likely doesn't support the new
  // perf_event_open API yet, so try to create the event using debugfs.
  if (pfd < 0) {
    snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
    kfd = open(buf, O_WRONLY | O_APPEND | O_CLOEXEC, 0);
    if (kfd < 0) {
      fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
      goto error;
    }

    res = snprintf(event_alias, sizeof(event_alias), "%s_bcc_%d", ev_name, getpid());
    if (res < 0 || res >= (int)sizeof(event_alias)) {
      fprintf(stderr, "Event name (%s) is too long for buffer\n", ev_name);
      goto error;
    }
    res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx", attach_type==BPF_PROBE_ENTRY ? 'p' : 'r',
                   event_type, event_alias, binary_path, (unsigned long)offset);
    if (res < 0 || res >= (int)sizeof(buf)) {
      fprintf(stderr, "Event alias (%s) too long for buffer\n", event_alias);
      goto error;
    }

    ns_fd = enter_mount_ns(pid);
    if (write(kfd, buf, strlen(buf)) < 0) {
      if (errno == EINVAL)
        fprintf(stderr, "check dmesg output for possible cause\n");
      goto error;
    }
    close(kfd);
    kfd = -1;
    exit_mount_ns(ns_fd);
    ns_fd = -1;

    snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type, event_alias);
  }
  // If perf_event_open succeeded, bpf_attach_tracing_event will use the
  // created Perf Event FD directly and buf will be empty and unused.
  // Otherwise it will read the event ID from the path in buf, create the
  // Perf Event using that ID, and update the value of pfd.
  if (bpf_attach_tracing_event(progfd, buf, pid, &pfd) == 0)
    return pfd;

error:
  if (kfd >= 0)
    close(kfd);
  exit_mount_ns(ns_fd);
  bpf_close_perf_event_fd(pfd);
  return -1;
}
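
/*
 * Illustrative sketch, not part of the library: attach a loaded program to
 * a user-space function. A real caller must first resolve the target
 * symbol's ELF offset in the binary; the path and guard macro are
 * assumptions made for the example.
 */
#ifdef LIBBPF_USAGE_EXAMPLES
static int example_attach_uprobe(int prog_fd, uint64_t sym_offset)
{
  // any process (-1); entry probe on /bin/bash at sym_offset
  return bpf_attach_uprobe(prog_fd, BPF_PROBE_ENTRY, "p_bash_example",
                           "/bin/bash", sym_offset, -1);
}
#endif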

static int bpf_detach_probe(const char *ev_name, const char *event_type)
{
  int kfd = -1, res;
  char buf[PATH_MAX];
  int found_event = 0;
  size_t bufsize = 0;
  char *cptr = NULL;
  FILE *fp;

  /*
   * For a [k,u]probe created with perf_event_open (on newer kernels), it is
   * not necessary to clean it up in [k,u]probe_events. We first look up
   * the %s_bcc_%d line in [k,u]probe_events. If the event is not found,
   * it is safe to skip the cleanup process (writing -:... to the file).
   */
  snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
  fp = fopen(buf, "re");
  if (!fp) {
    fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
    goto error;
  }

  res = snprintf(buf, sizeof(buf), "%ss/%s_bcc_%d", event_type, ev_name, getpid());
  if (res < 0 || res >= (int)sizeof(buf)) {
    fprintf(stderr, "snprintf(%s): %d\n", ev_name, res);
    goto error;
  }

  while (getline(&cptr, &bufsize, fp) != -1)
    if (strstr(cptr, buf) != NULL) {
      found_event = 1;
      break;
    }
  free(cptr);
  fclose(fp);
  fp = NULL;

  if (!found_event)
    return 0;

  snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
  kfd = open(buf, O_WRONLY | O_APPEND | O_CLOEXEC, 0);
  if (kfd < 0) {
    fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
    goto error;
  }

  res = snprintf(buf, sizeof(buf), "-:%ss/%s_bcc_%d", event_type, ev_name, getpid());
  if (res < 0 || res >= (int)sizeof(buf)) {
    fprintf(stderr, "snprintf(%s): %d\n", ev_name, res);
    goto error;
  }
  if (write(kfd, buf, strlen(buf)) < 0) {
    fprintf(stderr, "write(%s): %s\n", buf, strerror(errno));
    goto error;
  }

  close(kfd);
  return 0;

error:
  if (kfd >= 0)
    close(kfd);
  if (fp)
    fclose(fp);
  return -1;
}

int bpf_detach_kprobe(const char *ev_name)
{
  return bpf_detach_probe(ev_name, "kprobe");
}

int bpf_detach_uprobe(const char *ev_name)
{
  return bpf_detach_probe(ev_name, "uprobe");
}


int bpf_attach_tracepoint(int progfd, const char *tp_category,
                          const char *tp_name)
{
  char buf[256];
  int pfd = -1;

  snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%s/%s",
           tp_category, tp_name);
  if (bpf_attach_tracing_event(progfd, buf, -1 /* PID */, &pfd) == 0)
    return pfd;

  bpf_close_perf_event_fd(pfd);
  return -1;
}

int bpf_detach_tracepoint(const char *tp_category, const char *tp_name) {
  (void)tp_category;
  (void)tp_name;
  // Right now, there is nothing to do, but it's a good idea to encourage
  // callers to detach anything they attach.
  return 0;
}

int bpf_attach_raw_tracepoint(int progfd, char *tp_name)
{
  union bpf_attr attr;
  int ret;

  bzero(&attr, sizeof(attr));
  attr.raw_tracepoint.name = ptr_to_u64(tp_name);
  attr.raw_tracepoint.prog_fd = progfd;

  ret = syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
  if (ret < 0)
    fprintf(stderr, "bpf_attach_raw_tracepoint (%s): %s\n", tp_name, strerror(errno));
  return ret;
}
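
/*
 * Illustrative sketch, not part of the library: attach a raw tracepoint
 * program (BPF_RAW_TRACEPOINT_OPEN, kernel 4.17+). "sched_switch" and the
 * guard macro are assumptions made for the example.
 */
#ifdef LIBBPF_USAGE_EXAMPLES
static int example_attach_raw_tp(int prog_fd)
{
  static char tp_name[] = "sched_switch";

  return bpf_attach_raw_tracepoint(prog_fd, tp_name);
}
#endif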

void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb,
                            perf_reader_lost_cb lost_cb, void *cb_cookie,
                            int pid, int cpu, int page_cnt) {
  int pfd;
  struct perf_event_attr attr = {};
  struct perf_reader *reader = NULL;

  reader = perf_reader_new(raw_cb, lost_cb, cb_cookie, page_cnt);
  if (!reader)
    goto error;

  // 10 == PERF_COUNT_SW_BPF_OUTPUT, spelled numerically so this builds
  // against older perf_event.h headers that lack the enum value.
  attr.config = 10;
  attr.type = PERF_TYPE_SOFTWARE;
  attr.sample_type = PERF_SAMPLE_RAW;
  attr.sample_period = 1;
  attr.wakeup_events = 1;
  pfd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1, PERF_FLAG_FD_CLOEXEC);
  if (pfd < 0) {
    fprintf(stderr, "perf_event_open: %s\n", strerror(errno));
    fprintf(stderr, "   (check your kernel for PERF_COUNT_SW_BPF_OUTPUT support, 4.4 or newer)\n");
    goto error;
  }
  perf_reader_set_fd(reader, pfd);

  if (perf_reader_mmap(reader) < 0)
    goto error;

  if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE)");
    goto error;
  }

  return reader;

error:
  if (reader)
    perf_reader_free(reader);

  return NULL;
}
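
/*
 * Illustrative sketch, not part of the library: open a perf buffer for one
 * CPU and feed BPF_PERF_OUTPUT events into a callback. The callback shape
 * follows perf_reader.h; the returned reader would typically be polled
 * with perf_reader_poll(). The guard macro is an assumption made for the
 * example.
 */
#ifdef LIBBPF_USAGE_EXAMPLES
static void example_raw_cb(void *cb_cookie, void *raw, int raw_size)
{
  printf("event: %d bytes\n", raw_size);
}

static void *example_open_buffer(void)
{
  // any PID (-1), CPU 0, 8 pages of ring buffer
  return bpf_open_perf_buffer(example_raw_cb, NULL /* lost_cb */,
                              NULL /* cb_cookie */, -1, 0, 8);
}
#endif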

static int invalid_perf_config(uint32_t type, uint64_t config) {
  switch (type) {
  case PERF_TYPE_HARDWARE:
    if (config >= PERF_COUNT_HW_MAX) {
      fprintf(stderr, "HARDWARE perf event config out of range\n");
      goto is_invalid;
    }
    return 0;
  case PERF_TYPE_SOFTWARE:
    if (config >= PERF_COUNT_SW_MAX) {
      fprintf(stderr, "SOFTWARE perf event config out of range\n");
      goto is_invalid;
    } else if (config == 10 /* PERF_COUNT_SW_BPF_OUTPUT */) {
      fprintf(stderr, "Unable to open or attach perf event for BPF_OUTPUT\n");
      goto is_invalid;
    }
    return 0;
  case PERF_TYPE_HW_CACHE:
    // config encodes cache_id | (op_id << 8) | (result_id << 16)
    if (((config >> 16) >= PERF_COUNT_HW_CACHE_RESULT_MAX) ||
        (((config >> 8) & 0xff) >= PERF_COUNT_HW_CACHE_OP_MAX) ||
        ((config & 0xff) >= PERF_COUNT_HW_CACHE_MAX)) {
      fprintf(stderr, "HW_CACHE perf event config out of range\n");
      goto is_invalid;
    }
    return 0;
  case PERF_TYPE_TRACEPOINT:
  case PERF_TYPE_BREAKPOINT:
    fprintf(stderr,
            "Unable to open or attach TRACEPOINT or BREAKPOINT events\n");
    goto is_invalid;
  default:
    return 0;
  }
is_invalid:
  fprintf(stderr, "Invalid perf event type %" PRIu32 " config %" PRIu64 "\n",
          type, config);
  return 1;
}

int bpf_open_perf_event(uint32_t type, uint64_t config, int pid, int cpu) {
  int fd;
  struct perf_event_attr attr = {};

  if (invalid_perf_config(type, config)) {
    return -1;
  }

  attr.sample_period = LONG_MAX;
  attr.type = type;
  attr.config = config;

  fd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1, PERF_FLAG_FD_CLOEXEC);
  if (fd < 0) {
    fprintf(stderr, "perf_event_open: %s\n", strerror(errno));
    return -1;
  }

  if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE)");
    close(fd);
    return -1;
  }

  return fd;
}
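
/*
 * Illustrative sketch, not part of the library: open a counting perf event
 * whose value a BPF program can read with bpf_perf_event_read(). The
 * task-clock choice and the guard macro are assumptions made for the
 * example.
 */
#ifdef LIBBPF_USAGE_EXAMPLES
static int example_open_task_clock(int cpu)
{
  return bpf_open_perf_event(PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK,
                             -1 /* pid: all */, cpu);
}
#endif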

int bpf_attach_xdp(const char *dev_name, int progfd, uint32_t flags) {
    struct sockaddr_nl sa;
    int sock, seq = 0, len, ret = -1;
    char buf[4096];
    struct nlattr *nla, *nla_xdp;
    struct {
        struct nlmsghdr  nh;
        struct ifinfomsg ifinfo;
        char             attrbuf[64];
    } req;
    struct nlmsghdr *nh;
    struct nlmsgerr *err;
    socklen_t addrlen;

    memset(&sa, 0, sizeof(sa));
    sa.nl_family = AF_NETLINK;

    sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
    if (sock < 0) {
        fprintf(stderr, "bpf: opening a netlink socket: %s\n", strerror(errno));
        return -1;
    }

    if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
        fprintf(stderr, "bpf: bind to netlink: %s\n", strerror(errno));
        goto cleanup;
    }

    addrlen = sizeof(sa);
    if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) {
        fprintf(stderr, "bpf: get sock name of netlink: %s\n", strerror(errno));
        goto cleanup;
    }

    if (addrlen != sizeof(sa)) {
        fprintf(stderr, "bpf: wrong netlink address length: %d\n", addrlen);
        goto cleanup;
    }

    memset(&req, 0, sizeof(req));
    req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
    req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
    req.nh.nlmsg_type = RTM_SETLINK;
    req.nh.nlmsg_pid = 0;
    req.nh.nlmsg_seq = ++seq;
    req.ifinfo.ifi_family = AF_UNSPEC;
    req.ifinfo.ifi_index = if_nametoindex(dev_name);
    if (req.ifinfo.ifi_index == 0) {
        fprintf(stderr, "bpf: Resolving device name to index: %s\n", strerror(errno));
        goto cleanup;
    }

    nla = (struct nlattr *)(((char *)&req)
                            + NLMSG_ALIGN(req.nh.nlmsg_len));
    nla->nla_type = NLA_F_NESTED | 43 /* IFLA_XDP */;

    nla_xdp = (struct nlattr *)((char *)nla + NLA_HDRLEN);
    nla->nla_len = NLA_HDRLEN;

    // we specify the FD passed over by the user
    nla_xdp->nla_type = 1 /* IFLA_XDP_FD */;
    nla_xdp->nla_len = NLA_HDRLEN + sizeof(progfd);
    memcpy((char *)nla_xdp + NLA_HDRLEN, &progfd, sizeof(progfd));
    nla->nla_len += nla_xdp->nla_len;

    // parse flags as passed by the user
    if (flags) {
        nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
        nla_xdp->nla_type = 3 /* IFLA_XDP_FLAGS */;
        nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
        memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
        nla->nla_len += nla_xdp->nla_len;
    }

    req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);

    if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
        fprintf(stderr, "bpf: send to netlink: %s\n", strerror(errno));
        goto cleanup;
    }

    len = recv(sock, buf, sizeof(buf), 0);
    if (len < 0) {
        fprintf(stderr, "bpf: recv from netlink: %s\n", strerror(errno));
        goto cleanup;
    }

    for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, (unsigned int)len);
         nh = NLMSG_NEXT(nh, len)) {
        if (nh->nlmsg_pid != sa.nl_pid) {
            fprintf(stderr, "bpf: Wrong pid %u, expected %u\n",
                   nh->nlmsg_pid, sa.nl_pid);
            errno = EBADMSG;
            goto cleanup;
        }
        if (nh->nlmsg_seq != (unsigned int)seq) {
            fprintf(stderr, "bpf: Wrong seq %d, expected %d\n",
                   nh->nlmsg_seq, seq);
            errno = EBADMSG;
            goto cleanup;
        }
        switch (nh->nlmsg_type) {
            case NLMSG_ERROR:
                err = (struct nlmsgerr *)NLMSG_DATA(nh);
                if (!err->error)
                    continue;
                fprintf(stderr, "bpf: nlmsg error %s\n", strerror(-err->error));
                errno = -err->error;
                goto cleanup;
            case NLMSG_DONE:
                break;
        }
    }

    ret = 0;

cleanup:
    close(sock);
    return ret;
}
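
/*
 * Illustrative sketch, not part of the library: install an XDP program on
 * a device, then remove it by installing fd -1 (the kernel treats a
 * negative fd as a detach). Flags 0 lets the kernel pick the default mode;
 * "eth0" and the guard macro are assumptions made for the example.
 */
#ifdef LIBBPF_USAGE_EXAMPLES
static int example_xdp_attach_detach(int prog_fd)
{
  if (bpf_attach_xdp("eth0", prog_fd, 0) < 0)
    return -1;
  // ... run ...
  return bpf_attach_xdp("eth0", -1, 0);
}
#endif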

int bpf_attach_perf_event_raw(int progfd, void *perf_event_attr, pid_t pid,
                              int cpu, int group_fd, unsigned long extra_flags) {
  int fd = syscall(__NR_perf_event_open, perf_event_attr, pid, cpu, group_fd,
                   PERF_FLAG_FD_CLOEXEC | extra_flags);
  if (fd < 0) {
    perror("perf_event_open failed");
    return -1;
  }
  if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, progfd) != 0) {
    perror("ioctl(PERF_EVENT_IOC_SET_BPF) failed");
    close(fd);
    return -1;
  }
  if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) != 0) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE) failed");
    close(fd);
    return -1;
  }

  return fd;
}

int bpf_attach_perf_event(int progfd, uint32_t ev_type, uint32_t ev_config,
                          uint64_t sample_period, uint64_t sample_freq,
                          pid_t pid, int cpu, int group_fd) {
  if (invalid_perf_config(ev_type, ev_config)) {
    return -1;
  }
  if (!((sample_period > 0) ^ (sample_freq > 0))) {
    fprintf(
      stderr, "Exactly one of sample_period / sample_freq should be set\n"
    );
    return -1;
  }

  struct perf_event_attr attr = {};
  attr.type = ev_type;
  attr.config = ev_config;
  if (pid > 0)
    attr.inherit = 1;
  if (sample_freq > 0) {
    attr.freq = 1;
    attr.sample_freq = sample_freq;
  } else {
    attr.sample_period = sample_period;
  }

  return bpf_attach_perf_event_raw(progfd, &attr, pid, cpu, group_fd, 0);
}
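
/*
 * Illustrative sketch, not part of the library: sample CPU cycles at 99 Hz
 * on one CPU across all PIDs, running the given program on each sample.
 * The frequency and the guard macro are assumptions made for the example.
 */
#ifdef LIBBPF_USAGE_EXAMPLES
static int example_sample_cycles(int prog_fd, int cpu)
{
  return bpf_attach_perf_event(prog_fd, PERF_TYPE_HARDWARE,
                               PERF_COUNT_HW_CPU_CYCLES,
                               0 /* sample_period */, 99 /* sample_freq */,
                               -1 /* pid */, cpu, -1 /* group_fd */);
}
#endif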

int bpf_close_perf_event_fd(int fd) {
  int res, error = 0;
  if (fd >= 0) {
    res = ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
    if (res != 0) {
      perror("ioctl(PERF_EVENT_IOC_DISABLE) failed");
      error = res;
    }
    res = close(fd);
    if (res != 0) {
      perror("close perf event FD failed");
      error = (res && !error) ? res : error;
    }
  }
  return error;
}

int bpf_obj_pin(int fd, const char *pathname)
{
  union bpf_attr attr;

  memset(&attr, 0, sizeof(attr));
  attr.pathname = ptr_to_u64((void *)pathname);
  attr.bpf_fd = fd;

  return syscall(__NR_bpf, BPF_OBJ_PIN, &attr, sizeof(attr));
}

int bpf_obj_get(const char *pathname)
{
  union bpf_attr attr;

  memset(&attr, 0, sizeof(attr));
  attr.pathname = ptr_to_u64((void *)pathname);

  return syscall(__NR_bpf, BPF_OBJ_GET, &attr, sizeof(attr));
}
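
/*
 * Illustrative sketch, not part of the library: persist an object fd under
 * the BPF filesystem and reopen it later. Assumes bpffs is mounted at
 * /sys/fs/bpf; the path and guard macro are assumptions made for the
 * example.
 */
#ifdef LIBBPF_USAGE_EXAMPLES
static int example_pin_and_get(int map_fd)
{
  const char *path = "/sys/fs/bpf/example_map";

  if (bpf_obj_pin(map_fd, path) < 0)
    return -1;
  return bpf_obj_get(path);  // a new fd referring to the same map
}
#endif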

int bpf_prog_get_next_id(uint32_t start_id, uint32_t *next_id)
{
  union bpf_attr attr;
  int err;

  memset(&attr, 0, sizeof(attr));
  attr.start_id = start_id;

  err = syscall(__NR_bpf, BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr));
  if (!err)
    *next_id = attr.next_id;

  return err;
}

int bpf_prog_get_fd_by_id(uint32_t id)
{
  union bpf_attr attr;

  memset(&attr, 0, sizeof(attr));
  attr.prog_id = id;

  return syscall(__NR_bpf, BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr));
}

int bpf_map_get_fd_by_id(uint32_t id)
{
  union bpf_attr attr;

  memset(&attr, 0, sizeof(attr));
  attr.map_id = id;

  return syscall(__NR_bpf, BPF_MAP_GET_FD_BY_ID, &attr, sizeof(attr));
}