#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# profile  Profile CPU usage by sampling stack traces at a timed interval.
#          For Linux, uses BCC, BPF, perf_events. Embedded C.
#
# This is an efficient profiler, as stack traces are frequency counted in
# kernel context, rather than passing every stack to user space for frequency
# counting there. Only the unique stacks and counts are passed to user space
# at the end of the profile, greatly reducing the kernel<->user transfer.
#
# This uses perf_event_open to setup a timer which is instrumented by BPF,
# and for efficiency it does not initialize the perf ring buffer, so the
# redundant perf samples are not collected.
#
# Kernel stacks are post-processed in user-land to skip the interrupt framework
# frames. You can improve efficiency a little by specifying the exact number
# of frames to skip with -S, provided you know what that is. If you get -S
# wrong, note that the first line is the IP, and then the (skipped) stack.
#
# Note: if another perf-based sampling session is active, the output may become
# polluted with their events. On older kernels, the output may also become
# polluted with tracing sessions (when the kprobe is used instead of the
# tracepoint). If this becomes a problem, logic can be added to filter events.
#
# REQUIRES: Linux 4.6+ (BPF_MAP_TYPE_STACK_TRACE support), and the
# perf_misc_flags() function symbol to exist. The latter may or may not
# exist depending on your kernel build. Linux 4.9 provides a proper solution
# to this (this tool will be updated).
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# THANKS: Sasha Goldshtein, Andrew Birchall, and Evgeny Vereshchagin, who wrote
# much of the code here, borrowed from tracepoint.py and offcputime.py.
#
# 15-Jul-2016   Brendan Gregg   Created this.
from __future__ import print_function
from bcc import BPF, Perf
from sys import stderr
from time import sleep
import argparse
import signal
import os
import errno
import multiprocessing
import ctypes as ct

#
# Process Arguments
#

# arg validation
def positive_int(val):
    """argparse type: accept a non-negative integer (0 is allowed)."""
    try:
        ival = int(val)
    except ValueError:
        raise argparse.ArgumentTypeError("must be an integer")

    if ival < 0:
        raise argparse.ArgumentTypeError("must be positive")
    return ival

def positive_nonzero_int(val):
    """argparse type: accept a strictly positive integer."""
    ival = positive_int(val)
    if ival == 0:
        raise argparse.ArgumentTypeError("must be nonzero")
    return ival

# arguments
examples = """examples:
    ./profile             # profile stack traces at 49 Hertz until Ctrl-C
    ./profile -F 99       # profile stack traces at 99 Hertz
    ./profile 5           # profile at 49 Hertz for 5 seconds only
    ./profile -f 5        # output in folded format for flame graphs
    ./profile -p 185      # only profile threads for PID 185
    ./profile -U          # only show user space stacks (no kernel)
    ./profile -K          # only show kernel space stacks (no user)
    ./profile -S 11       # always skip 11 frames of kernel stack
"""
parser = argparse.ArgumentParser(
    description="Profile CPU stack traces at a timed interval",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
thread_group = parser.add_mutually_exclusive_group()
thread_group.add_argument("-p", "--pid", type=positive_int,
    help="profile this PID only")
# TODO: add options for user/kernel threads only
stack_group = parser.add_mutually_exclusive_group()
stack_group.add_argument("-U", "--user-stacks-only", action="store_true",
    help="show stacks from user space only (no kernel space stacks)")
stack_group.add_argument("-K", "--kernel-stacks-only", action="store_true",
    help="show stacks from kernel space only (no user space stacks)")
parser.add_argument("-F", "--frequency", type=positive_int, default=49,
    help="sample frequency, Hertz (default 49)")
parser.add_argument("-d", "--delimited", action="store_true",
    help="insert delimiter between kernel/user stacks")
parser.add_argument("-a", "--annotations", action="store_true",
    help="add _[k] annotations to kernel frames")
parser.add_argument("-f", "--folded", action="store_true",
    help="output folded format, one line per stack (for flame graphs)")
parser.add_argument("--stack-storage-size", default=2048,
    type=positive_nonzero_int,
    help="the number of unique stack traces that can be stored and "
        "displayed (default 2048)")
# FIX: help text previously said "(default 3)", which disagreed with default=0
parser.add_argument("-S", "--kernel-skip", type=positive_int, default=0,
    help="skip this many kernel frames (default 0)")
parser.add_argument("duration", nargs="?", default=99999999,
    type=positive_nonzero_int,
    help="duration of trace, in seconds")

# option logic
args = parser.parse_args()
skip = args.kernel_skip
pid = int(args.pid) if args.pid is not None else -1
duration = int(args.duration)
debug = 0
need_delimiter = args.delimited and not (args.kernel_stacks_only or
    args.user_stacks_only)
# TODO: add stack depth, and interval

#
# Setup BPF
#

# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

struct key_t {
    u32 pid;
    u64 kernel_ip;
    u64 kernel_ret_ip;
    int user_stack_id;
    int kernel_stack_id;
    char name[TASK_COMM_LEN];
};
BPF_HASH(counts, struct key_t);
BPF_HASH(start, u32);
BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);

// This code gets a bit complex. Probably not suitable for casual hacking.

PERF_TRACE_EVENT {
    u32 pid = bpf_get_current_pid_tgid();
    if (!(THREAD_FILTER))
        return 0;

    // create map key
    u64 zero = 0, *val;
    struct key_t key = {.pid = pid};
    bpf_get_current_comm(&key.name, sizeof(key.name));

    // get stacks
    key.user_stack_id = USER_STACK_GET;
    key.kernel_stack_id = KERNEL_STACK_GET;

    if (key.kernel_stack_id >= 0) {
        // populate extras to fix the kernel stack
        struct pt_regs regs = {};
        bpf_probe_read(&regs, sizeof(regs), (void *)REGS_LOCATION);
        u64 ip = PT_REGS_IP(&regs);

        // if ip isn't sane, leave key ips as zero for later checking
#ifdef CONFIG_RANDOMIZE_MEMORY
        if (ip > __PAGE_OFFSET_BASE) {
#else
        if (ip > PAGE_OFFSET) {
#endif
            key.kernel_ip = ip;
            if (DO_KERNEL_RIP) {
                /*
                 * User didn't specify a skip value (-S), so we will figure
                 * out how many interrupt framework frames to skip by recording
                 * the kernel rip, then later scanning for it on the stack.
                 * This is likely x86_64 specific; can use -S as a workaround
                 * until this supports your architecture.
                 */
                bpf_probe_read(&key.kernel_ret_ip, sizeof(key.kernel_ret_ip),
                    (void *)(regs.bp + 8));
            }
        }
    }

    val = counts.lookup_or_init(&key, &zero);
    (*val)++;
    return 0;
}
"""

# set thread filter
thread_context = ""
perf_filter = "-a"
if args.pid is not None:
    thread_context = "PID %s" % args.pid
    thread_filter = 'pid == %s' % args.pid
    perf_filter = '-p %s' % args.pid
else:
    thread_context = "all threads"
    thread_filter = '1'
bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter)

# set stack storage size
bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))

# handle stack args
kernel_stack_get = "stack_traces.get_stackid(args, " \
    "%d | BPF_F_REUSE_STACKID)" % skip
user_stack_get = \
    "stack_traces.get_stackid(args, BPF_F_REUSE_STACKID | BPF_F_USER_STACK)"
stack_context = ""
if args.user_stacks_only:
    stack_context = "user"
    kernel_stack_get = "-1"
elif args.kernel_stacks_only:
    stack_context = "kernel"
    user_stack_get = "-1"
else:
    stack_context = "user + kernel"
bpf_text = bpf_text.replace('USER_STACK_GET', user_stack_get)
bpf_text = bpf_text.replace('KERNEL_STACK_GET', kernel_stack_get)
if skip:
    # don't record the rip, as we won't use it
    bpf_text = bpf_text.replace('DO_KERNEL_RIP', '0')
else:
    # rip is used to skip interrupt infrastructure frames
    bpf_text = bpf_text.replace('DO_KERNEL_RIP', '1')

# header
if not args.folded:
    print("Sampling at %d Hertz of %s by %s stack" %
        (args.frequency, thread_context, stack_context), end="")
    if duration < 99999999:
        print(" for %d secs." % duration)
    else:
        print("... Hit Ctrl-C to end.")

# kprobe perf_misc_flags(), which fires on each perf sample; a proper
# tracepoint/perf-event attach is only available on Linux 4.9+
bpf_text = bpf_text.replace('PERF_TRACE_EVENT',
    'int kprobe__perf_misc_flags(struct pt_regs *args)')
bpf_text = bpf_text.replace('REGS_LOCATION', 'PT_REGS_PARM1(args)')
if debug:
    print(bpf_text)

# initialize BPF
try:
    b = BPF(text=bpf_text)
except Exception:
    print("BPF initialization failed. perf_misc_flags() may be inlined in " +
        "your kernel build.\nThis tool will be updated in the future to " +
        "support Linux 4.9, which has reliable profiling support. Exiting.")
    # FIX: exit with a failure status (was exit(), which returns 0)
    exit(1)

# signal handler
def signal_ignore(signal, frame):
    print()

#
# Setup perf_events
#

# use perf_events to sample
try:
    Perf.perf_event_open(0, pid=-1, ptype=Perf.PERF_TYPE_SOFTWARE,
        freq=args.frequency)
except Exception:
    print("ERROR: initializing perf_events for sampling.\n"
        "To debug this, try running the following command:\n"
        "    perf record -F 49 -e cpu-clock %s -- sleep 1\n"
        "If that also doesn't work, fix it first." % perf_filter,
        file=stderr)
    # FIX: exit with a failure status (was exit(0) on an error path)
    exit(1)

#
# Output Report
#

# collect samples
try:
    sleep(duration)
except KeyboardInterrupt:
    # as cleanup can take some time, trap Ctrl-C:
    signal.signal(signal.SIGINT, signal_ignore)

if not args.folded:
    print()

def aksym(addr):
    """Resolve a kernel address to a symbol, optionally annotated _[k]."""
    if args.annotations:
        return b.ksym(addr) + "_[k]"
    else:
        return b.ksym(addr)

# output stacks
missing_stacks = 0
has_enomem = False
counts = b.get_table("counts")
stack_traces = b.get_table("stack_traces")
# sort by sample count, lowest first, so the hottest stacks print last
for k, v in sorted(counts.items(), key=lambda kv: kv[1].value):
    # handle get_stackid errors; -EFAULT just means no stack on that side
    if (not args.user_stacks_only and k.kernel_stack_id < 0 and
            k.kernel_stack_id != -errno.EFAULT) or \
            (not args.kernel_stacks_only and k.user_stack_id < 0 and
            k.user_stack_id != -errno.EFAULT):
        missing_stacks += 1
        # check for an ENOMEM error (stack table full)
        if k.kernel_stack_id == -errno.ENOMEM or \
                k.user_stack_id == -errno.ENOMEM:
            has_enomem = True

    user_stack = [] if k.user_stack_id < 0 else \
        stack_traces.walk(k.user_stack_id)
    kernel_tmp = [] if k.kernel_stack_id < 0 else \
        stack_traces.walk(k.kernel_stack_id)

    # fix kernel stack: drop the interrupt framework frames at the top
    kernel_stack = []
    if k.kernel_stack_id >= 0:
        if skip:
            # fixed skip, as requested by -S
            for addr in kernel_tmp:
                kernel_stack.append(addr)
            kernel_stack = kernel_stack[skip:]
        else:
            # skip the interrupt framework stack by searching for our RIP
            skipping = 1
            for addr in kernel_tmp:
                if k.kernel_ret_ip == addr:
                    skipping = 0
                if not skipping:
                    kernel_stack.append(addr)
        if k.kernel_ip:
            # the sampled IP itself is not in the stack; prepend it
            kernel_stack.insert(0, k.kernel_ip)

    do_delimiter = need_delimiter and kernel_stack

    if args.folded:
        # print folded stack output: comm;frame1;frame2;... count
        user_stack = list(user_stack)
        kernel_stack = list(kernel_stack)
        line = [k.name.decode('utf-8', 'replace')] + \
            [b.sym(addr, k.pid) for addr in reversed(user_stack)] + \
            (do_delimiter and ["-"] or []) + \
            [aksym(addr) for addr in reversed(kernel_stack)]
        print("%s %d" % (";".join(line), v.value))
    else:
        # print default multi-line stack output.
        for addr in kernel_stack:
            print("    %s" % aksym(addr))
        if do_delimiter:
            print("    --")
        for addr in user_stack:
            print("    %s" % b.sym(addr, k.pid))
        print("    %-16s %s (%d)" % ("-", k.name, k.pid))
        print("        %d\n" % v.value)

# check missing
if missing_stacks > 0:
    enomem_str = "" if not has_enomem else \
        " Consider increasing --stack-storage-size."
    print("WARNING: %d stack traces could not be displayed.%s" %
        (missing_stacks, enomem_str),
        file=stderr)