#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# profile  Profile CPU usage by sampling stack traces at a timed interval.
#          For Linux, uses BCC, BPF, perf_events. Embedded C.
#
# This is an efficient profiler, as stack traces are frequency counted in
# kernel context, rather than passing every stack to user space for frequency
# counting there. Only the unique stacks and counts are passed to user space
# at the end of the profile, greatly reducing the kernel<->user transfer.
#
# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is
# a version of this tool that may work on Linux 4.6 - 4.8.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# THANKS: Alexei Starovoitov, who added proper BPF profiling support to Linux;
# Sasha Goldshtein, Andrew Birchall, and Evgeny Vereshchagin, who wrote much
# of the code here, borrowed from tracepoint.py and offcputime.py; and
# Teng Qin, who added perf support in bcc.
#
# 15-Jul-2016   Brendan Gregg   Created this.
# 20-Oct-2016      "      "     Switched to use the new 4.9 support.

from __future__ import print_function
from bcc import BPF, PerfType, PerfSWConfig
from sys import stderr
from time import sleep
import argparse
import signal
import os
import errno
import multiprocessing
import ctypes as ct

#
# Process Arguments
#

# arg validation
def positive_int(val):
    """argparse type: parse val as an int and reject negative values.

    Zero is accepted; see positive_nonzero_int for the strict variant.
    Raises argparse.ArgumentTypeError on non-integer or negative input.
    """
    try:
        ival = int(val)
    except ValueError:
        raise argparse.ArgumentTypeError("must be an integer")

    if ival < 0:
        raise argparse.ArgumentTypeError("must be positive")
    return ival

def positive_nonzero_int(val):
    """argparse type: like positive_int, but additionally rejects zero."""
    ival = positive_int(val)
    if ival == 0:
        raise argparse.ArgumentTypeError("must be nonzero")
    return ival

def stack_id_err(stack_id):
    """Return True when stack_id is a get_stackid error worth reporting."""
    # -EFAULT in get_stackid normally means the stack-trace is not available,
    # such as getting kernel stack trace in userspace code
    return (stack_id < 0) and (stack_id != -errno.EFAULT)

# arguments
examples = """examples:
    ./profile             # profile stack traces at 49 Hertz until Ctrl-C
    ./profile -F 99       # profile stack traces at 99 Hertz
    ./profile -c 1000000  # profile stack traces every 1 in a million events
    ./profile 5           # profile at 49 Hertz for 5 seconds only
    ./profile -f 5        # output in folded format for flame graphs
    ./profile -p 185      # only profile threads for PID 185
    ./profile -U          # only show user space stacks (no kernel)
    ./profile -K          # only show kernel space stacks (no user)
"""
parser = argparse.ArgumentParser(
    description="Profile CPU stack traces at a timed interval",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
thread_group = parser.add_mutually_exclusive_group()
thread_group.add_argument("-p", "--pid", type=positive_int,
    help="profile this PID only")
# TODO: add options for user/kernel threads only
stack_group = parser.add_mutually_exclusive_group()
stack_group.add_argument("-U", "--user-stacks-only", action="store_true",
    help="show stacks from user space only (no kernel space stacks)")
stack_group.add_argument("-K", "--kernel-stacks-only", action="store_true",
    help="show stacks from kernel space only (no user space stacks)")
sample_group = parser.add_mutually_exclusive_group()
sample_group.add_argument("-F", "--frequency", type=positive_int,
    help="sample frequency, Hertz")
sample_group.add_argument("-c", "--count", type=positive_int,
    help="sample period, number of events")
parser.add_argument("-d", "--delimited", action="store_true",
    help="insert delimiter between kernel/user stacks")
parser.add_argument("-a", "--annotations", action="store_true",
    help="add _[k] annotations to kernel frames")
parser.add_argument("-f", "--folded", action="store_true",
    help="output folded format, one line per stack (for flame graphs)")
parser.add_argument("--stack-storage-size", default=16384,
    type=positive_nonzero_int,
    help="the number of unique stack traces that can be stored and "
         "displayed (default %(default)s)")
parser.add_argument("duration", nargs="?", default=99999999,
    type=positive_nonzero_int,
    help="duration of trace, in seconds")
parser.add_argument("-C", "--cpu", type=int, default=-1,
    help="cpu number to run profile on")
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)

# option logic
args = parser.parse_args()
pid = int(args.pid) if args.pid is not None else -1
duration = int(args.duration)
debug = 0
# a delimiter only makes sense when both stack kinds are being shown
need_delimiter = args.delimited and not (args.kernel_stacks_only or
    args.user_stacks_only)
# TODO: add stack depth, and interval

#
# Setup BPF
#

# define BPF program
# THREAD_FILTER, STACK_STORAGE_SIZE, USER_STACK_GET and KERNEL_STACK_GET are
# placeholders that are textually substituted below before compilation.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <uapi/linux/bpf_perf_event.h>
#include <linux/sched.h>

struct key_t {
    u32 pid;
    u64 kernel_ip;
    u64 kernel_ret_ip;
    int user_stack_id;
    int kernel_stack_id;
    char name[TASK_COMM_LEN];
};
BPF_HASH(counts, struct key_t);
BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);

// This code gets a bit complex. Probably not suitable for casual hacking.

int do_perf_event(struct bpf_perf_event_data *ctx) {
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    if (!(THREAD_FILTER))
        return 0;

    // create map key
    struct key_t key = {.pid = pid};
    bpf_get_current_comm(&key.name, sizeof(key.name));

    // get stacks
    key.user_stack_id = USER_STACK_GET;
    key.kernel_stack_id = KERNEL_STACK_GET;

    if (key.kernel_stack_id >= 0) {
        // populate extras to fix the kernel stack
        u64 ip = PT_REGS_IP(&ctx->regs);
        u64 page_offset;

        // if ip isn't sane, leave key ips as zero for later checking
#if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE)
        // x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it
        page_offset = __PAGE_OFFSET_BASE;
#elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4)
        // x64, 4.17, and later
#if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL)
        page_offset = __PAGE_OFFSET_BASE_L5;
#else
        page_offset = __PAGE_OFFSET_BASE_L4;
#endif
#else
        // earlier x86_64 kernels, e.g., 4.6, comes here
        // arm64, s390, powerpc, x86_32
        page_offset = PAGE_OFFSET;
#endif

        if (ip > page_offset) {
            key.kernel_ip = ip;
        }
    }

    counts.increment(key);
    return 0;
}
"""

# set thread filter
thread_context = ""
perf_filter = "-a"
if args.pid is not None:
    thread_context = "PID %s" % args.pid
    thread_filter = 'pid == %s' % args.pid
    perf_filter = '-p %s' % args.pid
else:
    thread_context = "all threads"
    thread_filter = '1'
bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter)

# set stack storage size
bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))

# handle stack args
# a stack kind that is not wanted is replaced by the constant -1 in the BPF
# program, so its id is always negative and it is skipped at report time
kernel_stack_get = "stack_traces.get_stackid(&ctx->regs, 0)"
user_stack_get = "stack_traces.get_stackid(&ctx->regs, BPF_F_USER_STACK)"
stack_context = ""
if args.user_stacks_only:
    stack_context = "user"
    kernel_stack_get = "-1"
elif args.kernel_stacks_only:
    stack_context = "kernel"
    user_stack_get = "-1"
else:
    stack_context = "user + kernel"
bpf_text = bpf_text.replace('USER_STACK_GET', user_stack_get)
bpf_text = bpf_text.replace('KERNEL_STACK_GET', kernel_stack_get)

# exactly one of sample_freq / sample_period ends up non-zero
sample_freq = 0
sample_period = 0
if args.frequency:
    sample_freq = args.frequency
elif args.count:
    sample_period = args.count
else:
    # If user didn't specify anything, use default 49Hz sampling
    sample_freq = 49
sample_context = "%s%d %s" % (("", sample_freq, "Hertz") if sample_freq
                         else ("every ", sample_period, "events"))

# header
if not args.folded:
    print("Sampling at %s of %s by %s stack" %
        (sample_context, thread_context, stack_context), end="")
    if args.cpu >= 0:
        print(" on CPU#{}".format(args.cpu), end="")
    if duration < 99999999:
        print(" for %d secs." % duration)
    else:
        print("... Hit Ctrl-C to end.")

if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

# initialize BPF & perf_events
b = BPF(text=bpf_text)
b.attach_perf_event(ev_type=PerfType.SOFTWARE,
    ev_config=PerfSWConfig.CPU_CLOCK, fn_name="do_perf_event",
    sample_period=sample_period, sample_freq=sample_freq, cpu=args.cpu)

# signal handler
def signal_ignore(signum, frame):
    """SIGINT handler used during report generation: just emit a newline."""
    print()

#
# Output Report
#

# collect samples
try:
    sleep(duration)
except KeyboardInterrupt:
    # as cleanup can take some time, trap Ctrl-C:
    signal.signal(signal.SIGINT, signal_ignore)

if not args.folded:
    print()

def aksym(addr):
    """Resolve a kernel address to its symbol (bytes).

    When -a/--annotations is given, the symbol is suffixed with "_[k]".
    """
    if args.annotations:
        return b.ksym(addr) + "_[k]".encode()
    else:
        return b.ksym(addr)

# output stacks
missing_stacks = 0
has_enomem = False
counts = b.get_table("counts")
stack_traces = b.get_table("stack_traces")
# least frequent stacks first, so the hottest stacks end the report
for k, v in sorted(counts.items(), key=lambda item: item[1].value):
    # handle get_stackid errors
    if not args.user_stacks_only and stack_id_err(k.kernel_stack_id):
        missing_stacks += 1
        has_enomem = has_enomem or k.kernel_stack_id == -errno.ENOMEM
    if not args.kernel_stacks_only and stack_id_err(k.user_stack_id):
        missing_stacks += 1
        has_enomem = has_enomem or k.user_stack_id == -errno.ENOMEM

    user_stack = [] if k.user_stack_id < 0 else \
        list(stack_traces.walk(k.user_stack_id))

    # fix kernel stack: when the BPF program recorded a sane kernel IP,
    # put it on top of the walked frames
    kernel_stack = []
    if k.kernel_stack_id >= 0:
        kernel_stack = list(stack_traces.walk(k.kernel_stack_id))
        # the later IP checking
        if k.kernel_ip:
            kernel_stack.insert(0, k.kernel_ip)

    if args.folded:
        # print folded stack output
        # every element of `line` must be bytes: k.name, b.sym() and
        # b.ksym() are bytes and the list is joined with b";" below
        line = [k.name]
        # if we failed to get the stack id, such as due to no space (-ENOMEM)
        # or hash collision (-EEXIST), we still print a placeholder for
        # consistency
        if not args.kernel_stacks_only:
            if stack_id_err(k.user_stack_id):
                line.append(b"[Missed User Stack]")
            else:
                line.extend([b.sym(addr, k.pid)
                             for addr in reversed(user_stack)])
        if not args.user_stacks_only:
            if (need_delimiter and k.kernel_stack_id >= 0 and
                    k.user_stack_id >= 0):
                line.append(b"-")
            if stack_id_err(k.kernel_stack_id):
                line.append(b"[Missed Kernel Stack]")
            else:
                # aksym (not b.ksym) so that -a annotations apply here too
                line.extend([aksym(addr) for addr in reversed(kernel_stack)])
        print("%s %d" % (b";".join(line).decode('utf-8', 'replace'), v.value))
    else:
        # print default multi-line stack output
        if not args.user_stacks_only:
            if stack_id_err(k.kernel_stack_id):
                print(" [Missed Kernel Stack]")
            else:
                for addr in kernel_stack:
                    # decode so Python 3 does not print a b'...' repr
                    print(" %s" % aksym(addr).decode('utf-8', 'replace'))
        if not args.kernel_stacks_only:
            if need_delimiter and k.user_stack_id >= 0 and k.kernel_stack_id >= 0:
                print(" --")
            if stack_id_err(k.user_stack_id):
                print(" [Missed User Stack]")
            else:
                for addr in user_stack:
                    print(" %s" % b.sym(addr, k.pid).decode('utf-8', 'replace'))
        print(" %-16s %s (%d)" % ("-", k.name.decode('utf-8', 'replace'), k.pid))
        print(" %d\n" % v.value)

# check missing
if missing_stacks > 0:
    enomem_str = "" if not has_enomem else \
        " Consider increasing --stack-storage-size."
    print("WARNING: %d stack traces could not be displayed.%s" %
        (missing_stacks, enomem_str),
        file=stderr)