Home | History | Annotate | Download | only in tools
      1 #!/usr/bin/python
      2 # @lint-avoid-python-3-compatibility-imports
      3 #
      4 # profile  Profile CPU usage by sampling stack traces at a timed interval.
      5 #          For Linux, uses BCC, BPF, perf_events. Embedded C.
      6 #
      7 # This is an efficient profiler, as stack traces are frequency counted in
      8 # kernel context, rather than passing every stack to user space for frequency
      9 # counting there. Only the unique stacks and counts are passed to user space
     10 # at the end of the profile, greatly reducing the kernel<->user transfer.
     11 #
     12 # REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is
     13 # a version of this tool that may work on Linux 4.6 - 4.8.
     14 #
     15 # Copyright 2016 Netflix, Inc.
     16 # Licensed under the Apache License, Version 2.0 (the "License")
     17 #
     18 # THANKS: Alexei Starovoitov, who added proper BPF profiling support to Linux;
     19 # Sasha Goldshtein, Andrew Birchall, and Evgeny Vereshchagin, who wrote much
     20 # of the code here, borrowed from tracepoint.py and offcputime.py; and
     21 # Teng Qin, who added perf support in bcc.
     22 #
     23 # 15-Jul-2016   Brendan Gregg   Created this.
     24 # 20-Oct-2016      "      "     Switched to use the new 4.9 support.
     25 
     26 from __future__ import print_function
     27 from bcc import BPF, PerfType, PerfSWConfig
     28 from sys import stderr
     29 from time import sleep
     30 import argparse
     31 import signal
     32 import os
     33 import errno
     34 import multiprocessing
     35 import ctypes as ct
     36 
     37 #
     38 # Process Arguments
     39 #
     40 
     41 # arg validation
     42 def positive_int(val):
     43     try:
     44         ival = int(val)
     45     except ValueError:
     46         raise argparse.ArgumentTypeError("must be an integer")
     47 
     48     if ival < 0:
     49         raise argparse.ArgumentTypeError("must be positive")
     50     return ival
     51 
     52 def positive_nonzero_int(val):
     53     ival = positive_int(val)
     54     if ival == 0:
     55         raise argparse.ArgumentTypeError("must be nonzero")
     56     return ival
     57 
     58 def stack_id_err(stack_id):
     59     # -EFAULT in get_stackid normally means the stack-trace is not availible,
     60     # Such as getting kernel stack trace in userspace code
     61     return (stack_id < 0) and (stack_id != -errno.EFAULT)
     62 
     63 # arguments
# Usage examples; passed as the parser epilog below. The raw description
# formatter is selected so this text is not re-wrapped in --help.
examples = """examples:
    ./profile             # profile stack traces at 49 Hertz until Ctrl-C
    ./profile -F 99       # profile stack traces at 99 Hertz
    ./profile -c 1000000  # profile stack traces every 1 in a million events
    ./profile 5           # profile at 49 Hertz for 5 seconds only
    ./profile -f 5        # output in folded format for flame graphs
    ./profile -p 185      # only profile threads for PID 185
    ./profile -U          # only show user space stacks (no kernel)
    ./profile -K          # only show kernel space stacks (no user)
"""
parser = argparse.ArgumentParser(
    description="Profile CPU stack traces at a timed interval",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
# thread selection; currently holds only -p
thread_group = parser.add_mutually_exclusive_group()
thread_group.add_argument("-p", "--pid", type=positive_int,
    help="profile this PID only")
# TODO: add options for user/kernel threads only
# -U and -K cannot be combined
stack_group = parser.add_mutually_exclusive_group()
stack_group.add_argument("-U", "--user-stacks-only", action="store_true",
    help="show stacks from user space only (no kernel space stacks)")
stack_group.add_argument("-K", "--kernel-stacks-only", action="store_true",
    help="show stacks from kernel space only (no user space stacks)")
# sampling is driven either by a frequency (-F) or by an event period (-c),
# never both; when neither is given the script falls back to 49 Hertz
sample_group = parser.add_mutually_exclusive_group()
sample_group.add_argument("-F", "--frequency", type=positive_int,
    help="sample frequency, Hertz")
sample_group.add_argument("-c", "--count", type=positive_int,
    help="sample period, number of events")
# output formatting switches
parser.add_argument("-d", "--delimited", action="store_true",
    help="insert delimiter between kernel/user stacks")
parser.add_argument("-a", "--annotations", action="store_true",
    help="add _[k] annotations to kernel frames")
parser.add_argument("-f", "--folded", action="store_true",
    help="output folded format, one line per stack (for flame graphs)")
# substituted for STACK_STORAGE_SIZE in the BPF program text
parser.add_argument("--stack-storage-size", default=16384,
    type=positive_nonzero_int,
    help="the number of unique stack traces that can be stored and "
        "displayed (default %(default)s)")
# optional positional; the 99999999 default is what the header code treats
# as "run until Ctrl-C"
parser.add_argument("duration", nargs="?", default=99999999,
    type=positive_nonzero_int,
    help="duration of trace, in seconds")
# forwarded as the cpu argument of attach_perf_event; the header only
# mentions a CPU when the value is >= 0
parser.add_argument("-C", "--cpu", type=int, default=-1,
    help="cpu number to run profile on")
# hidden from --help; when set, the generated BPF program text is printed
# and the script exits before loading it
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
    109 
    110 # option logic
    111 args = parser.parse_args()
    112 pid = int(args.pid) if args.pid is not None else -1
    113 duration = int(args.duration)
    114 debug = 0
    115 need_delimiter = args.delimited and not (args.kernel_stacks_only or
    116     args.user_stacks_only)
    117 # TODO: add stack depth, and interval
    118 
    119 #
    120 # Setup BPF
    121 #
    122 
    123 # define BPF program
# Embedded C source of the BPF program. Before it is handed to BPF(text=...)
# the placeholder tokens THREAD_FILTER, STACK_STORAGE_SIZE, USER_STACK_GET and
# KERNEL_STACK_GET are substituted with str.replace() further down in this
# script.
#
# do_perf_event() is the function attached to the perf event. For every
# sample that passes the thread filter it fills a key_t (pid, comm name, user
# and kernel stack ids, plus the sampled instruction pointer when it lies
# above the kernel page offset) and increments that key's slot in the
# "counts" hash, which the report code reads back after sampling ends.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <uapi/linux/bpf_perf_event.h>
#include <linux/sched.h>

struct key_t {
    u32 pid;
    u64 kernel_ip;
    u64 kernel_ret_ip;
    int user_stack_id;
    int kernel_stack_id;
    char name[TASK_COMM_LEN];
};
BPF_HASH(counts, struct key_t);
BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);

// This code gets a bit complex. Probably not suitable for casual hacking.

int do_perf_event(struct bpf_perf_event_data *ctx) {
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    if (!(THREAD_FILTER))
        return 0;

    // create map key
    struct key_t key = {.pid = pid};
    bpf_get_current_comm(&key.name, sizeof(key.name));

    // get stacks
    key.user_stack_id = USER_STACK_GET;
    key.kernel_stack_id = KERNEL_STACK_GET;

    if (key.kernel_stack_id >= 0) {
        // populate extras to fix the kernel stack
        u64 ip = PT_REGS_IP(&ctx->regs);
        u64 page_offset;

        // if ip isn't sane, leave key ips as zero for later checking
#if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE)
        // x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it
        page_offset = __PAGE_OFFSET_BASE;
#elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4)
        // x64, 4.17, and later
#if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL)
        page_offset = __PAGE_OFFSET_BASE_L5;
#else
        page_offset = __PAGE_OFFSET_BASE_L4;
#endif
#else
        // earlier x86_64 kernels, e.g., 4.6, comes here
        // arm64, s390, powerpc, x86_32
        page_offset = PAGE_OFFSET;
#endif

        if (ip > page_offset) {
            key.kernel_ip = ip;
        }
    }

    counts.increment(key);
    return 0;
}
"""
    186 
    187 # set thread filter
    188 thread_context = ""
    189 perf_filter = "-a"
    190 if args.pid is not None:
    191     thread_context = "PID %s" % args.pid
    192     thread_filter = 'pid == %s' % args.pid
    193     perf_filter = '-p %s' % args.pid
    194 else:
    195     thread_context = "all threads"
    196     thread_filter = '1'
    197 bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter)
    198 
    199 # set stack storage size
    200 bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))
    201 
    202 # handle stack args
    203 kernel_stack_get = "stack_traces.get_stackid(&ctx->regs, 0)"
    204 user_stack_get = "stack_traces.get_stackid(&ctx->regs, BPF_F_USER_STACK)"
    205 stack_context = ""
    206 if args.user_stacks_only:
    207     stack_context = "user"
    208     kernel_stack_get = "-1"
    209 elif args.kernel_stacks_only:
    210     stack_context = "kernel"
    211     user_stack_get = "-1"
    212 else:
    213     stack_context = "user + kernel"
    214 bpf_text = bpf_text.replace('USER_STACK_GET', user_stack_get)
    215 bpf_text = bpf_text.replace('KERNEL_STACK_GET', kernel_stack_get)
    216 
    217 sample_freq = 0
    218 sample_period = 0
    219 if args.frequency:
    220     sample_freq = args.frequency
    221 elif args.count:
    222     sample_period = args.count
    223 else:
    224     # If user didn't specify anything, use default 49Hz sampling
    225     sample_freq = 49
    226 sample_context = "%s%d %s" % (("", sample_freq, "Hertz") if sample_freq
    227                          else ("every ", sample_period, "events"))
    228 
    229 # header
    230 if not args.folded:
    231     print("Sampling at %s of %s by %s stack" %
    232         (sample_context, thread_context, stack_context), end="")
    233     if args.cpu >= 0:
    234         print(" on CPU#{}".format(args.cpu), end="")
    235     if duration < 99999999:
    236         print(" for %d secs." % duration)
    237     else:
    238         print("... Hit Ctrl-C to end.")
    239 
    240 if debug or args.ebpf:
    241     print(bpf_text)
    242     if args.ebpf:
    243         exit()
    244 
    245 # initialize BPF & perf_events
    246 b = BPF(text=bpf_text)
    247 b.attach_perf_event(ev_type=PerfType.SOFTWARE,
    248     ev_config=PerfSWConfig.CPU_CLOCK, fn_name="do_perf_event",
    249     sample_period=sample_period, sample_freq=sample_freq, cpu=args.cpu)
    250 
    251 # signal handler
    252 def signal_ignore(signal, frame):
    253     print()
    254 
    255 #
    256 # Output Report
    257 #
    258 
    259 # collect samples
# The BPF program counts samples in kernel context; this process only has to
# wait until the requested duration is over or the user interrupts it.
try:
    sleep(duration)
except KeyboardInterrupt:
    # as cleanup can take some time, trap Ctrl-C:
    # from here on SIGINT is routed to signal_ignore instead of raising
    # KeyboardInterrupt again while the report is produced
    signal.signal(signal.SIGINT, signal_ignore)

# blank line before the multi-line report; folded output gets none
if not args.folded:
    print()
    268 
    269 def aksym(addr):
    270     if args.annotations:
    271         return b.ksym(addr) + "_[k]".encode()
    272     else:
    273         return b.ksym(addr)
    274 
    275 # output stacks
    276 missing_stacks = 0
    277 has_enomem = False
    278 counts = b.get_table("counts")
    279 stack_traces = b.get_table("stack_traces")
    280 need_delimiter = args.delimited and not (args.kernel_stacks_only or
    281                                          args.user_stacks_only)
    282 for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
    283     # handle get_stackid errors
    284     if not args.user_stacks_only and stack_id_err(k.kernel_stack_id):
    285         missing_stacks += 1
    286         has_enomem = has_enomem or k.kernel_stack_id == -errno.ENOMEM
    287     if not args.kernel_stacks_only and stack_id_err(k.user_stack_id):
    288         missing_stacks += 1
    289         has_enomem = has_enomem or k.user_stack_id == -errno.ENOMEM
    290 
    291     user_stack = [] if k.user_stack_id < 0 else \
    292         stack_traces.walk(k.user_stack_id)
    293     kernel_tmp = [] if k.kernel_stack_id < 0 else \
    294         stack_traces.walk(k.kernel_stack_id)
    295 
    296     # fix kernel stack
    297     kernel_stack = []
    298     if k.kernel_stack_id >= 0:
    299         for addr in kernel_tmp:
    300             kernel_stack.append(addr)
    301         # the later IP checking
    302         if k.kernel_ip:
    303             kernel_stack.insert(0, k.kernel_ip)
    304 
    305     if args.folded:
    306         # print folded stack output
    307         user_stack = list(user_stack)
    308         kernel_stack = list(kernel_stack)
    309         line = [k.name]
    310         # if we failed to get the stack is, such as due to no space (-ENOMEM) or
    311         # hash collision (-EEXIST), we still print a placeholder for consistency
    312         if not args.kernel_stacks_only:
    313             if stack_id_err(k.user_stack_id):
    314                 line.append("[Missed User Stack]")
    315             else:
    316                 line.extend([b.sym(addr, k.pid) for addr in reversed(user_stack)])
    317         if not args.user_stacks_only:
    318             line.extend(["-"] if (need_delimiter and k.kernel_stack_id >= 0 and k.user_stack_id >= 0) else [])
    319             if stack_id_err(k.kernel_stack_id):
    320                 line.append("[Missed Kernel Stack]")
    321             else:
    322                 line.extend([b.ksym(addr) for addr in reversed(kernel_stack)])
    323         print("%s %d" % (b";".join(line).decode('utf-8', 'replace'), v.value))
    324     else:
    325         # print default multi-line stack output
    326         if not args.user_stacks_only:
    327             if stack_id_err(k.kernel_stack_id):
    328                 print("    [Missed Kernel Stack]")
    329             else:
    330                 for addr in kernel_stack:
    331                     print("    %s" % aksym(addr))
    332         if not args.kernel_stacks_only:
    333             if need_delimiter and k.user_stack_id >= 0 and k.kernel_stack_id >= 0:
    334                 print("    --")
    335             if stack_id_err(k.user_stack_id):
    336                 print("    [Missed User Stack]")
    337             else:
    338                 for addr in user_stack:
    339                     print("    %s" % b.sym(addr, k.pid).decode('utf-8', 'replace'))
    340         print("    %-16s %s (%d)" % ("-", k.name.decode('utf-8', 'replace'), k.pid))
    341         print("        %d\n" % v.value)
    342 
    343 # check missing
    344 if missing_stacks > 0:
    345     enomem_str = "" if not has_enomem else \
    346         " Consider increasing --stack-storage-size."
    347     print("WARNING: %d stack traces could not be displayed.%s" %
    348         (missing_stacks, enomem_str),
    349         file=stderr)
    350