Home | History | Annotate | Download | only in old
      1 #!/usr/bin/python
      2 # @lint-avoid-python-3-compatibility-imports
      3 #
      4 # profile  Profile CPU usage by sampling stack traces at a timed interval.
      5 #          For Linux, uses BCC, BPF, perf_events. Embedded C.
      6 #
      7 # This is an efficient profiler, as stack traces are frequency counted in
      8 # kernel context, rather than passing every stack to user space for frequency
      9 # counting there. Only the unique stacks and counts are passed to user space
     10 # at the end of the profile, greatly reducing the kernel<->user transfer.
     11 #
     12 # This uses perf_event_open to setup a timer which is instrumented by BPF,
     13 # and for efficiency it does not initialize the perf ring buffer, so the
     14 # redundant perf samples are not collected.
     15 #
# Kernel stacks are post-processed in user-land to skip the interrupt framework
     17 # frames. You can improve efficiency a little by specifying the exact number
     18 # of frames to skip with -s, provided you know what that is. If you get -s
     19 # wrong, note that the first line is the IP, and then the (skipped) stack.
     20 #
     21 # Note: if another perf-based sampling session is active, the output may become
# polluted with their events. On older kernels, the output may also become
     23 # polluted with tracing sessions (when the kprobe is used instead of the
     24 # tracepoint). If this becomes a problem, logic can be added to filter events.
     25 #
     26 # REQUIRES: Linux 4.6+ (BPF_MAP_TYPE_STACK_TRACE support), and the
     27 # perf_misc_flags() function symbol to exist. The latter may or may not
     28 # exist depending on your kernel build. Linux 4.9 provides a proper solution
     29 # to this (this tool will be updated).
     30 #
     31 # Copyright 2016 Netflix, Inc.
     32 # Licensed under the Apache License, Version 2.0 (the "License")
     33 #
     34 # THANKS: Sasha Goldshtein, Andrew Birchall, and Evgeny Vereshchagin, who wrote
     35 # much of the code here, borrowed from tracepoint.py and offcputime.py.
     36 #
     37 # 15-Jul-2016   Brendan Gregg   Created this.
     38 
     39 from __future__ import print_function
     40 from bcc import BPF, Perf
     41 from sys import stderr
     42 from time import sleep
     43 import argparse
     44 import signal
     45 import os
     46 import errno
     47 import multiprocessing
     48 import ctypes as ct
     49 
     50 #
     51 # Process Arguments
     52 #
     53 
     54 # arg validation
     55 def positive_int(val):
     56     try:
     57         ival = int(val)
     58     except ValueError:
     59         raise argparse.ArgumentTypeError("must be an integer")
     60 
     61     if ival < 0:
     62         raise argparse.ArgumentTypeError("must be positive")
     63     return ival
     64 
     65 def positive_nonzero_int(val):
     66     ival = positive_int(val)
     67     if ival == 0:
     68         raise argparse.ArgumentTypeError("must be nonzero")
     69     return ival
     70 
     71 # arguments
     72 examples = """examples:
     73     ./profile             # profile stack traces at 49 Hertz until Ctrl-C
     74     ./profile -F 99       # profile stack traces at 99 Hertz
     75     ./profile 5           # profile at 49 Hertz for 5 seconds only
     76     ./profile -f 5        # output in folded format for flame graphs
     77     ./profile -p 185      # only profile threads for PID 185
     78     ./profile -U          # only show user space stacks (no kernel)
     79     ./profile -K          # only show kernel space stacks (no user)
     80     ./profile -S 11       # always skip 11 frames of kernel stack
     81 """
     82 parser = argparse.ArgumentParser(
     83     description="Profile CPU stack traces at a timed interval",
     84     formatter_class=argparse.RawDescriptionHelpFormatter,
     85     epilog=examples)
     86 thread_group = parser.add_mutually_exclusive_group()
     87 thread_group.add_argument("-p", "--pid", type=positive_int,
     88     help="profile this PID only")
     89 # TODO: add options for user/kernel threads only
     90 stack_group = parser.add_mutually_exclusive_group()
     91 stack_group.add_argument("-U", "--user-stacks-only", action="store_true",
     92     help="show stacks from user space only (no kernel space stacks)")
     93 stack_group.add_argument("-K", "--kernel-stacks-only", action="store_true",
     94     help="show stacks from kernel space only (no user space stacks)")
     95 parser.add_argument("-F", "--frequency", type=positive_int, default=49,
     96     help="sample frequency, Hertz (default 49)")
     97 parser.add_argument("-d", "--delimited", action="store_true",
     98     help="insert delimiter between kernel/user stacks")
     99 parser.add_argument("-a", "--annotations", action="store_true",
    100     help="add _[k] annotations to kernel frames")
    101 parser.add_argument("-f", "--folded", action="store_true",
    102     help="output folded format, one line per stack (for flame graphs)")
    103 parser.add_argument("--stack-storage-size", default=2048,
    104     type=positive_nonzero_int,
    105     help="the number of unique stack traces that can be stored and "
    106         "displayed (default 2048)")
    107 parser.add_argument("-S", "--kernel-skip", type=positive_int, default=0,
    108     help="skip this many kernel frames (default 3)")
    109 parser.add_argument("duration", nargs="?", default=99999999,
    110     type=positive_nonzero_int,
    111     help="duration of trace, in seconds")
    112 
    113 # option logic
    114 args = parser.parse_args()
    115 skip = args.kernel_skip
    116 pid = int(args.pid) if args.pid is not None else -1
    117 duration = int(args.duration)
    118 debug = 0
    119 need_delimiter = args.delimited and not (args.kernel_stacks_only or
    120     args.user_stacks_only)
    121 # TODO: add stack depth, and interval
    122 
#
# Setup BPF
#

# define BPF program
# The uppercase placeholders (PERF_TRACE_EVENT, THREAD_FILTER,
# STACK_STORAGE_SIZE, USER_STACK_GET, KERNEL_STACK_GET, REGS_LOCATION,
# DO_KERNEL_RIP) are textually substituted with concrete C snippets later
# in this script, before the program is compiled by BPF().
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

struct key_t {
    u32 pid;
    u64 kernel_ip;
    u64 kernel_ret_ip;
    int user_stack_id;
    int kernel_stack_id;
    char name[TASK_COMM_LEN];
};
BPF_HASH(counts, struct key_t);
BPF_HASH(start, u32);
BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);

// This code gets a bit complex. Probably not suitable for casual hacking.

PERF_TRACE_EVENT {
    u32 pid = bpf_get_current_pid_tgid();
    if (!(THREAD_FILTER))
        return 0;

    // create map key
    u64 zero = 0, *val;
    struct key_t key = {.pid = pid};
    bpf_get_current_comm(&key.name, sizeof(key.name));

    // get stacks
    key.user_stack_id = USER_STACK_GET;
    key.kernel_stack_id = KERNEL_STACK_GET;

    if (key.kernel_stack_id >= 0) {
        // populate extras to fix the kernel stack
        struct pt_regs regs = {};
        bpf_probe_read(&regs, sizeof(regs), (void *)REGS_LOCATION);
        u64 ip = PT_REGS_IP(&regs);

        // if ip isn't sane, leave key ips as zero for later checking
#ifdef CONFIG_RANDOMIZE_MEMORY
        if (ip > __PAGE_OFFSET_BASE) {
#else
        if (ip > PAGE_OFFSET) {
#endif
            key.kernel_ip = ip;
            if (DO_KERNEL_RIP) {
                /*
                 * User didn't specify a skip value (-s), so we will figure
                 * out how many interrupt framework frames to skip by recording
                 * the kernel rip, then later scanning for it on the stack.
                 * This is likely x86_64 specific; can use -s as a workaround
                 * until this supports your architecture.
                 */
                bpf_probe_read(&key.kernel_ret_ip, sizeof(key.kernel_ret_ip),
                (void *)(regs.bp + 8));
            }
        }
    }

    val = counts.lookup_or_init(&key, &zero);
    (*val)++;
    return 0;
}
"""
    192 
    193 # set thread filter
    194 thread_context = ""
    195 perf_filter = "-a"
    196 if args.pid is not None:
    197     thread_context = "PID %s" % args.pid
    198     thread_filter = 'pid == %s' % args.pid
    199     perf_filter = '-p %s' % args.pid
    200 else:
    201     thread_context = "all threads"
    202     thread_filter = '1'
    203 bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter)
    204 
    205 # set stack storage size
    206 bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))
    207 
    208 # handle stack args
    209 kernel_stack_get = "stack_traces.get_stackid(args, " \
    210     "%d | BPF_F_REUSE_STACKID)" % skip
    211 user_stack_get = \
    212     "stack_traces.get_stackid(args, BPF_F_REUSE_STACKID | BPF_F_USER_STACK)"
    213 stack_context = ""
    214 if args.user_stacks_only:
    215     stack_context = "user"
    216     kernel_stack_get = "-1"
    217 elif args.kernel_stacks_only:
    218     stack_context = "kernel"
    219     user_stack_get = "-1"
    220 else:
    221     stack_context = "user + kernel"
    222 bpf_text = bpf_text.replace('USER_STACK_GET', user_stack_get)
    223 bpf_text = bpf_text.replace('KERNEL_STACK_GET', kernel_stack_get)
    224 if skip:
    225     # don't record the rip, as we won't use it
    226     bpf_text = bpf_text.replace('DO_KERNEL_RIP', '0')
    227 else:
    228     # rip is used to skip interrupt infrastructure frames
    229     bpf_text = bpf_text.replace('DO_KERNEL_RIP', '1')
    230 
    231 # header
    232 if not args.folded:
    233     print("Sampling at %d Hertz of %s by %s stack" %
    234         (args.frequency, thread_context, stack_context), end="")
    235     if duration < 99999999:
    236         print(" for %d secs." % duration)
    237     else:
    238         print("... Hit Ctrl-C to end.")
    239 
    240 # kprobe perf_misc_flags()
    241 bpf_text = bpf_text.replace('PERF_TRACE_EVENT',
    242     'int kprobe__perf_misc_flags(struct pt_regs *args)')
    243 bpf_text = bpf_text.replace('REGS_LOCATION', 'PT_REGS_PARM1(args)')
    244 if debug:
    245     print(bpf_text)
    246 
    247 # initialize BPF
    248 try:
    249     b = BPF(text=bpf_text)
    250 except:
    251     print("BPF initialization failed. perf_misc_flags() may be inlined in " +
    252         "your kernel build.\nThis tool will be updated in the future to " +
    253         "support Linux 4.9, which has reliable profiling support. Exiting.")
    254     exit()
    255 
    256 # signal handler
    257 def signal_ignore(signal, frame):
    258     print()
    259 
    260 #
    261 # Setup perf_events
    262 #
    263 
    264 # use perf_events to sample
    265 try:
    266     Perf.perf_event_open(0, pid=-1, ptype=Perf.PERF_TYPE_SOFTWARE,
    267         freq=args.frequency)
    268 except:
    269     print("ERROR: initializing perf_events for sampling.\n"
    270         "To debug this, try running the following command:\n"
    271         "    perf record -F 49 -e cpu-clock %s -- sleep 1\n"
    272         "If that also doesn't work, fix it first." % perf_filter, file=stderr)
    273     exit(0)
    274 
    275 #
    276 # Output Report
    277 #
    278 
    279 # collect samples
    280 try:
    281     sleep(duration)
    282 except KeyboardInterrupt:
    283     # as cleanup can take some time, trap Ctrl-C:
    284     signal.signal(signal.SIGINT, signal_ignore)
    285 
    286 if not args.folded:
    287     print()
    288 
    289 def aksym(addr):
    290     if args.annotations:
    291         return b.ksym(addr) + "_[k]"
    292     else:
    293         return b.ksym(addr)
    294 
# output stacks
missing_stacks = 0    # keys whose stack id was an error code (not -EFAULT)
has_enomem = False    # saw -ENOMEM: the stack_traces map was too small
counts = b.get_table("counts")
stack_traces = b.get_table("stack_traces")
# iterate stacks from least to most frequent
for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
    # handle get_stackid errors: a negative stack id holds a negated errno.
    # -EFAULT is expected (e.g. no user stack for kernel threads), so it is
    # not counted as a missing stack.
    if (not args.user_stacks_only and k.kernel_stack_id < 0 and
            k.kernel_stack_id != -errno.EFAULT) or \
            (not args.kernel_stacks_only and k.user_stack_id < 0 and
            k.user_stack_id != -errno.EFAULT):
        missing_stacks += 1
        # check for an ENOMEM error
        if k.kernel_stack_id == -errno.ENOMEM or \
                k.user_stack_id == -errno.ENOMEM:
            has_enomem = True

    # walk() yields frame addresses; negative ids mean no stack was stored
    user_stack = [] if k.user_stack_id < 0 else \
        stack_traces.walk(k.user_stack_id)
    kernel_tmp = [] if k.kernel_stack_id < 0 else \
        stack_traces.walk(k.kernel_stack_id)

    # fix kernel stack: strip the interrupt-framework frames that sit above
    # the code that was actually interrupted (see header comment)
    kernel_stack = []
    if k.kernel_stack_id >= 0:
        if skip:
            # fixed skip: drop exactly `skip` leading frames (-S)
            for addr in kernel_tmp:
                kernel_stack.append(addr)
            kernel_stack = kernel_stack[skip:]
        else:
            # skip the interrupt framework stack by searching for our RIP
            skipping = 1
            for addr in kernel_tmp:
                if k.kernel_ret_ip == addr:
                    skipping = 0
                if not skipping:
                    kernel_stack.append(addr)
        if k.kernel_ip:
            # prepend the sampled instruction pointer as the true top frame
            kernel_stack.insert(0, k.kernel_ip)

    # only emit the user/kernel delimiter when a kernel stack survived
    do_delimiter = need_delimiter and kernel_stack

    if args.folded:
        # print folded stack output
        user_stack = list(user_stack)
        kernel_stack = list(kernel_stack)
        # one line per stack: comm;frames...;frames count (leaf frame last)
        line = [k.name.decode('utf-8', 'replace')] + \
            [b.sym(addr, k.pid) for addr in reversed(user_stack)] + \
            (do_delimiter and ["-"] or []) + \
            [aksym(addr) for addr in reversed(kernel_stack)]
        print("%s %d" % (";".join(line), v.value))
    else:
        # print default multi-line stack output.
        for addr in kernel_stack:
            print("    %s" % aksym(addr))
        if do_delimiter:
            print("    --")
        for addr in user_stack:
            print("    %s" % b.sym(addr, k.pid))
        print("    %-16s %s (%d)" % ("-", k.name, k.pid))
        print("        %d\n" % v.value)
    357 
    358 # check missing
    359 if missing_stacks > 0:
    360     enomem_str = "" if not has_enomem else \
    361         " Consider increasing --stack-storage-size."
    362     print("WARNING: %d stack traces could not be displayed.%s" %
    363         (missing_stacks, enomem_str),
    364         file=stderr)
    365