Home | History | Annotate | Download | only in tools
      1 #!/usr/bin/python
      2 # @lint-avoid-python-3-compatibility-imports
      3 #
      4 # funcslower  Trace slow kernel or user function calls.
      5 #             For Linux, uses BCC, eBPF.
      6 #
      7 # USAGE: funcslower [-h] [-p PID] [-m MIN_MS] [-u MIN_US] [-a ARGUMENTS]
      8 #                   [-T] [-t] [-v] function [function ...]
      9 #
     10 # WARNING: This tool traces function calls by instrumenting the entry and
     11 # return from each function. For commonly-invoked functions like memory allocs
     12 # or file writes, this can be extremely expensive. Mind the overhead.
     13 #
     14 # NOTE: This tool cannot trace nested functions in the same invocation
     15 # due to instrumentation specifics, only innermost calls will be visible.
     16 #
     17 # By default, a minimum millisecond threshold of 1 is used.
     18 #
     19 # Copyright 2017, Sasha Goldshtein
     20 # Licensed under the Apache License, Version 2.0 (the "License")
     21 #
     22 # 30-Mar-2017   Sasha Goldshtein    Created this.
     23 
     24 from __future__ import print_function
     25 from bcc import BPF
     26 import argparse
     27 import ctypes as ct
     28 import time
     29 
     30 examples = """examples:
     31   ./funcslower vfs_write        # trace vfs_write calls slower than 1ms
     32   ./funcslower -m 10 vfs_write  # same, but slower than 10ms
     33   ./funcslower -u 10 c:open     # trace open calls slower than 10us
     34   ./funcslower -p 135 c:open    # trace pid 135 only
     35   ./funcslower c:malloc c:free  # trace both malloc and free slower than 1ms
     36   ./funcslower -a 2 c:open      # show first two arguments to open
     37   ./funcslower -UK -m 10 c:open # Show user and kernel stack frame of open calls slower than 10ms
     38   ./funcslower -f -UK c:open    # Output in folded format for flame graphs
     39 """
     40 parser = argparse.ArgumentParser(
     41     description="Trace slow kernel or user function calls.",
     42     formatter_class=argparse.RawDescriptionHelpFormatter,
     43     epilog=examples)
     44 parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="tgid",
     45     help="trace this PID only")
     46 parser.add_argument("-m", "--min-ms", type=float, dest="min_ms",
     47     help="minimum duration to trace (ms)")
     48 parser.add_argument("-u", "--min-us", type=float, dest="min_us",
     49     help="minimum duration to trace (us)")
     50 parser.add_argument("-a", "--arguments", type=int,
     51     help="print this many entry arguments, as hex")
     52 parser.add_argument("-T", "--time", action="store_true",
     53     help="show HH:MM:SS timestamp")
     54 parser.add_argument("-t", "--timestamp", action="store_true",
     55     help="show timestamp in seconds at us resolution")
     56 parser.add_argument("-v", "--verbose", action="store_true",
     57     help="print the BPF program for debugging purposes")
     58 parser.add_argument(metavar="function", nargs="+", dest="functions",
     59     help="function(s) to trace")
     60 parser.add_argument("--ebpf", action="store_true",
     61     help=argparse.SUPPRESS)
     62 parser.add_argument("-f", "--folded", action="store_true",
     63     help="output folded format, one line per stack (for flame graphs)")
     64 parser.add_argument("-U", "--user-stack",
     65   action="store_true", help="output user stack trace")
     66 parser.add_argument("-K", "--kernel-stack",
     67   action="store_true", help="output kernel stack trace")
     68 
     69 args = parser.parse_args()
     70 # fractions are allowed, but rounded to an integer nanosecond
     71 if args.min_ms:
     72     duration_ns = int(args.min_ms * 1000000)
     73 elif args.min_us:
     74     duration_ns = int(args.min_us * 1000)
     75 else:
     76     duration_ns = 1000000   # default to 1ms
     77 
     78 bpf_text = """
     79 #include <uapi/linux/ptrace.h>
     80 #include <linux/sched.h>    // for TASK_COMM_LEN
     81 
     82 struct entry_t {
     83     u64 id;
     84     u64 start_ns;
     85 #ifdef GRAB_ARGS
     86     u64 args[6];
     87 #endif
     88 };
     89 
     90 struct data_t {
     91     u64 id;
     92     u64 tgid_pid;
     93     u64 start_ns;
     94     u64 duration_ns;
     95     u64 retval;
     96     char comm[TASK_COMM_LEN];
     97 #ifdef USER_STACKS
     98     int user_stack_id;
     99 #endif
    100 #ifdef KERNEL_STACKS
    101     int kernel_stack_id;
    102     u64 kernel_ip;
    103 #endif
    104 #ifdef GRAB_ARGS
    105     u64 args[6];
    106 #endif
    107 };
    108 
    109 BPF_HASH(entryinfo, u64, struct entry_t);
    110 BPF_PERF_OUTPUT(events);
    111 
    112 #if defined(USER_STACKS) || defined(KERNEL_STACKS)
    113 BPF_STACK_TRACE(stacks, 2048);
    114 #endif
    115 
    116 static int trace_entry(struct pt_regs *ctx, int id)
    117 {
    118     u64 tgid_pid = bpf_get_current_pid_tgid();
    119     u32 tgid = tgid_pid >> 32;
    120     if (TGID_FILTER)
    121         return 0;
    122 
    123     u32 pid = tgid_pid;
    124 
    125     struct entry_t entry = {};
    126     entry.start_ns = bpf_ktime_get_ns();
    127     entry.id = id;
    128 #ifdef GRAB_ARGS
    129     entry.args[0] = PT_REGS_PARM1(ctx);
    130     entry.args[1] = PT_REGS_PARM2(ctx);
    131     entry.args[2] = PT_REGS_PARM3(ctx);
    132     entry.args[3] = PT_REGS_PARM4(ctx);
    133     entry.args[4] = PT_REGS_PARM5(ctx);
    134     entry.args[5] = PT_REGS_PARM6(ctx);
    135 #endif
    136 
    137     entryinfo.update(&tgid_pid, &entry);
    138 
    139     return 0;
    140 }
    141 
    142 int trace_return(struct pt_regs *ctx)
    143 {
    144     struct entry_t *entryp;
    145     u64 tgid_pid = bpf_get_current_pid_tgid();
    146 
    147     entryp = entryinfo.lookup(&tgid_pid);
    148     if (entryp == 0) {
    149         return 0;
    150     }
    151 
    152     u64 delta_ns = bpf_ktime_get_ns() - entryp->start_ns;
    153     entryinfo.delete(&tgid_pid);
    154 
    155     if (delta_ns < DURATION_NS)
    156         return 0;
    157 
    158     struct data_t data = {};
    159     data.id = entryp->id;
    160     data.tgid_pid = tgid_pid;
    161     data.start_ns = entryp->start_ns;
    162     data.duration_ns = delta_ns;
    163     data.retval = PT_REGS_RC(ctx);
    164 
    165 #ifdef USER_STACKS
    166     data.user_stack_id = stacks.get_stackid(ctx, BPF_F_USER_STACK);
    167 #endif
    168 
    169 #ifdef KERNEL_STACKS
    170     data.kernel_stack_id = stacks.get_stackid(ctx, 0);
    171 
    172     if (data.kernel_stack_id >= 0) {
    173         u64 ip = PT_REGS_IP(ctx);
    174         u64 page_offset;
    175 
    176         // if ip isn't sane, leave key ips as zero for later checking
    177 #if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE)
    178         // x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it
    179         page_offset = __PAGE_OFFSET_BASE;
    180 #elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4)
    181         // x64, 4.17, and later
    182 #if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL)
    183         page_offset = __PAGE_OFFSET_BASE_L5;
    184 #else
    185         page_offset = __PAGE_OFFSET_BASE_L4;
    186 #endif
    187 #else
    188         // earlier x86_64 kernels, e.g., 4.6, comes here
    189         // arm64, s390, powerpc, x86_32
    190         page_offset = PAGE_OFFSET;
    191 #endif
    192 
    193         if (ip > page_offset) {
    194             data.kernel_ip = ip;
    195         }
    196     }
    197 #endif
    198 
    199 #ifdef GRAB_ARGS
    200     bpf_probe_read(&data.args[0], sizeof(data.args), entryp->args);
    201 #endif
    202     bpf_get_current_comm(&data.comm, sizeof(data.comm));
    203     events.perf_submit(ctx, &data, sizeof(data));
    204 
    205     return 0;
    206 }
    207 """
    208 
    209 bpf_text = bpf_text.replace('DURATION_NS', str(duration_ns))
    210 if args.arguments:
    211     bpf_text = "#define GRAB_ARGS\n" + bpf_text
    212 if args.user_stack:
    213     bpf_text = "#define USER_STACKS\n" + bpf_text
    214 if args.kernel_stack:
    215     bpf_text = "#define KERNEL_STACKS\n" + bpf_text
    216 if args.tgid:
    217     bpf_text = bpf_text.replace('TGID_FILTER', 'tgid != %d' % args.tgid)
    218 else:
    219     bpf_text = bpf_text.replace('TGID_FILTER', '0')
    220 
    221 for i in range(len(args.functions)):
    222     bpf_text += """
    223 int trace_%d(struct pt_regs *ctx) {
    224     return trace_entry(ctx, %d);
    225 }
    226 """ % (i, i)
    227 
    228 if args.verbose or args.ebpf:
    229     print(bpf_text)
    230     if args.ebpf:
    231         exit()
    232 
    233 b = BPF(text=bpf_text)
    234 
    235 for i, function in enumerate(args.functions):
    236     if ":" in function:
    237         library, func = function.split(":")
    238         b.attach_uprobe(name=library, sym=func, fn_name="trace_%d" % i)
    239         b.attach_uretprobe(name=library, sym=func, fn_name="trace_return")
    240     else:
    241         b.attach_kprobe(event=function, fn_name="trace_%d" % i)
    242         b.attach_kretprobe(event=function, fn_name="trace_return")
    243 
    244 TASK_COMM_LEN = 16  # linux/sched.h
    245 
    246 class Data(ct.Structure):
    247     _fields_ = [
    248         ("id", ct.c_ulonglong),
    249         ("tgid_pid", ct.c_ulonglong),
    250         ("start_ns", ct.c_ulonglong),
    251         ("duration_ns", ct.c_ulonglong),
    252         ("retval", ct.c_ulonglong),
    253         ("comm", ct.c_char * TASK_COMM_LEN)
    254     ] + ([("args", ct.c_ulonglong * 6)] if args.arguments else []) + \
    255             ([("user_stack_id", ct.c_int)] if args.user_stack else []) + \
    256             ([("kernel_stack_id", ct.c_int),("kernel_ip", ct.c_ulonglong)] if args.kernel_stack else [])
    257 
    258 time_designator = "us" if args.min_us else "ms"
    259 time_value = args.min_us or args.min_ms or 1
    260 time_multiplier = 1000 if args.min_us else 1000000
    261 time_col = args.time or args.timestamp
    262 
    263 # Do not print header when folded
    264 if not args.folded:
    265     print("Tracing function calls slower than %g %s... Ctrl+C to quit." %
    266           (time_value, time_designator))
    267     print((("%-10s " % "TIME" if time_col else "") + "%-14s %-6s %7s %16s %s") %
    268         ("COMM", "PID", "LAT(%s)" % time_designator, "RVAL",
    269         "FUNC" + (" ARGS" if args.arguments else "")))
    270 
    271 earliest_ts = 0
    272 
    273 def time_str(event):
    274     if args.time:
    275         return "%-10s " % time.strftime("%H:%M:%S")
    276     if args.timestamp:
    277         global earliest_ts
    278         if earliest_ts == 0:
    279             earliest_ts = event.start_ns
    280         return "%-10.6f " % ((event.start_ns - earliest_ts) / 1000000000.0)
    281     return ""
    282 
    283 def args_str(event):
    284     if not args.arguments:
    285         return ""
    286     return str.join(" ", ["0x%x" % arg for arg in event.args[:args.arguments]])
    287 
    288 def print_stack(event):
    289     user_stack = []
    290     stack_traces = b.get_table("stacks")
    291 
    292     if args.user_stack and event.user_stack_id > 0:
    293         user_stack = stack_traces.walk(event.user_stack_id)
    294 
    295     kernel_stack = []
    296     if args.kernel_stack and event.kernel_stack_id > 0:
    297         kernel_tmp = stack_traces.walk(event.kernel_stack_id)
    298 
    299         # fix kernel stack
    300         for addr in kernel_tmp:
    301             kernel_stack.append(addr)
    302 
    303     do_delimiter = user_stack and kernel_stack
    304 
    305     if args.folded:
    306         # print folded stack output
    307         user_stack = list(user_stack)
    308         kernel_stack = list(kernel_stack)
    309         line = [event.comm.decode('utf-8', 'replace')] + \
    310             [b.sym(addr, event.tgid_pid) for addr in reversed(user_stack)] + \
    311             (do_delimiter and ["-"] or []) + \
    312             [b.ksym(addr) for addr in reversed(kernel_stack)]
    313         print("%s %d" % (";".join(line), 1))
    314     else:
    315         # print default multi-line stack output.
    316         for addr in kernel_stack:
    317             print("    %s" % b.ksym(addr))
    318         for addr in user_stack:
    319             print("    %s" % b.sym(addr, event.tgid_pid))
    320 
    321 def print_event(cpu, data, size):
    322     event = ct.cast(data, ct.POINTER(Data)).contents
    323     ts = float(event.duration_ns) / time_multiplier
    324     if not args.folded:
    325         print((time_str(event) + "%-14.14s %-6s %7.2f %16x %s %s") %
    326             (event.comm.decode('utf-8', 'replace'), event.tgid_pid >> 32,
    327              ts, event.retval, args.functions[event.id], args_str(event)))
    328     if args.user_stack or args.kernel_stack:
    329         print_stack(event)
    330 
    331 b["events"].open_perf_buffer(print_event, page_cnt=64)
    332 while True:
    333     b.perf_buffer_poll()
    334