Home | History | Annotate | Download | only in tools
      1 #!/usr/bin/python
      2 # @lint-avoid-python-3-compatibility-imports
      3 #
      4 # cpuunclaimed   Sample CPU run queues and calculate unclaimed idle CPU.
      5 #                For Linux, uses BCC, eBPF.
      6 #
# This samples the length of the run queues and determines when there are idle
      8 # CPUs, yet queued threads waiting their turn. Report the amount of idle
      9 # (yet unclaimed by waiting threads) CPU as a system-wide percentage.
     10 #
     11 # This situation can happen for a number of reasons:
     12 #
     13 # - An application has been bound to some, but not all, CPUs, and has runnable
     14 #   threads that cannot migrate to other CPUs due to this configuration.
     15 # - CPU affinity: an optimization that leaves threads on CPUs where the CPU
     16 #   caches are warm, even if this means short periods of waiting while other
#   CPUs are idle. The wait period is tunable (see sysctl, kernel.sched*).
     18 # - Scheduler bugs.
     19 #
     20 # An unclaimed idle of < 1% is likely to be CPU affinity, and not usually a
     21 # cause for concern. By leaving the CPU idle, overall throughput of the system
     22 # may be improved. This tool is best for identifying larger issues, > 2%, due
     23 # to the coarseness of its 99 Hertz samples.
     24 #
     25 # This is an experimental tool that currently works by use of sampling to
     26 # keep overheads low. Tool assumptions:
     27 #
     28 # - CPU samples consistently fire around the same offset. There will sometimes
     29 #   be a lag as a sample is delayed by higher-priority interrupts, but it is
     30 #   assumed the subsequent samples will catch up to the expected offsets (as
     31 #   is seen in practice). You can use -J to inspect sample offsets. Some
     32 #   systems can power down CPUs when idle, and when they wake up again they
     33 #   may begin firing at a skewed offset: this tool will detect the skew, print
     34 #   an error, and exit.
     35 # - All CPUs are online (see ncpu).
     36 #
     37 # If this identifies unclaimed CPU, you can double check it by dumping raw
     38 # samples (-j), as well as using other tracing tools to instrument scheduler
     39 # events (although this latter approach has much higher overhead).
     40 #
     41 # This tool passes all sampled events to user space for post processing.
# I originally wrote this to do the calculations entirely in kernel context,
     43 # and only pass a summary. That involves a number of challenges, and the
     44 # overhead savings may not outweigh the caveats. You can see my WIP here:
     45 # https://gist.github.com/brendangregg/731cf2ce54bf1f9a19d4ccd397625ad9
     46 #
     47 # USAGE: cpuunclaimed [-h] [-j] [-J] [-T] [interval] [count]
     48 #
     49 # If you see "Lost 1881 samples" warnings, try increasing wakeup_hz.
     50 #
     51 # REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is
     52 # a version of this tool that may work on Linux 4.6 - 4.8.
     53 #
     54 # Copyright 2016 Netflix, Inc.
     55 # Licensed under the Apache License, Version 2.0 (the "License")
     56 #
     57 # 20-Dec-2016   Brendan Gregg   Created this.
     58 
     59 from __future__ import print_function
     60 from bcc import BPF, PerfType, PerfSWConfig
     61 from time import sleep, strftime
     62 from ctypes import c_int
     63 import argparse
     64 import multiprocessing
     65 from os import getpid, system
     66 import ctypes as ct
     67 
     68 # arguments
     69 examples = """examples:
     70     ./cpuunclaimed            # sample and calculate unclaimed idle CPUs,
     71                               # output every 1 second (default)
     72     ./cpuunclaimed 5 10       # print 5 second summaries, 10 times
     73     ./cpuunclaimed -T 1       # 1s summaries and timestamps
     74     ./cpuunclaimed -j         # raw dump of all samples (verbose), CSV
     75 """
     76 parser = argparse.ArgumentParser(
     77     description="Sample CPU run queues and calculate unclaimed idle CPU",
     78     formatter_class=argparse.RawDescriptionHelpFormatter,
     79     epilog=examples)
     80 parser.add_argument("-j", "--csv", action="store_true",
     81     help="print sample summaries (verbose) as comma-separated values")
     82 parser.add_argument("-J", "--fullcsv", action="store_true",
     83     help="print sample summaries with extra fields: CPU sample offsets")
     84 parser.add_argument("-T", "--timestamp", action="store_true",
     85     help="include timestamp on output")
     86 parser.add_argument("interval", nargs="?", default=-1,
     87     help="output interval, in seconds")
     88 parser.add_argument("count", nargs="?", default=99999999,
     89     help="number of outputs")
     90 parser.add_argument("--ebpf", action="store_true",
     91     help=argparse.SUPPRESS)
     92 args = parser.parse_args()
     93 countdown = int(args.count)
     94 frequency = 99
     95 dobind = 1
     96 wakeup_hz = 10                      # frequency to read buffers
     97 wakeup_s = float(1) / wakeup_hz
     98 ncpu = multiprocessing.cpu_count()  # assume all are online
     99 debug = 0
    100 
    101 # process arguments
    102 if args.fullcsv:
    103     args.csv = True
    104 if args.csv:
    105     interval = 0.2
    106 if args.interval != -1 and (args.fullcsv or args.csv):
    107     print("ERROR: cannot use interval with either -j or -J. Exiting.")
    108     exit()
    109 if args.interval == -1:
    110     args.interval = "1"
    111 interval = float(args.interval)
    112 
# define BPF program
# On every perf sample this C program reads the current CPU's run queue
# length from task->se.cfs_rq->nr_running (an unstable kernel-internal
# layout, mirrored by struct cfs_rq_partial below) and emits one
# {timestamp, cpu, run queue length} record to the "events" perf buffer,
# where user space (print_event) collects it for post-processing.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <uapi/linux/bpf_perf_event.h>
#include <linux/sched.h>

struct data_t {
    u64 ts;
    u64 cpu;
    u64 len;
};

BPF_PERF_OUTPUT(events);

// Declare enough of cfs_rq to find nr_running, since we can't #import the
// header. This will need maintenance. It is from kernel/sched/sched.h:
struct cfs_rq_partial {
    struct load_weight load;
    unsigned int nr_running, h_nr_running;
};

int do_perf_event(struct bpf_perf_event_data *ctx)
{
    int cpu = bpf_get_smp_processor_id();
    u64 now = bpf_ktime_get_ns();

    /*
     * Fetch the run queue length from task->se.cfs_rq->nr_running. This is an
     * unstable interface and may need maintenance. Perhaps a future version
     * of BPF will support task_rq(p) or something similar as a more reliable
     * interface.
     */
    unsigned int len = 0;
    struct task_struct *task = NULL;
    struct cfs_rq_partial *my_q = NULL;
    task = (struct task_struct *)bpf_get_current_task();
    my_q = (struct cfs_rq_partial *)task->se.cfs_rq;
    len = my_q->nr_running;

    struct data_t data = {.ts = now, .cpu = cpu, .len = len};
    events.perf_submit(ctx, &data, sizeof(data));

    return 0;
}
"""
    158 
    159 # code substitutions
    160 if debug or args.ebpf:
    161     print(bpf_text)
    162     if args.ebpf:
    163         exit()
    164 
    165 # initialize BPF & perf_events
    166 b = BPF(text=bpf_text)
    167 # TODO: check for HW counters first and use if more accurate
    168 b.attach_perf_event(ev_type=PerfType.SOFTWARE,
    169     ev_config=PerfSWConfig.TASK_CLOCK, fn_name="do_perf_event",
    170     sample_period=0, sample_freq=frequency)
    171 
    172 if args.csv:
    173     if args.timestamp:
    174         print("TIME", end=",")
    175     print("TIMESTAMP_ns", end=",")
    176     print(",".join("CPU" + str(c) for c in range(ncpu)), end="")
    177     if args.fullcsv:
    178         print(",", end="")
    179         print(",".join("OFFSET_ns_CPU" + str(c) for c in range(ncpu)), end="")
    180     print()
    181 else:
    182     print(("Sampling run queues... Output every %s seconds. " +
    183           "Hit Ctrl-C to end.") % args.interval)
    184 class Data(ct.Structure):
    185     _fields_ = [
    186         ("ts", ct.c_ulonglong),
    187         ("cpu", ct.c_ulonglong),
    188         ("len", ct.c_ulonglong)
    189     ]
    190 
    191 samples = {}
    192 group = {}
    193 last = 0
    194 
    195 # process event
    196 def print_event(cpu, data, size):
    197     event = ct.cast(data, ct.POINTER(Data)).contents
    198     samples[event.ts] = {}
    199     samples[event.ts]['cpu'] = event.cpu
    200     samples[event.ts]['len'] = event.len
    201 
# exiting is set when it is time to stop; by this point args.interval is
# always a non-empty string, so this normally starts at 0 and is later set
# on Ctrl-C in the main loop
exiting = 0 if args.interval else 1
slept = float(0)    # time accumulated towards the next output interval

# Choose the elapsed time from one sample group to the next that identifies a
# new sample group (a group being a set of samples from all CPUs). The
# earliest timestamp is compared in each group. This trigger is also used
# for sanity testing, if a group's samples exceed half this value.
trigger = int(0.8 * (1000000000 / frequency))
    210 
# read events: poll the perf buffer, batch samples into per-tick groups, and
# either emit CSV rows (-j/-J) or an interval summary of unclaimed idle CPU
b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
    # allow some buffering by calling sleep(), to reduce the context switch
    # rate and lower overhead.
    try:
        if not exiting:
            sleep(wakeup_s)
    except KeyboardInterrupt:
        exiting = 1
    b.perf_buffer_poll()
    slept += wakeup_s

    # only produce output once a full interval has accumulated
    if slept < 0.999 * interval:   # floating point workaround
        continue
    slept = 0

    positive = 0  # number of samples where an idle CPU could have run work
    running = 0   # accumulated count of samples with len > 0
    idle = 0      # accumulated count of idle-CPU samples (ncpu - running)
    if debug >= 2:
        print("DEBUG: begin samples loop, count %d" % len(samples))
    for e in sorted(samples):
        if debug >= 2:
            print("DEBUG: ts %d cpu %d len %d delta %d trig %d" % (e,
                  samples[e]['cpu'], samples[e]['len'], e - last,
                  e - last > trigger))

        # look for time jumps to identify a new sample group
        if e - last > trigger:

            # find the group's first (g_time) and last (g_max) timestamps,
            # used for -J offsets and the sanity test below
            g_time = 0
            g_max = 0
            for ge in sorted(group):
                if g_time == 0:
                    g_time = ge
                g_max = ge

            # process previous sample group
            if args.csv:
                # one CSV row per group: per-CPU run queue lengths, plus
                # (-J only) each CPU's sample offset from the group start
                lens = [0] * ncpu
                offs = [0] * ncpu
                for ge in sorted(group):
                    lens[samples[ge]['cpu']] = samples[ge]['len']
                    if args.fullcsv:
                        offs[samples[ge]['cpu']] = ge - g_time
                if g_time > 0:      # else first sample
                    if args.timestamp:
                        print("%-8s" % strftime("%H:%M:%S"), end=",")
                    print("%d" % g_time, end=",")
                    print(",".join(str(lens[c]) for c in range(ncpu)), end="")
                    if args.fullcsv:
                        print(",", end="")
                        print(",".join(str(offs[c]) for c in range(ncpu)))
                    else:
                        print()
            else:
                # calculate stats: CPUs running work, and threads queued
                # beyond the one already running on each CPU
                g_running = 0
                g_queued = 0
                for ge in group:
                    if samples[ge]['len'] > 0:
                        g_running += 1
                    if samples[ge]['len'] > 1:
                        g_queued += samples[ge]['len'] - 1
                g_idle = ncpu - g_running

                # calculate the number of threads that could have run as the
                # minimum of idle and queued
                if g_idle > 0 and g_queued > 0:
                    if g_queued > g_idle:
                        i = g_idle
                    else:
                        i = g_queued
                    positive += i
                running += g_running
                idle += g_idle

            # now sanity test, after -J output
            g_range = g_max - g_time
            if g_range > trigger / 2:
                # if a sample group exceeds half the interval, we can no
                # longer draw conclusions about some CPUs idle while others
                # have queued work. Error and exit. This can happen when
                # CPUs power down, then start again on different offsets.
                # TODO: Since this is a sampling tool, an error margin should
                # be anticipated, so an improvement may be to bump a counter
                # instead of exiting, and only exit if this counter shows
                # a skewed sample rate of over, say, 1%. Such an approach
                # would allow a small rate of outliers (sampling error),
                # and, we could tighten the trigger to be, say, trigger / 5.
                # In the case of a power down, if it's detectable, perhaps
                # the tool could reinitialize the timers (although exiting
                # is simple and works).
                print(("ERROR: CPU samples arrived at skewed offsets " +
                      "(CPUs may have powered down when idle), " +
                      "spanning %d ns (expected < %d ns). Debug with -J, " +
                      "and see the man page. As output may begin to be " +
                      "unreliable, exiting.") % (g_range, trigger / 2))
                exit()

            # these are done, remove
            for ge in sorted(group):
                del samples[ge]

            # begin next group
            group = {}
            last = e

        # stash this timestamp in a sample group dict
        group[e] = 1

    if not args.csv:
        total = running + idle
        unclaimed = util = 0

        if debug:
            print("DEBUG: hit %d running %d idle %d total %d buffered %d" % (
                  positive, running, idle, total, len(samples)))

        if args.timestamp:
            print("%-8s " % strftime("%H:%M:%S"), end="")

        # output: system-wide utilization and the unclaimed idle percentage
        if total:
            unclaimed = float(positive) / total
            util = float(running) / total
        print("%%CPU %6.2f%%, unclaimed idle %0.2f%%" % (100 * util,
              100 * unclaimed))

    countdown -= 1
    if exiting or countdown == 0:
        exit()
    345