#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# cpuunclaimed   Sample CPU run queues and calculate unclaimed idle CPU.
#                For Linux, uses BCC, eBPF.
#
# This samples the length of the run queues and determines when there are
# idle CPUs, yet queued threads waiting their turn. It reports the amount of
# idle (yet unclaimed by waiting threads) CPU as a system-wide percentage.
#
# This situation can happen for a number of reasons:
#
# - An application has been bound to some, but not all, CPUs, and has runnable
#   threads that cannot migrate to other CPUs due to this configuration.
# - CPU affinity: an optimization that leaves threads on CPUs where the CPU
#   caches are warm, even if this means short periods of waiting while other
#   CPUs are idle. The wait period is tunable (see sysctl, kernel.sched*).
# - Scheduler bugs.
#
# An unclaimed idle of < 1% is likely to be CPU affinity, and not usually a
# cause for concern. By leaving the CPU idle, overall throughput of the system
# may be improved. This tool is best for identifying larger issues, > 2%, due
# to the coarseness of its 99 Hertz samples.
#
# This is an experimental tool that currently works by use of sampling to
# keep overheads low. Tool assumptions:
#
# - CPU samples consistently fire around the same offset. There will sometimes
#   be a lag as a sample is delayed by higher-priority interrupts, but it is
#   assumed the subsequent samples will catch up to the expected offsets (as
#   is seen in practice). You can use -J to inspect sample offsets. Some
#   systems can power down CPUs when idle, and when they wake up again they
#   may begin firing at a skewed offset: this tool will detect the skew, print
#   an error, and exit.
# - All CPUs are online (see ncpu).
#
# If this identifies unclaimed CPU, you can double check it by dumping raw
# samples (-j), as well as using other tracing tools to instrument scheduler
# events (although this latter approach has much higher overhead).
#
# This tool passes all sampled events to user space for post processing.
# I originally wrote this to do the calculations entirely in kernel context,
# and only pass a summary. That involves a number of challenges, and the
# overhead savings may not outweigh the caveats. You can see my WIP here:
# https://gist.github.com/brendangregg/731cf2ce54bf1f9a19d4ccd397625ad9
#
# USAGE: cpuunclaimed [-h] [-j] [-J] [-T] [interval] [count]
#
# If you see "Lost 1881 samples" warnings, try increasing wakeup_hz.
#
# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is
# a version of this tool that may work on Linux 4.6 - 4.8.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 20-Dec-2016   Brendan Gregg   Created this.
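#
# Worked example of the approach (illustrative numbers only, not measured
# output): on an 8-CPU system, if one sample group finds 5 CPUs running,
# 3 CPUs idle, and 2 extra threads queued on the busy CPUs, then
# min(idle, queued) = 2 of those samples are counted as unclaimed: work
# existed that an idle CPU could have run. These counts are summed over the
# interval and reported as a percentage of all samples.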

from __future__ import print_function
from bcc import BPF, PerfType, PerfSWConfig
from time import sleep, strftime
from ctypes import c_int
import argparse
import multiprocessing
from os import getpid, system
import ctypes as ct

# arguments
examples = """examples:
    ./cpuunclaimed            # sample and calculate unclaimed idle CPUs,
                              # output every 1 second (default)
    ./cpuunclaimed 5 10       # print 5 second summaries, 10 times
    ./cpuunclaimed -T 1       # 1s summaries and timestamps
    ./cpuunclaimed -j         # raw dump of all samples (verbose), CSV
"""
parser = argparse.ArgumentParser(
    description="Sample CPU run queues and calculate unclaimed idle CPU",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-j", "--csv", action="store_true",
    help="print sample summaries (verbose) as comma-separated values")
parser.add_argument("-J", "--fullcsv", action="store_true",
    help="print sample summaries with extra fields: CPU sample offsets")
parser.add_argument("-T", "--timestamp", action="store_true",
    help="include timestamp on output")
parser.add_argument("interval", nargs="?", default=-1,
    help="output interval, in seconds")
parser.add_argument("count", nargs="?", default=99999999,
    help="number of outputs")
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
countdown = int(args.count)
frequency = 99
dobind = 1
wakeup_hz = 10                      # frequency to read buffers
wakeup_s = float(1) / wakeup_hz
ncpu = multiprocessing.cpu_count()  # assume all are online
debug = 0

# process arguments
if args.fullcsv:
    args.csv = True
if args.csv:
    interval = 0.2
if args.interval != -1 and (args.fullcsv or args.csv):
    print("ERROR: cannot use interval with either -j or -J. Exiting.")
    exit()
if args.interval == -1:
    args.interval = "1"
interval = float(args.interval)

# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <uapi/linux/bpf_perf_event.h>
#include <linux/sched.h>

struct data_t {
    u64 ts;
    u64 cpu;
    u64 len;
};

BPF_PERF_OUTPUT(events);

// Declare enough of cfs_rq to find nr_running, since we can't #import the
// header. This will need maintenance. It is from kernel/sched/sched.h:
struct cfs_rq_partial {
    struct load_weight load;
    unsigned int nr_running, h_nr_running;
};

int do_perf_event(struct bpf_perf_event_data *ctx)
{
    int cpu = bpf_get_smp_processor_id();
    u64 now = bpf_ktime_get_ns();

    /*
     * Fetch the run queue length from task->se.cfs_rq->nr_running. This is an
     * unstable interface and may need maintenance. Perhaps a future version
     * of BPF will support task_rq(p) or something similar as a more reliable
     * interface.
     */
    unsigned int len = 0;
    struct task_struct *task = NULL;
    struct cfs_rq_partial *my_q = NULL;
    task = (struct task_struct *)bpf_get_current_task();
    my_q = (struct cfs_rq_partial *)task->se.cfs_rq;
    len = my_q->nr_running;

    struct data_t data = {.ts = now, .cpu = cpu, .len = len};
    events.perf_submit(ctx, &data, sizeof(data));

    return 0;
}
"""

# code substitutions
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

# initialize BPF & perf_events
b = BPF(text=bpf_text)
# TODO: check for HW counters first and use if more accurate
b.attach_perf_event(ev_type=PerfType.SOFTWARE,
    ev_config=PerfSWConfig.TASK_CLOCK, fn_name="do_perf_event",
    sample_period=0, sample_freq=frequency)

if args.csv:
    if args.timestamp:
        print("TIME", end=",")
    print("TIMESTAMP_ns", end=",")
    print(",".join("CPU" + str(c) for c in range(ncpu)), end="")
    if args.fullcsv:
        print(",", end="")
        print(",".join("OFFSET_ns_CPU" + str(c) for c in range(ncpu)), end="")
    print()
else:
    print(("Sampling run queues... Output every %s seconds. " +
        "Hit Ctrl-C to end.") % args.interval)

class Data(ct.Structure):
    _fields_ = [
        ("ts", ct.c_ulonglong),
        ("cpu", ct.c_ulonglong),
        ("len", ct.c_ulonglong)
    ]

samples = {}
group = {}
last = 0

# process event
def print_event(cpu, data, size):
    event = ct.cast(data, ct.POINTER(Data)).contents
    samples[event.ts] = {}
    samples[event.ts]['cpu'] = event.cpu
    samples[event.ts]['len'] = event.len

exiting = 0 if args.interval else 1
slept = float(0)

# Choose the elapsed time from one sample group to the next that identifies a
# new sample group (a group being a set of samples from all CPUs). The
# earliest timestamp is compared in each group. This trigger is also used
# for sanity testing, if a group's samples exceed half this value.
trigger = int(0.8 * (1000000000 / frequency))

# read events
b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
    # allow some buffering by calling sleep(), to reduce the context switch
    # rate and lower overhead.
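    # At sample_freq=99 Hz per CPU and wakeup_hz=10, each poll below should
    # drain roughly 99 / 10 ~= 10 samples per CPU from the perf ring buffers,
    # assuming no samples were lost (see the "Lost ... samples" note above).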
    try:
        if not exiting:
            sleep(wakeup_s)
    except KeyboardInterrupt:
        exiting = 1
    b.perf_buffer_poll()
    slept += wakeup_s

    if slept < 0.999 * interval:    # floating point workaround
        continue
    slept = 0

    positive = 0    # number of samples where an idle CPU could have run work
    running = 0
    idle = 0
    if debug >= 2:
        print("DEBUG: begin samples loop, count %d" % len(samples))
    for e in sorted(samples):
        if debug >= 2:
            print("DEBUG: ts %d cpu %d len %d delta %d trig %d" % (e,
                samples[e]['cpu'], samples[e]['len'], e - last,
                e - last > trigger))

        # look for time jumps to identify a new sample group
        if e - last > trigger:

            # find the first group timestamp, and sanity test
            g_time = 0
            g_max = 0
            for ge in sorted(group):
                if g_time == 0:
                    g_time = ge
                g_max = ge

            # process previous sample group
            if args.csv:
                lens = [0] * ncpu
                offs = [0] * ncpu
                for ge in sorted(group):
                    lens[samples[ge]['cpu']] = samples[ge]['len']
                    if args.fullcsv:
                        offs[samples[ge]['cpu']] = ge - g_time
                if g_time > 0:      # else first sample
                    if args.timestamp:
                        print("%-8s" % strftime("%H:%M:%S"), end=",")
                    print("%d" % g_time, end=",")
                    print(",".join(str(lens[c]) for c in range(ncpu)), end="")
                    if args.fullcsv:
                        print(",", end="")
                        print(",".join(str(offs[c]) for c in range(ncpu)))
                    else:
                        print()
            else:
                # calculate stats
                g_running = 0
                g_queued = 0
                for ge in group:
                    if samples[ge]['len'] > 0:
                        g_running += 1
                    if samples[ge]['len'] > 1:
                        g_queued += samples[ge]['len'] - 1
                g_idle = ncpu - g_running

                # calculate the number of threads that could have run as the
                # minimum of idle and queued
                if g_idle > 0 and g_queued > 0:
                    if g_queued > g_idle:
                        i = g_idle
                    else:
                        i = g_queued
                    positive += i
                running += g_running
                idle += g_idle

            # now sanity test, after -J output
            g_range = g_max - g_time
            if g_range > trigger / 2:
                # if a sample group exceeds half the interval, we can no
                # longer draw conclusions about some CPUs idle while others
                # have queued work. Error and exit. This can happen when
                # CPUs power down, then start again on different offsets.
                # TODO: Since this is a sampling tool, an error margin should
                # be anticipated, so an improvement may be to bump a counter
                # instead of exiting, and only exit if this counter shows
                # a skewed sample rate of over, say, 1%. Such an approach
                # would allow a small rate of outliers (sampling error),
                # and we could tighten the trigger to be, say, trigger / 5.
                # In the case of a power down, if it's detectable, perhaps
                # the tool could reinitialize the timers (although exiting
                # is simple and works).
                print(("ERROR: CPU samples arrived at skewed offsets " +
                    "(CPUs may have powered down when idle), " +
                    "spanning %d ns (expected < %d ns). Debug with -J, " +
                    "and see the man page. As output may begin to be " +
                    "unreliable, exiting.") % (g_range, trigger / 2))
                exit()

            # these are done, remove
            for ge in sorted(group):
                del samples[ge]

            # begin next group
            group = {}
            last = e

        # stash this timestamp in a sample group dict
        group[e] = 1

    if not args.csv:
        total = running + idle
        unclaimed = util = 0

        if debug:
            print("DEBUG: hit %d running %d idle %d total %d buffered %d" % (
                positive, running, idle, total, len(samples)))

        if args.timestamp:
            print("%-8s " % strftime("%H:%M:%S"), end="")

        # output
        if total:
            unclaimed = float(positive) / total
            util = float(running) / total
        print("%%CPU %6.2f%%, unclaimed idle %0.2f%%" % (100 * util,
            100 * unclaimed))

    countdown -= 1
    if exiting or countdown == 0:
        exit()
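
# Illustrative arithmetic for the summary line above (example numbers, not
# measured output): a 4-CPU system sampled at 99 Hz over a 1 second interval
# yields roughly 4 * 99 = 396 samples; if running = 350, idle = 46, and
# positive = 8, then %CPU = 350 / 396 ~= 88.4% and
# unclaimed idle = 8 / 396 ~= 2.0%.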