Home | History | Annotate | Download | only in tools
      1 #!/usr/bin/python
      2 # @lint-avoid-python-3-compatibility-imports
      3 #
      4 # tcplife   Trace the lifespan of TCP sessions and summarize.
      5 #           For Linux, uses BCC, BPF. Embedded C.
      6 #
# USAGE: tcplife [-h] [-T] [-t] [-w] [-s] [-p PID] [-L LOCALPORT] [-D REMOTEPORT]
      8 #
      9 # This uses the sock:inet_sock_set_state tracepoint if it exists (added to
     10 # Linux 4.16, and replacing the earlier tcp:tcp_set_state), else it uses
     11 # kernel dynamic tracing of tcp_set_state().
     12 #
     13 # While throughput counters are emitted, they are fetched in a low-overhead
     14 # manner: reading members of the tcp_info struct on TCP close. ie, we do not
     15 # trace send/receive.
     16 #
     17 # Copyright 2016 Netflix, Inc.
     18 # Licensed under the Apache License, Version 2.0 (the "License")
     19 #
     20 # IDEA: Julia Evans
     21 #
     22 # 18-Oct-2016   Brendan Gregg   Created this.
     23 # 29-Dec-2017      "      "     Added tracepoint support.
     24 
     25 from __future__ import print_function
     26 from bcc import BPF
     27 import argparse
     28 from socket import inet_ntop, ntohs, AF_INET, AF_INET6
     29 from struct import pack
     30 import ctypes as ct
     31 from time import strftime
     32 
     33 # arguments
     34 examples = """examples:
     35     ./tcplife           # trace all TCP connect()s
     36     ./tcplife -t        # include time column (HH:MM:SS)
     37     ./tcplife -w        # wider colums (fit IPv6)
     38     ./tcplife -stT      # csv output, with times & timestamps
     39     ./tcplife -p 181    # only trace PID 181
     40     ./tcplife -L 80     # only trace local port 80
     41     ./tcplife -L 80,81  # only trace local ports 80 and 81
     42     ./tcplife -D 80     # only trace remote port 80
     43 """
     44 parser = argparse.ArgumentParser(
     45     description="Trace the lifespan of TCP sessions and summarize",
     46     formatter_class=argparse.RawDescriptionHelpFormatter,
     47     epilog=examples)
     48 parser.add_argument("-T", "--time", action="store_true",
     49     help="include time column on output (HH:MM:SS)")
     50 parser.add_argument("-t", "--timestamp", action="store_true",
     51     help="include timestamp on output (seconds)")
     52 parser.add_argument("-w", "--wide", action="store_true",
     53     help="wide column output (fits IPv6 addresses)")
     54 parser.add_argument("-s", "--csv", action="store_true",
     55     help="comma separated values output")
     56 parser.add_argument("-p", "--pid",
     57     help="trace this PID only")
     58 parser.add_argument("-L", "--localport",
     59     help="comma-separated list of local ports to trace.")
     60 parser.add_argument("-D", "--remoteport",
     61     help="comma-separated list of remote ports to trace.")
     62 parser.add_argument("--ebpf", action="store_true",
     63     help=argparse.SUPPRESS)
     64 args = parser.parse_args()
     65 debug = 0
     66 
# define BPF program
#
# Common prologue of the embedded C program; one of the two probe bodies
# (bpf_text_kprobe or bpf_text_tracepoint) is appended to it later on.
# It declares:
#   birth        hash: struct sock * -> u64 timestamp stored by the probes
#   ipv4_events  perf output carrying struct ipv4_data_t records
#   ipv6_events  perf output carrying struct ipv6_data_t records
#   whoami       hash: struct sock * -> struct id_t (cached pid and comm)
# The field order of ipv4_data_t / ipv6_data_t is repeated by the Data_ipv4
# and Data_ipv6 ctypes classes further down; keep the two in sync.
bpf_text = """
#include <uapi/linux/ptrace.h>
#define KBUILD_MODNAME "foo"
#include <linux/tcp.h>
#include <net/sock.h>
#include <bcc/proto.h>

BPF_HASH(birth, struct sock *, u64);

// separate data structs for ipv4 and ipv6
struct ipv4_data_t {
    u64 ts_us;
    u32 pid;
    u32 saddr;
    u32 daddr;
    u64 ports;
    u64 rx_b;
    u64 tx_b;
    u64 span_us;
    char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv4_events);

struct ipv6_data_t {
    u64 ts_us;
    u32 pid;
    unsigned __int128 saddr;
    unsigned __int128 daddr;
    u64 ports;
    u64 rx_b;
    u64 tx_b;
    u64 span_us;
    char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv6_events);

struct id_t {
    u32 pid;
    char task[TASK_COMM_LEN];
};
BPF_HASH(whoami, struct sock *, struct id_t);
"""
    110 
#
# XXX: The following is temporary code for older kernels, Linux 4.14 and
# older. It uses kprobes to instrument tcp_set_state(). On Linux 4.16 and
# later, the sock:inet_sock_set_state tracepoint should be used instead, as
# is done by the code that follows this. In the distant future (2021?), this
# kprobe code can be removed. This is why there is so much code
# duplication: to make removal easier.
#
# The FILTER_PID, FILTER_LPORT and FILTER_DPORT tokens inside the C text are
# placeholders: the "code substitutions" step further down replaces each one
# with a C if-statement built from the command-line options, or with nothing
# when the corresponding option was not given.
#
# This text is only appended to bpf_text when the sock:inet_sock_set_state
# tracepoint does not exist on the running kernel.
#
bpf_text_kprobe = """
int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
{
    u32 pid = bpf_get_current_pid_tgid() >> 32;

    // lport is either used in a filter here, or later
    u16 lport = sk->__sk_common.skc_num;
    FILTER_LPORT

    // dport is either used in a filter here, or later
    u16 dport = sk->__sk_common.skc_dport;
    dport = ntohs(dport);
    FILTER_DPORT

    /*
     * This tool includes PID and comm context. It's best effort, and may
     * be wrong in some situations. It currently works like this:
     * - record timestamp on any state < TCP_FIN_WAIT1
     * - cache task context on:
     *       TCP_SYN_SENT: tracing from client
     *       TCP_LAST_ACK: client-closed from server
     * - do output on TCP_CLOSE:
     *       fetch task context if cached, or use current task
     */

    // capture birth time
    if (state < TCP_FIN_WAIT1) {
        /*
         * Matching just ESTABLISHED may be sufficient, provided no code-path
         * sets ESTABLISHED without a tcp_set_state() call. Until we know
         * that for sure, match all early states to increase chances a
         * timestamp is set.
         * Note that this needs to be set before the PID filter later on,
         * since the PID isn't reliable for these early stages, so we must
         * save all timestamps and do the PID filter later when we can.
         */
        u64 ts = bpf_ktime_get_ns();
        birth.update(&sk, &ts);
    }

    // record PID & comm on SYN_SENT
    if (state == TCP_SYN_SENT || state == TCP_LAST_ACK) {
        // now we can PID filter, both here and a little later on for CLOSE
        FILTER_PID
        struct id_t me = {.pid = pid};
        bpf_get_current_comm(&me.task, sizeof(me.task));
        whoami.update(&sk, &me);
    }

    if (state != TCP_CLOSE)
        return 0;

    // calculate lifespan
    u64 *tsp, delta_us;
    tsp = birth.lookup(&sk);
    if (tsp == 0) {
        whoami.delete(&sk);     // may not exist
        return 0;               // missed create
    }
    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
    birth.delete(&sk);

    // fetch possible cached data, and filter
    struct id_t *mep;
    mep = whoami.lookup(&sk);
    if (mep != 0)
        pid = mep->pid;
    FILTER_PID

    // get throughput stats. see tcp_get_info().
    u64 rx_b = 0, tx_b = 0, sport = 0;
    struct tcp_sock *tp = (struct tcp_sock *)sk;
    rx_b = tp->bytes_received;
    tx_b = tp->bytes_acked;

    u16 family = sk->__sk_common.skc_family;

    if (family == AF_INET) {
        struct ipv4_data_t data4 = {};
        data4.span_us = delta_us;
        data4.rx_b = rx_b;
        data4.tx_b = tx_b;
        data4.ts_us = bpf_ktime_get_ns() / 1000;
        data4.saddr = sk->__sk_common.skc_rcv_saddr;
        data4.daddr = sk->__sk_common.skc_daddr;
        // a workaround until data4 compiles with separate lport/dport
        data4.pid = pid;
        data4.ports = dport + ((0ULL + lport) << 32);
        if (mep == 0) {
            bpf_get_current_comm(&data4.task, sizeof(data4.task));
        } else {
            bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
        }
        ipv4_events.perf_submit(ctx, &data4, sizeof(data4));

    } else /* 6 */ {
        struct ipv6_data_t data6 = {};
        data6.span_us = delta_us;
        data6.rx_b = rx_b;
        data6.tx_b = tx_b;
        data6.ts_us = bpf_ktime_get_ns() / 1000;
        bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
            sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
        bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
            sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
        // a workaround until data6 compiles with separate lport/dport
        data6.ports = dport + ((0ULL + lport) << 32);
        data6.pid = pid;
        if (mep == 0) {
            bpf_get_current_comm(&data6.task, sizeof(data6.task));
        } else {
            bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
        }
        ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
    }

    if (mep != 0)
        whoami.delete(&sk);

    return 0;
}
"""
    241 
# Tracepoint version of the session-tracking probe, appended to bpf_text when
# the sock:inet_sock_set_state tracepoint exists on the running kernel. The
# new state, ports, address family and addresses are read from the tracepoint
# arguments (args->...), and events whose protocol is not IPPROTO_TCP are
# skipped. The FILTER_PID, FILTER_LPORT and FILTER_DPORT tokens are
# placeholders that the "code substitutions" step further down replaces with
# C if-statements built from the command-line options, or with nothing.
bpf_text_tracepoint = """
TRACEPOINT_PROBE(sock, inet_sock_set_state)
{
    if (args->protocol != IPPROTO_TCP)
        return 0;

    u32 pid = bpf_get_current_pid_tgid() >> 32;
    // sk is mostly used as a UUID, and for two tcp stats:
    struct sock *sk = (struct sock *)args->skaddr;

    // lport is either used in a filter here, or later
    u16 lport = args->sport;
    FILTER_LPORT

    // dport is either used in a filter here, or later
    u16 dport = args->dport;
    FILTER_DPORT

    /*
     * This tool includes PID and comm context. It's best effort, and may
     * be wrong in some situations. It currently works like this:
     * - record timestamp on any state < TCP_FIN_WAIT1
     * - cache task context on:
     *       TCP_SYN_SENT: tracing from client
     *       TCP_LAST_ACK: client-closed from server
     * - do output on TCP_CLOSE:
     *       fetch task context if cached, or use current task
     */

    // capture birth time
    if (args->newstate < TCP_FIN_WAIT1) {
        /*
         * Matching just ESTABLISHED may be sufficient, provided no code-path
         * sets ESTABLISHED without a tcp_set_state() call. Until we know
         * that for sure, match all early states to increase chances a
         * timestamp is set.
         * Note that this needs to be set before the PID filter later on,
         * since the PID isn't reliable for these early stages, so we must
         * save all timestamps and do the PID filter later when we can.
         */
        u64 ts = bpf_ktime_get_ns();
        birth.update(&sk, &ts);
    }

    // record PID & comm on SYN_SENT
    if (args->newstate == TCP_SYN_SENT || args->newstate == TCP_LAST_ACK) {
        // now we can PID filter, both here and a little later on for CLOSE
        FILTER_PID
        struct id_t me = {.pid = pid};
        bpf_get_current_comm(&me.task, sizeof(me.task));
        whoami.update(&sk, &me);
    }

    if (args->newstate != TCP_CLOSE)
        return 0;

    // calculate lifespan
    u64 *tsp, delta_us;
    tsp = birth.lookup(&sk);
    if (tsp == 0) {
        whoami.delete(&sk);     // may not exist
        return 0;               // missed create
    }
    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
    birth.delete(&sk);

    // fetch possible cached data, and filter
    struct id_t *mep;
    mep = whoami.lookup(&sk);
    if (mep != 0)
        pid = mep->pid;
    FILTER_PID

    // get throughput stats. see tcp_get_info().
    u64 rx_b = 0, tx_b = 0, sport = 0;
    struct tcp_sock *tp = (struct tcp_sock *)sk;
    rx_b = tp->bytes_received;
    tx_b = tp->bytes_acked;

    if (args->family == AF_INET) {
        struct ipv4_data_t data4 = {};
        data4.span_us = delta_us;
        data4.rx_b = rx_b;
        data4.tx_b = tx_b;
        data4.ts_us = bpf_ktime_get_ns() / 1000;
        __builtin_memcpy(&data4.saddr, args->saddr, sizeof(data4.saddr));
        __builtin_memcpy(&data4.daddr, args->daddr, sizeof(data4.daddr));
        // a workaround until data4 compiles with separate lport/dport
        data4.ports = dport + ((0ULL + lport) << 32);
        data4.pid = pid;

        if (mep == 0) {
            bpf_get_current_comm(&data4.task, sizeof(data4.task));
        } else {
            bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
        }
        ipv4_events.perf_submit(args, &data4, sizeof(data4));

    } else /* 6 */ {
        struct ipv6_data_t data6 = {};
        data6.span_us = delta_us;
        data6.rx_b = rx_b;
        data6.tx_b = tx_b;
        data6.ts_us = bpf_ktime_get_ns() / 1000;
        __builtin_memcpy(&data6.saddr, args->saddr_v6, sizeof(data6.saddr));
        __builtin_memcpy(&data6.daddr, args->daddr_v6, sizeof(data6.daddr));
        // a workaround until data6 compiles with separate lport/dport
        data6.ports = dport + ((0ULL + lport) << 32);
        data6.pid = pid;
        if (mep == 0) {
            bpf_get_current_comm(&data6.task, sizeof(data6.task));
        } else {
            bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
        }
        ipv6_events.perf_submit(args, &data6, sizeof(data6));
    }

    if (mep != 0)
        whoami.delete(&sk);

    return 0;
}
"""
    365 
    366 if (BPF.tracepoint_exists("sock", "inet_sock_set_state")):
    367     bpf_text += bpf_text_tracepoint
    368 else:
    369     bpf_text += bpf_text_kprobe
    370 
    371 # code substitutions
    372 if args.pid:
    373     bpf_text = bpf_text.replace('FILTER_PID',
    374         'if (pid != %s) { return 0; }' % args.pid)
    375 if args.remoteport:
    376     dports = [int(dport) for dport in args.remoteport.split(',')]
    377     dports_if = ' && '.join(['dport != %d' % dport for dport in dports])
    378     bpf_text = bpf_text.replace('FILTER_DPORT',
    379         'if (%s) { birth.delete(&sk); return 0; }' % dports_if)
    380 if args.localport:
    381     lports = [int(lport) for lport in args.localport.split(',')]
    382     lports_if = ' && '.join(['lport != %d' % lport for lport in lports])
    383     bpf_text = bpf_text.replace('FILTER_LPORT',
    384         'if (%s) { birth.delete(&sk); return 0; }' % lports_if)
    385 bpf_text = bpf_text.replace('FILTER_PID', '')
    386 bpf_text = bpf_text.replace('FILTER_DPORT', '')
    387 bpf_text = bpf_text.replace('FILTER_LPORT', '')
    388 
    389 if debug or args.ebpf:
    390     print(bpf_text)
    391     if args.ebpf:
    392         exit()
    393 
    394 # event data
    395 TASK_COMM_LEN = 16      # linux/sched.h
    396 
    397 class Data_ipv4(ct.Structure):
    398     _fields_ = [
    399         ("ts_us", ct.c_ulonglong),
    400         ("pid", ct.c_uint),
    401         ("saddr", ct.c_uint),
    402         ("daddr", ct.c_uint),
    403         ("ports", ct.c_ulonglong),
    404         ("rx_b", ct.c_ulonglong),
    405         ("tx_b", ct.c_ulonglong),
    406         ("span_us", ct.c_ulonglong),
    407         ("task", ct.c_char * TASK_COMM_LEN)
    408     ]
    409 
    410 class Data_ipv6(ct.Structure):
    411     _fields_ = [
    412         ("ts_us", ct.c_ulonglong),
    413         ("pid", ct.c_uint),
    414         ("saddr", (ct.c_ulonglong * 2)),
    415         ("daddr", (ct.c_ulonglong * 2)),
    416         ("ports", ct.c_ulonglong),
    417         ("rx_b", ct.c_ulonglong),
    418         ("tx_b", ct.c_ulonglong),
    419         ("span_us", ct.c_ulonglong),
    420         ("task", ct.c_char * TASK_COMM_LEN)
    421     ]
    422 
    423 #
    424 # Setup output formats
    425 #
    426 # Don't change the default output (next 2 lines): this fits in 80 chars. I
    427 # know it doesn't have NS or UIDs etc. I know. If you really, really, really
    428 # need to add columns, columns that solve real actual problems, I'd start by
    429 # adding an extended mode (-x) to included those columns.
    430 #
    431 header_string = "%-5s %-10.10s %s%-15s %-5s %-15s %-5s %5s %5s %s"
    432 format_string = "%-5d %-10.10s %s%-15s %-5d %-15s %-5d %5d %5d %.2f"
    433 if args.wide:
    434     header_string = "%-5s %-16.16s %-2s %-26s %-5s %-26s %-5s %6s %6s %s"
    435     format_string = "%-5d %-16.16s %-2s %-26s %-5s %-26s %-5d %6d %6d %.2f"
    436 if args.csv:
    437     header_string = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"
    438     format_string = "%d,%s,%s,%s,%s,%s,%d,%d,%d,%.2f"
    439 
    440 # process event
    441 def print_ipv4_event(cpu, data, size):
    442     event = ct.cast(data, ct.POINTER(Data_ipv4)).contents
    443     global start_ts
    444     if args.time:
    445         if args.csv:
    446             print("%s," % strftime("%H:%M:%S"), end="")
    447         else:
    448             print("%-8s " % strftime("%H:%M:%S"), end="")
    449     if args.timestamp:
    450         if start_ts == 0:
    451             start_ts = event.ts_us
    452         delta_s = (float(event.ts_us) - start_ts) / 1000000
    453         if args.csv:
    454             print("%.6f," % delta_s, end="")
    455         else:
    456             print("%-9.6f " % delta_s, end="")
    457     print(format_string % (event.pid, event.task.decode('utf-8', 'replace'),
    458         "4" if args.wide or args.csv else "",
    459         inet_ntop(AF_INET, pack("I", event.saddr)), event.ports >> 32,
    460         inet_ntop(AF_INET, pack("I", event.daddr)), event.ports & 0xffffffff,
    461         event.tx_b / 1024, event.rx_b / 1024, float(event.span_us) / 1000))
    462 
    463 def print_ipv6_event(cpu, data, size):
    464     event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
    465     global start_ts
    466     if args.time:
    467         if args.csv:
    468             print("%s," % strftime("%H:%M:%S"), end="")
    469         else:
    470             print("%-8s " % strftime("%H:%M:%S"), end="")
    471     if args.timestamp:
    472         if start_ts == 0:
    473             start_ts = event.ts_us
    474         delta_s = (float(event.ts_us) - start_ts) / 1000000
    475         if args.csv:
    476             print("%.6f," % delta_s, end="")
    477         else:
    478             print("%-9.6f " % delta_s, end="")
    479     print(format_string % (event.pid, event.task.decode('utf-8', 'replace'),
    480         "6" if args.wide or args.csv else "",
    481         inet_ntop(AF_INET6, event.saddr), event.ports >> 32,
    482         inet_ntop(AF_INET6, event.daddr), event.ports & 0xffffffff,
    483         event.tx_b / 1024, event.rx_b / 1024, float(event.span_us) / 1000))
    484 
    485 # initialize BPF
    486 b = BPF(text=bpf_text)
    487 
    488 # header
    489 if args.time:
    490     if args.csv:
    491         print("%s," % ("TIME"), end="")
    492     else:
    493         print("%-8s " % ("TIME"), end="")
    494 if args.timestamp:
    495     if args.csv:
    496         print("%s," % ("TIME(s)"), end="")
    497     else:
    498         print("%-9s " % ("TIME(s)"), end="")
    499 print(header_string % ("PID", "COMM",
    500     "IP" if args.wide or args.csv else "", "LADDR",
    501     "LPORT", "RADDR", "RPORT", "TX_KB", "RX_KB", "MS"))
    502 
    503 start_ts = 0
    504 
    505 # read events
    506 b["ipv4_events"].open_perf_buffer(print_ipv4_event, page_cnt=64)
    507 b["ipv6_events"].open_perf_buffer(print_ipv6_event, page_cnt=64)
    508 while 1:
    509     b.perf_buffer_poll()
    510