Home | History | Annotate | Download | only in tools
      1 #!/usr/bin/env bcc-lua
      2 --[[
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7 http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 
     15 18-Mar-2017  Simon Liu Created this.
     16 --]]
     17 
     18 local ffi = require("ffi")
     19 local bit = require("bit")
     20 
     21 ffi.cdef[[
     22 const char *inet_ntop(int af, const void *src, char *dst, int size);
     23 uint16_t ntohs(uint16_t netshort);
     24 ]]
     25 
     26 local program = [[
     27 #include <uapi/linux/ptrace.h>
     28 #define KBUILD_MODNAME "foo"
     29 #include <linux/tcp.h>
     30 #include <net/sock.h>
     31 #include <bcc/proto.h>
     32 
     33 BPF_HASH(birth, struct sock *, u64);
     34 
     35 // separate data structs for ipv4 and ipv6
     36 struct ipv4_data_t {
     37     // XXX: switch some to u32's when supported
     38     u64 ts_us;
     39     u64 pid;
     40     u64 saddr;
     41     u64 daddr;
     42     u64 ports;
     43     u64 rx_b;
     44     u64 tx_b;
     45     u64 span_us;
     46     char task[TASK_COMM_LEN];
     47 };
     48 BPF_PERF_OUTPUT(ipv4_events);
     49 
     50 struct ipv6_data_t {
     51     u64 ts_us;
     52     u64 pid;
     53     u64 saddr[2];
     54     u64 daddr[2];
     55     u64 ports;
     56     u64 rx_b;
     57     u64 tx_b;
     58     u64 span_us;
     59     char task[TASK_COMM_LEN];
     60 };
     61 BPF_PERF_OUTPUT(ipv6_events);
     62 
     63 struct id_t {
     64     u32 pid;
     65     char task[TASK_COMM_LEN];
     66 };
     67 BPF_HASH(whoami, struct sock *, struct id_t);
     68 
     69 int trace_tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
     70 {
     71     bpf_trace_printk("tcp_set_stat");
     72     u32 pid = bpf_get_current_pid_tgid() >> 32;
     73 
     74     // lport is either used in a filter here, or later
     75     u16 lport = sk->__sk_common.skc_num;
     76     FILTER_LPORT
     77 
     78     // dport is either used in a filter here, or later
     79     u16 dport = sk->__sk_common.skc_dport;
     80     FILTER_DPORT
     81 
     82     /*
     83      * This tool includes PID and comm context. It's best effort, and may
     84      * be wrong in some situations. It currently works like this:
     85      * - record timestamp on any state < TCP_FIN_WAIT1
     86      * - cache task context on:
     87      *       TCP_SYN_SENT: tracing from client
     88      *       TCP_LAST_ACK: client-closed from server
     89      * - do output on TCP_CLOSE:
     90      *       fetch task context if cached, or use current task
     91      */
     92 
     93     // capture birth time
     94     if (state < TCP_FIN_WAIT1) {
     95         /*
     96          * Matching just ESTABLISHED may be sufficient, provided no code-path
     97          * sets ESTABLISHED without a tcp_set_state() call. Until we know
     98          * that for sure, match all early states to increase chances a
     99          * timestamp is set.
    100          * Note that this needs to be set before the PID filter later on,
    101          * since the PID isn't reliable for these early stages, so we must
    102          * save all timestamps and do the PID filter later when we can.
    103          */
    104         u64 ts = bpf_ktime_get_ns();
    105         birth.update(&sk, &ts);
    106     }
    107 
    108     // record PID & comm on SYN_SENT
    109     if (state == TCP_SYN_SENT || state == TCP_LAST_ACK) {
    110         // now we can PID filter, both here and a little later on for CLOSE
    111         FILTER_PID
    112         struct id_t me = {.pid = pid};
    113         bpf_get_current_comm(&me.task, sizeof(me.task));
    114         whoami.update(&sk, &me);
    115     }
    116 
    117     if (state != TCP_CLOSE)
    118         return 0;
    119 
    120     // calculate lifespan
    121     u64 *tsp, delta_us;
    122     tsp = birth.lookup(&sk);
    123     if (tsp == 0) {
    124         whoami.delete(&sk);     // may not exist
    125         return 0;               // missed create
    126     }
    127     delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
    128     birth.delete(&sk);
    129 
    130     // fetch possible cached data, and filter
    131     struct id_t *mep;
    132     mep = whoami.lookup(&sk);
    133     if (mep != 0)
    134         pid = mep->pid;
    135     FILTER_PID
    136 
    137     // get throughput stats. see tcp_get_info().
    138     u64 rx_b = 0, tx_b = 0, sport = 0;
    139     struct tcp_sock *tp = (struct tcp_sock *)sk;
    140     rx_b = tp->bytes_received;
    141     tx_b = tp->bytes_acked;
    142 
    143     u16 family = sk->__sk_common.skc_family;
    144 
    145     if (family == AF_INET) {
    146         struct ipv4_data_t data4 = {.span_us = delta_us,
    147             .rx_b = rx_b, .tx_b = tx_b};
    148         data4.ts_us = bpf_ktime_get_ns() / 1000;
    149         data4.saddr = sk->__sk_common.skc_rcv_saddr;
    150         data4.daddr = sk->__sk_common.skc_daddr;
    151         // a workaround until data4 compiles with separate lport/dport
    152         data4.pid = pid;
    153         data4.ports = ntohs(dport) + ((0ULL + lport) << 32);
    154         if (mep == 0) {
    155             bpf_get_current_comm(&data4.task, sizeof(data4.task));
    156         } else {
    157             bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
    158         }
    159         ipv4_events.perf_submit(ctx, &data4, sizeof(data4));
    160 
    161     } else /* 6 */ {
    162         struct ipv6_data_t data6 = {.span_us = delta_us,
    163             .rx_b = rx_b, .tx_b = tx_b};
    164         data6.ts_us = bpf_ktime_get_ns() / 1000;
    165         bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
    166             sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
    167         bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
    168             sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
    169         // a workaround until data6 compiles with separate lport/dport
    170         data6.ports = ntohs(dport) + ((0ULL + lport) << 32);
    171         data6.pid = pid;
    172         if (mep == 0) {
    173             bpf_get_current_comm(&data6.task, sizeof(data6.task));
    174         } else {
    175             bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
    176         }
    177         ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
    178     }
    179 
    180     if (mep != 0)
    181         whoami.delete(&sk);
    182 
    183     return 0;
    184 }
    185 ]]
    186 
    187 local debug = false
    188 local start_ts = 0
    189 
    190 local inet_addresslen = #"255.255.255.255"
    191 local inet6_addresslen = #"ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"
    192 local AF_INET = 2
    193 local AF_INET6 = 10
    194 
    195 local header_string = "%-5s %-10.10s %s%-15s %-5s %-15s %-5s %5s %5s %s"
    196 local format_string = "%-5d %-10.10s %s%-15s %-5d %-15s %-5d %5d %5d %.2f"
    197 local ip_string = ""
    198 local ip_version = false
    199 local arg_timestamp = false
    200 local arg_csv = false
    201 local arg_time = false
    202 
    203 local examples = [[examples:
    204     ./tcplife           # trace all TCP connect()s
    205     ./tcplife -t        # include time column (HH:MM:SS)
    206     ./tcplife -w        # wider colums (fit IPv6)
    207     ./tcplife -stT      # csv output, with times & timestamps
    208     ./tcplife -p 181    # only trace PID 181
    209     ./tcplife -L 80     # only trace local port 80
    210     ./tcplife -L 80,81  # only trace local ports 80 and 81
    211     ./tcplife -D 80     # only trace remote port 80
    212 ]]
    213 
    214 local function split(str,sep)
    215    local t = {}
    216    for w in string.gmatch(str, '([^,]+)') do
    217       table.insert(t, w)
    218    end
    219    return t
    220 end
    221 
    222 local function inet_ntop(af, addr, len)
    223    local addr_dst = ffi.new("char[?]", len)
    224    local addr_src
    225    if af == AF_INET then
    226       addr_src = ffi.new("uint64_t[1]", addr)
    227    else
    228       addr_src = ffi.new("uint64_t[2]", addr)
    229    end
    230    ffi.C.inet_ntop(af, addr_src, addr_dst, len)
    231    return ffi.string(addr_dst, len)
    232 end
    233 
    234 local function inet_ntohs(port)
    235    local p = tonumber(port)
    236    return ffi.C.ntohs(p)
    237 end
    238 
    239 local function print_ipv4_event(cpu, event)
    240 
    241    local event_pid = tonumber(event.pid)
    242    local event_task = ffi.string(event.task)
    243    local event_ports = tonumber(event.ports)
    244    local event_tx_b = tonumber(event.tx_b)
    245    local event_rx_b = tonumber(event.rx_b)
    246    local event_span_us = tonumber(event.span_us)
    247    local event_ts_us = tonumber(event.ts_us)
    248    local event_saddr = inet_ntop(AF_INET, tonumber(event.saddr), inet_addresslen)
    249    local event_daddr = inet_ntop(AF_INET, tonumber(event.daddr), inet_addresslen)
    250    if arg_time then
    251       if arg_csv then
    252          io.write("%s," % os.date("%H:%M:%S"))
    253       else
    254          io.write("%-8s " % os.date("%H:%M:%S"))
    255       end
    256    end
    257    if arg_timestamp then
    258       if start_ts == 0 then
    259          start_ts = event_ts_us
    260       end
    261       local delta_s = (event_ts_us - start_ts) / 1000000
    262       if arg.csv then
    263          io.write("%.6f," % delta_s)
    264       else
    265          io.write("%-9.6f " % delta_s)
    266       end
    267    end
    268    local iv = ""
    269    if ip_version then
    270       iv = "4"
    271    end
    272    print(string.format(format_string, event_pid, event_task, iv,
    273                        event_saddr, bit.rshift(event_ports,32),
    274                        event_daddr, bit.band(event_ports,0xffffffff),
    275                        (event_tx_b / 1024), (event_rx_b / 1024), event_span_us/ 1000))
    276 end
    277 
    278 
    279 local function print_ipv6_event(cpu, event)
    280    local event_pid = tonumber(event.pid)
    281    local event_task = ffi.string(event.task)
    282    local event_ports = tonumber(event.ports)
    283    local event_tx_b = tonumber(event.tx_b)
    284    local event_rx_b = tonumber(event.rx_b)
    285    local event_span_us = tonumber(event.span_us)
    286    local event_ts_us = tonumber(event.ts_us)
    287    local event_saddr = inet_ntop(AF_INET6, {tonumber(event.saddr[0]), tonumber(event.saddr[1])}, inet6_addresslen)
    288    local event_daddr = inet_ntop(AF_INET6, {tonumber(event.daddr[0]), tonumber(event.daddr[1])}, inet6_addresslen)
    289    if arg_time then
    290       if arg_csv then
    291          io.write("%s," % os.date("%H:%M:%S"))
    292       else
    293          io.write("%-8s " % os.date("%H:%M:%S"))
    294       end
    295    end
    296    if arg_timestamp then
    297       if start_ts == 0 then
    298          start_ts = event_ts_us
    299       end
    300       local delta_s = (event_ts_us - start_ts) / 1000000
    301       if arg.csv then
    302          io.write("%.6f," % delta_s)
    303       else
    304          io.write("%-9.6f " % delta_s)
    305       end
    306    end
    307    local iv = ""
    308    if ip_version then
    309       iv = "6"
    310    end
    311    print(string.format(format_string, event_pid, event_task, iv,
    312                        event_saddr, bit.rshift(event_ports,32),
    313                        event_daddr, bit.band(event_ports,0xffffffff),
    314                        (event_tx_b / 1024), (event_rx_b / 1024), event_span_us/ 1000))
    315 end
    316 
    317 local function parse_arg(utils)
    318    local parser = utils.argparse("tcplife",
    319                                  "Trace the lifespan of TCP sessions and summarize", examples)
    320 
    321    parser:flag("-T --time", "include time column on output (HH:MM:SS)")
    322    parser:flag("-t --timestamp", "include timestamp on output (seconds)")
    323    parser:flag("-w --wide", "wide column output (fits IPv6 addresses)")
    324    parser:flag("-s --csv", "comma separated values output")
    325    parser:option("-p --pid", "trace this PID only"):convert(tonumber)
    326    parser:option("-L --localport", "comma-separated list of local ports to trace.")
    327    parser:option("-D --remoteport", "comma-separated list of remote ports to trace.")
    328 
    329    local args = parser:parse()
    330    if args.pid then
    331       local filter = 'if (pid != %d) { return 0; }' % args.pid
    332       program = program.gsub('FILTER_PID', filter)
    333    end
    334 
    335    if args.remoteport then
    336       local dports = split(args.remoteport, ",")
    337       local dports_if = ""
    338       for i,d in ipairs(dports) do
    339          if dports_if == "" then
    340             dports_if = 'dport != %d' % inet_ntohs(d)
    341          else
    342             dports_if = dports_if .. ' && ' .. ('dport != %d' % inet_ntohs(d))
    343          end
    344       end
    345       local filter = "if (%s) { birth.delete(&sk); return 0; }" % dports_if
    346       program = program:gsub('FILTER_DPORT', filter)
    347    end
    348    if args.localport then
    349       local lports = split(args.localport,",")
    350       local lports_if = ""
    351       for i,l in ipairs(lports) do
    352          if lports_if == "" then
    353             lports_if = 'lport != %d' % inet_ntohs(l)
    354          else
    355             lports_if = lports_if .. ' && ' .. ('lport != %d' % inet_ntohs(l))
    356          end
    357       end
    358       local filter = "if (%s) { birth.delete(&sk); return 0; }" % lports_if
    359       program = program:gsub('FILTER_LPORT', filter)
    360    end
    361    program = program:gsub('FILTER_PID', '')
    362    program = program:gsub('FILTER_DPORT', '')
    363    program = program:gsub('FILTER_LPORT', '')
    364 
    365    if args.wide then
    366       header_string = "%-5s %-16.16s %-2s %-26s %-5s %-26s %-5s %6s %6s %s"
    367       format_string = "%-5d %-16.16s %-2s %-26s %-5s %-26s %-5d %6d %6d %.2f"
    368       ip_string = "IP"
    369       ip_version = true
    370    end
    371    if args.csv then
    372       header_string = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"
    373       format_string = "%d,%s,%s,%s,%s,%s,%d,%d,%d,%.2f"
    374       ip_string = "IP"
    375       ip_version = true
    376       arg_csv = true
    377    end
    378 
    379    if args.time then
    380       arg_time = true
    381       if args.csv then
    382          io.write("%s," % ("TIME"))
    383       else
    384          io.write("%-8s " % ("TIME"))
    385       end
    386    end
    387 
    388    if args.timestamp then
    389       arg_timestamp = true
    390       if args.csv then
    391          io.write("%s," % ("TIME(s)"))
    392       else
    393          io.write("%-9s " % ("TIME(s)"))
    394       end
    395    end
    396 
    397 end
    398 
    399 return function(BPF, utils)
    400    parse_arg(utils)
    401    if debug then
    402       print(program)
    403    end
    404 
    405    local bpf = BPF:new{text=program}
    406    bpf:attach_kprobe{event="tcp_set_state", fn_name="trace_tcp_set_state"}
    407    print(header_string % {"PID", "COMM",
    408                           ip_string, "LADDR",
    409                           "LPORT", "RADDR", "RPORT", "TX_KB", "RX_KB", "MS"})
    410    local TASK_COMM_LEN = 16 -- linux/sched.h
    411    bpf:get_table("ipv4_events"):open_perf_buffer(print_ipv4_event, [[
    412     struct {
    413       uint64_t ts_us;
    414       uint64_t pid;
    415       uint64_t saddr;
    416       uint64_t daddr;
    417       uint64_t ports;
    418       uint64_t rx_b;
    419       uint64_t tx_b;
    420       uint64_t span_us;
    421       char task[$];
    422     }
    423    ]], {TASK_COMM_LEN}, 64)
    424    bpf:get_table("ipv6_events"):open_perf_buffer(print_ipv6_event, [[
    425     struct {
    426       uint64_t ts_us;
    427       uint64_t pid;
    428       uint64_t saddr[2];
    429       uint64_t daddr[2];
    430       uint64_t ports;
    431       uint64_t rx_b;
    432       uint64_t tx_b;
    433       uint64_t span_us;
    434       char task[$];
    435     }
    436    ]], {TASK_COMM_LEN}, 64)
    437 
    438    bpf:perf_buffer_poll_loop()
    439 end
    440