#!/usr/bin/python # # lockstat Trace and display lock contention stats # # USAGE: lockstat # Licensed under the Apache License, Version 2.0 (the "License") # 28-Jul-2017 Gisle Dankel Created this. from bcc import BPF from ctypes import c_int from time import sleep from datetime import datetime import argparse import subprocess import os # One Lock object per TGID and uaddr. class Lock(object): def __init__(self): self.contention_count = 0 self.elapsed_blocked = 0 self.thread_count = 0 self.last_stack_syms = [] def update(self, count, block_time, last_stack_syms): self.contention_count += count self.elapsed_blocked += block_time self.thread_count += 1 self.last_stack_syms = last_stack_syms def run_command_get_pid(command): p = subprocess.Popen(command.split()) return p.pid examples = """ EXAMPLES: ./lockstat Trace calls to sys_futex and display contented locks every 5 seconds for all processes running on the system ./lockstat -p <pid> Trace only for the specified pid and display contended locks every 5 seconds ./lockstat -p <pid> -t Trace for a specified pid and print a message on each entry and exit to sys_futex until interrupted or killed ./lockstat -p <pid> 10 Trace the specified pid and show a message every 10 seconds ./lockstat -c <command> 1 30 Run the specified command and display contended locks every 1 second for a total of 30 times """ description = """ Trace kernel futex events. These often occur because of lock contention, e.g. involving a pthread_mutex. This script resemblers the following SystemTap example: https://sourceware.org/systemtap/SystemTap_Beginners_Guide/futexcontentionsect.html """ parser = argparse.ArgumentParser(description=description, formatter_class=argparse.RawDescriptionHelpFormatter, epilog=examples) parser.add_argument("-p", "--pid", type=int, default=-1, help="the PID to trace; if not specified, trace all") parser.add_argument("-t", "--trace", action="store_true", help="print trace messages for each futex enter/exit") parser.add_argument("interval", nargs="?", default=5, type=int, help="interval in seconds to print summary") parser.add_argument("count", nargs="?", type=int, help="number of times to print the report before exiting") parser.add_argument("-c", "--command", help="execute and trace the specified command") args = parser.parse_args() pid = args.pid command = args.command interval = args.interval num_prints = args.count trace_all = args.trace if command is not None: print("Executing '%s' and tracing the resulting process." % command) pid = run_command_get_pid(command) bpf_source = """ #include <uapi/linux/futex.h> #include <uapi/linux/ptrace.h> #include <linux/sched.h> #include <linux/time.h> struct comm_t { char name[TASK_COMM_LEN]; }; struct lock_key_t { u64 uaddr; u32 pid; u32 tgid; }; struct lock_info_t { u64 elapsed_blocked; u64 contention_count; u64 sid; }; BPF_HASH(pid_lock, u32, u64); BPF_HASH(pid_blocktime, u32, u64); BPF_HASH(tgid_comm, u32, struct comm_t); BPF_HASH(lock_stats, struct lock_key_t, struct lock_info_t, 1000000); BPF_STACK_TRACE(stack_traces, 16384); static inline int update_stats(u32 pid, u32 tgid, u64 uaddr, u64 block_time, u64 sid) { struct lock_key_t key = {}; struct lock_info_t zero = {}; struct lock_info_t *info; key.pid = pid; key.tgid = tgid; key.uaddr = uaddr; info = lock_stats.lookup_or_init(&key, &zero); info->elapsed_blocked += block_time; info->contention_count++; info->sid = sid; if (0 == tgid_comm.lookup(&tgid)) { struct comm_t comm; bpf_get_current_comm(&comm.name, sizeof(comm.name)); tgid_comm.update(&tgid, &comm); } return 0; } // FIXME: Should attach to sys_enter_futex and sys_exit_futex tracepoints here, // but that does not currently work int sys_futex_enter(struct pt_regs *ctx, u32 *uaddr, int op, u32 val, struct timespec *utime, u32 *uaddr2, u32 val3) { int cmd = op & FUTEX_CMD_MASK; if (cmd != FUTEX_WAIT) return 0; u64 pid_tgid = bpf_get_current_pid_tgid(); u32 pid = pid_tgid; u32 tgid = pid_tgid >> 32; if (!(THREAD_FILTER)) return 0; u64 timestamp = bpf_ktime_get_ns(); u64 uaddr64 = (u64) uaddr; pid_lock.update(&pid, &uaddr64); pid_blocktime.update(&pid, ×tamp); if (SHOULD_PRINT) bpf_trace_printk("enter sys_futex, pid = %u, uaddr = %x, " "cmd = %u\\n", pid, uaddr64, cmd); return 0; } int sys_futex_exit(struct pt_regs *ctx) { u64 pid_tgid = bpf_get_current_pid_tgid(); u32 pid = pid_tgid; u32 tgid = pid_tgid >> 32; if (!(THREAD_FILTER)) return 0; u64 *blocktime = pid_blocktime.lookup(&pid); u64 *uaddr = pid_lock.lookup(&pid); u64 timestamp = bpf_ktime_get_ns(); u64 elapsed; u64 sid; if (blocktime == 0 || uaddr == 0) return 0; // not FUTEX_WAIT, or (less likely) missed futex_enter elapsed = timestamp - *blocktime; sid = stack_traces.get_stackid(ctx, BPF_F_USER_STACK); update_stats(pid, tgid, *uaddr, elapsed, sid); pid_lock.delete(&pid); pid_blocktime.delete(&pid); if (SHOULD_PRINT) { bpf_trace_printk("exit sys_futex, uaddr = %x, elapsed = %uns\\n", uaddr == 0 ? 0 : *uaddr, elapsed); } return 0; } """ bpf_source = bpf_source.replace("SHOULD_PRINT", "1" if trace_all else "0") thread_filter = '1' if pid != -1: print("Tracing pid %d, Ctrl+C to quit." % pid) # 'tgid' in kernel space is what people thin of as 'pid' in userspace thread_filter = "tgid == %d" % pid else: print("Tracing all processes, Ctrl+C to quit.") bpf_source = bpf_source.replace("THREAD_FILTER", thread_filter) bpf_program = BPF(text=bpf_source) bpf_program.attach_kprobe(event="SyS_futex", fn_name="sys_futex_enter") bpf_program.attach_kretprobe(event="SyS_futex", fn_name="sys_futex_exit") def get_syms(stack, pid): global bpf_program syms = [] for addr in stack: s = bpf_program.sym(addr, pid, show_offset=True) syms.append(s) return syms def print_syms(syms): print("=========") for f in syms: print(f) print("=========") def is_android_monitor_lock(syms): for s in syms: if 'art::Monitor::Lock' in s: return True return False def disp_stack(stack, pid): for addr in stack: s = bpf_program.sym(addr, pid, show_offset=True) def create_tgid_stats(): global bpf_program stats = bpf_program["lock_stats"] res = {} stack_traces = bpf_program['stack_traces'] for key, val in stats.items(): # Only display Android monitor locks if val.sid >= 0: ust = stack_traces.walk(val.sid) syms = get_syms(ust, key.pid) if not is_android_monitor_lock(syms): continue else: continue if not key.tgid in res: res[key.tgid] = {} if not key.uaddr in res[key.tgid]: res[key.tgid][key.uaddr] = Lock() lock = res[key.tgid][key.uaddr] lock.update(val.contention_count, val.elapsed_blocked, syms) return res def print_comm_stats(stats): if stats == {}: return comms = bpf_program["tgid_comm"] print("\n%s:" % (datetime.now().strftime("%H:%M:%S"))) for tgid, locks in stats.items(): comm = comms[c_int(tgid)].name print("\n %s (%d):" % (comm, tgid)) sorted_locks = sorted(locks.items(), key=lambda x: x[1].elapsed_blocked, reverse=True) for addr, stats in sorted_locks: print(" %x: %dms (%d contentions involving %d threads, avg %dus)" % (addr, stats.elapsed_blocked / 1000000, stats.contention_count, stats.thread_count, stats.elapsed_blocked / stats.contention_count / 1000)) # No use of displaying lock stacks since we're only # filtering for Android monitor locks. # # print("Last stack for this lock:") # print_syms(stats.last_stack_syms) count_so_far = 0 while True: if trace_all: print(bpf_program.trace_fields()) else: try: sleep(interval) except KeyboardInterrupt: exit() print_comm_stats(create_tgid_stats()) count_so_far += 1 bpf_program['tgid_comm'].clear() bpf_program['lock_stats'].clear() bpf_program['pid_lock'].clear() bpf_program['pid_blocktime'].clear() if num_prints is not None and count_so_far >= num_prints: exit()