--[[ Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com> Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ]] -- LuaJIT to BPF bytecode compiler. -- -- The code generation phase is currently one-pass and produces: -- * Compiled code in BPF bytecode format (https://www.kernel.org/doc/Documentation/networking/filter.txt) -- * Variables with liveness analysis and other meta (spill information, compile-time value) -- -- The code generator optimises as much as possible in single pass: -- * Fold compile-time expressions and constant propagation -- * Basic control flow analysis with dead code elimination (based on compile-time expressions) -- * Single-pass optimistic register allocation -- -- The first pass doesn't have variable lifetime visibility yet, so it relies on rewriter for further -- optimisations such as: -- * Dead store elimination (first-pass doesn't know if/when the variable is going to be used) -- * Common sub-expression elimination (relies on DCE and liveness analysis) -- * Orphan JMP elimination (removing this in first pass would break previous JMP targets) -- * Better register allocation (needs to be recomputed after optimisations) local ffi = require('ffi') local bit = require('bit') local S = require('syscall') local bytecode = require('bpf.ljbytecode') local cdef = require('bpf.cdef') local proto = require('bpf.proto') local builtins = require('bpf.builtins') -- Constants local ALWAYS, NEVER = -1, -2 local BPF = ffi.typeof('struct bpf') local HELPER = ffi.typeof('struct bpf_func_id') -- Symbolic table of constant expressions over numbers local const_expr = { ADD = function (a, b) return a + b end, SUB = function (a, b) return a - b end, DIV = function (a, b) return a / b end, MOD = function (a, b) return a % b end, JEQ = function (a, b) return a == b end, JNE = function (a, b) return a ~= b end, JGE = function (a, b) return a >= b end, JGT = function (a, b) return a > b end, } local const_width = { [1] = BPF.B, [2] = BPF.H, [4] = BPF.W, [8] = BPF.DW, } -- Built-ins that are strict only (never compile-time expandable) local builtins_strict = { [ffi.new] = true, [print] = true, } -- Deep copy a table local function table_copy(t) local copy = {} for n,v in pairs(t) do if type(v) == 'table' then v = table_copy(v) end copy[n] = v end return copy end -- Return true if the constant part is a proxy local function is_proxy(x) return type(x) == 'table' and (x.__dissector or x.__map or x.__base) end -- Create compiler closure local function create_emitter(env, stackslots, params, param_types) local V = {} -- Variable tracking / register allocator local code = { -- Generated code pc = 0, bc_pc = 0, insn = ffi.new('struct bpf_insn[4096]'), fixup = {}, reachable = true, seen_cmp = nil, } local Vstate = {} -- Track variable layout at basic block exits -- Anything below this stack offset is free to use by caller -- @note: There is no tracking memory allocator, so the caller may -- lower it for persistent objects, but such memory will never -- be reclaimed and the caller is responsible for resetting stack -- top whenever the memory below is free to be reused local stack_top = (stackslots + 1) * ffi.sizeof('uint64_t') local function emit(op, dst, src, off, imm) local ins = code.insn[code.pc] ins.code = op ins.dst_reg = dst ins.src_reg = src ins.off = off ins.imm = imm code.pc = code.pc + 1 end local function reg_spill(var) local vinfo = V[var] assert(vinfo.reg, 'attempt to spill VAR that doesn\'t have an allocated register') vinfo.spill = (var + 1) * ffi.sizeof('uint64_t') -- Index by (variable number) * (register width) emit(BPF.MEM + BPF.STX + BPF.DW, 10, vinfo.reg, -vinfo.spill, 0) vinfo.reg = nil end local function reg_fill(var, reg) local vinfo = V[var] assert(reg, 'attempt to fill variable to register but not register is allocated') assert(vinfo.spill, 'attempt to fill register with a VAR that isn\'t spilled') emit(BPF.MEM + BPF.LDX + BPF.DW, reg, 10, -vinfo.spill, 0) vinfo.reg = reg vinfo.spill = nil end -- Allocate a register (lazy simple allocator) local function reg_alloc(var, reg) -- Specific register requested, must spill/move existing variable if reg then for k,v in pairs(V) do -- Spill any variable that has this register if v.reg == reg and not v.shadow then reg_spill(k) break end end return reg end -- Find free or least recently used slot local last, last_seen, used = nil, 0xffff, 0 for k,v in pairs(V) do if v.reg then if not v.live_to or v.live_to < last_seen then last, last_seen = k, v.live_to or last_seen end used = bit.bor(used, bit.lshift(1, v.reg)) end end -- Attempt to select a free register from R7-R9 (callee saved) local free = bit.bnot(used) if bit.band(free, 0x80) ~= 0 then reg = 7 elseif bit.band(free,0x100) ~= 0 then reg = 8 elseif bit.band(free,0x200) ~= 0 then reg = 9 end -- Select another variable to be spilled if not reg then assert(last) reg = V[last].reg reg_spill(last) end assert(reg, 'VAR '..var..'fill/spill failed') return reg end -- Set new variable local function vset(var, reg, const, vtype) -- Must materialise all variables shadowing this variable slot, as it will be overwritten if V[var] and V[var].reg then for _, vinfo in pairs(V) do -- Shadowing variable MUST share the same type and attributes, -- but the register assignment may have changed if vinfo.shadow == var then vinfo.reg = V[var].reg vinfo.shadow = nil end end end -- Get precise type for CDATA or attempt to narrow numeric constant if not vtype and type(const) == 'cdata' then vtype = ffi.typeof(const) end V[var] = {reg=reg, const=const, type=vtype} -- Track variable source if V[var].const and type(const) == 'table' then V[var].source = V[var].const.source end end -- Materialize (or register) a variable in a register -- If the register is nil, then the a new register is assigned (if not already assigned) local function vreg(var, reg, reserve, vtype) local vinfo = V[var] assert(vinfo, 'VAR '..var..' not registered') vinfo.live_to = code.pc-1 if (vinfo.reg and not reg) and not vinfo.shadow then return vinfo.reg end reg = reg_alloc(var, reg) -- Materialize variable shadow copy local src = vinfo while src.shadow do src = V[src.shadow] end if reserve then -- luacheck: ignore -- No load to register occurs elseif src.reg then emit(BPF.ALU64 + BPF.MOV + BPF.X, reg, src.reg, 0, 0) elseif src.spill then vinfo.spill = src.spill reg_fill(var, reg) elseif src.const then vtype = vtype or src.type if type(src.const) == 'table' and src.const.__base then -- Load pointer type emit(BPF.ALU64 + BPF.MOV + BPF.X, reg, 10, 0, 0) emit(BPF.ALU64 + BPF.ADD + BPF.K, reg, 0, 0, -src.const.__base) elseif type(src.const) == 'table' and src.const.__dissector then -- Load dissector offset (imm32), but keep the constant part (dissector proxy) emit(BPF.ALU64 + BPF.MOV + BPF.K, reg, 0, 0, src.const.off or 0) elseif vtype and ffi.sizeof(vtype) == 8 then -- IMM64 must be done in two instructions with imm64 = (lo(imm32), hi(imm32)) emit(BPF.LD + BPF.DW, reg, 0, 0, ffi.cast('uint32_t', src.const)) emit(0, 0, 0, 0, ffi.cast('uint32_t', bit.rshift(bit.rshift(src.const, 16), 16))) vinfo.const = nil -- The variable is live else emit(BPF.ALU64 + BPF.MOV + BPF.K, reg, 0, 0, src.const) vinfo.const = nil -- The variable is live end else assert(false, 'VAR '..var..' has neither register nor constant value') end vinfo.reg = reg vinfo.shadow = nil vinfo.live_from = code.pc-1 vinfo.type = vtype or vinfo.type return reg end -- Copy variable local function vcopy(dst, src) if dst == src then return end V[dst] = {reg=V[src].reg, const=V[src].const, shadow=src, source=V[src].source, type=V[src].type} end -- Dereference variable of pointer type local function vderef(dst_reg, src_reg, vinfo) -- Dereference map pointers for primitive types -- BPF doesn't allow pointer arithmetics, so use the entry value assert(type(vinfo.const) == 'table' and vinfo.const.__dissector, 'cannot dereference a non-pointer variable') local vtype = vinfo.const.__dissector local w = ffi.sizeof(vtype) assert(const_width[w], 'NYI: sizeof('..tostring(vtype)..') not 1/2/4/8 bytes') if dst_reg ~= src_reg then emit(BPF.ALU64 + BPF.MOV + BPF.X, dst_reg, src_reg, 0, 0) -- dst = src end -- Optimize the NULL check away if provably not NULL if not vinfo.source or vinfo.source:find('_or_null', 1, true) then emit(BPF.JMP + BPF.JEQ + BPF.K, src_reg, 0, 1, 0) -- if (src != NULL) end emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, src_reg, 0, 0) -- dst = *src; end -- Allocate a space for variable local function valloc(size, blank) local base = stack_top assert(stack_top + size < 512 * 1024, 'exceeded maximum stack size of 512kB') stack_top = stack_top + size -- Align to 8 byte boundary stack_top = math.ceil(stack_top/8)*8 -- Current kernel version doesn't support ARG_PTR_TO_RAW_STACK -- so we always need to have memory initialized, remove this when supported if blank then if type(blank) == 'string' then local sp = 0 while sp < size do -- TODO: no BPF_ST + BPF_DW instruction yet local as_u32 = ffi.new('uint32_t [1]') local sub = blank:sub(sp+1, sp+ffi.sizeof(as_u32)) ffi.copy(as_u32, sub, #sub) emit(BPF.MEM + BPF.ST + BPF.W, 10, 0, -(stack_top-sp), as_u32[0]) sp = sp + ffi.sizeof(as_u32) end elseif type(blank) == 'boolean' then reg_alloc(stackslots, 0) emit(BPF.ALU64 + BPF.MOV + BPF.K, 0, 0, 0, 0) for sp = base+8,stack_top,8 do emit(BPF.MEM + BPF.STX + BPF.DW, 10, 0, -sp, 0) end else error('NYI: will with unknown type '..type(blank)) end end return stack_top end -- Turn variable into scalar in register (or constant) local function vscalar(a, w) assert(const_width[w], 'sizeof(scalar variable) must be 1/2/4/8') local src_reg -- If source is a pointer, we must dereference it first if cdef.isptr(V[a].type) then src_reg = vreg(a) local tmp_reg = reg_alloc(stackslots, 1) -- Clone variable in tmp register emit(BPF.ALU64 + BPF.MOV + BPF.X, tmp_reg, src_reg, 0, 0) vderef(tmp_reg, tmp_reg, V[a]) src_reg = tmp_reg -- Materialize and dereference it -- Source is a value on stack, we must load it first elseif type(V[a].const) == 'table' and V[a].const.__base > 0 then src_reg = vreg(a) emit(BPF.MEM + BPF.LDX + const_width[w], src_reg, 10, -V[a].const.__base, 0) V[a].type = V[a].const.__dissector V[a].const = nil -- Value is dereferenced -- If source is an imm32 number, avoid register load elseif type(V[a].const) == 'number' and w < 8 then return nil, V[a].const -- Load variable from any other source else src_reg = vreg(a) end return src_reg, nil end -- Emit compensation code at the end of basic block to unify variable set layout on all block exits -- 1. we need to free registers by spilling -- 2. fill registers to match other exits from this BB local function bb_end(Vcomp) for i,v in pairs(V) do if Vcomp[i] and Vcomp[i].spill and not v.spill then -- Materialize constant or shadowing variable to be able to spill if not v.reg and (v.shadow or cdef.isimmconst(v)) then vreg(i) end reg_spill(i) end end for i,v in pairs(V) do if Vcomp[i] and Vcomp[i].reg and not v.reg then vreg(i, Vcomp[i].reg) end -- Compensate variable metadata change if Vcomp[i] and Vcomp[i].source then V[i].source = Vcomp[i].source end end end local function CMP_STR(a, b, op) assert(op == 'JEQ' or op == 'JNE', 'NYI: only equivallence stack/string only supports == or ~=') -- I have no better idea how to implement it than unrolled XOR loop, as we can fixup only one JMP -- So: X(a,b) = a[0] ^ b[0] | a[1] ^ b[1] | ... -- EQ(a,b) <=> X == 0 -- This could be optimised by placing early exits by rewriter in second phase for long strings local base, size = V[a].const.__base, math.min(#b, ffi.sizeof(V[a].type)) local acc, tmp = reg_alloc(stackslots, 0), reg_alloc(stackslots+1, 1) local sp = 0 emit(BPF.ALU64 + BPF.MOV + BPF.K, acc, 0, 0, 0) while sp < size do -- Load string chunk as imm32 local as_u32 = ffi.new('uint32_t [1]') local sub = b:sub(sp+1, sp+ffi.sizeof(as_u32)) ffi.copy(as_u32, sub, #sub) -- TODO: make this faster by interleaved load/compare steps with DW length emit(BPF.MEM + BPF.LDX + BPF.W, tmp, 10, -(base-sp), 0) emit(BPF.ALU64 + BPF.XOR + BPF.K, tmp, 0, 0, as_u32[0]) emit(BPF.ALU64 + BPF.OR + BPF.X, acc, tmp, 0, 0) sp = sp + ffi.sizeof(as_u32) end emit(BPF.JMP + BPF[op] + BPF.K, acc, 0, 0xffff, 0) code.seen_cmp = code.pc-1 end local function CMP_REG(a, b, op) -- Fold compile-time expressions if V[a].const and V[b].const and not (is_proxy(V[a].const) or is_proxy(V[b].const)) then code.seen_cmp = const_expr[op](V[a].const, V[b].const) and ALWAYS or NEVER else -- Comparison against compile-time string or stack memory if V[b].const and type(V[b].const) == 'string' then return CMP_STR(a, V[b].const, op) end -- The 0xFFFF target here has no significance, it's just a placeholder for -- compiler to replace it's absolute offset to LJ bytecode insn with a relative -- offset in BPF program code, verifier will accept only programs with valid JMP targets local a_reg, b_reg = vreg(a), vreg(b) emit(BPF.JMP + BPF[op] + BPF.X, a_reg, b_reg, 0xffff, 0) code.seen_cmp = code.pc-1 end end local function CMP_IMM(a, b, op) local c = V[a].const if c and not is_proxy(c) then -- Fold compile-time expressions code.seen_cmp = const_expr[op](c, b) and ALWAYS or NEVER else -- Convert imm32 to number if type(b) == 'string' then if #b == 1 then b = b:byte() elseif cdef.isptr(V[a].type) then -- String comparison between stack/constant string return CMP_STR(a, b, op) elseif #b <= 4 then -- Convert to u32 with network byte order local imm = ffi.new('uint32_t[1]') ffi.copy(imm, b, #b) b = builtins.hton(imm[0]) else error('NYI: compare register with string, where #string > sizeof(u32)') end end -- The 0xFFFF target here has no significance, it's just a placeholder for -- compiler to replace it's absolute offset to LJ bytecode insn with a relative -- offset in BPF program code, verifier will accept only programs with valid JMP targets local reg = vreg(a) emit(BPF.JMP + BPF[op] + BPF.K, reg, 0, 0xffff, b) code.seen_cmp = code.pc-1 -- Remember NULL pointer checks as BPF prohibits pointer comparisons -- and repeated checks wouldn't pass the verifier, only comparisons -- against constants are checked. if op == 'JEQ' and tonumber(b) == 0 and V[a].source then local pos = V[a].source:find('_or_null', 1, true) if pos then code.seen_null_guard = a end -- Inverse NULL pointer check (if a ~= nil) elseif op == 'JNE' and tonumber(b) == 0 and V[a].source then local pos = V[a].source:find('_or_null', 1, true) if pos then code.seen_null_guard = a code.seen_null_guard_inverse = true end end end end local function ALU_IMM(dst, a, b, op) -- Fold compile-time expressions if V[a].const and not is_proxy(V[a].const) then assert(cdef.isimmconst(V[a]), 'VAR '..a..' must be numeric') vset(dst, nil, const_expr[op](V[a].const, b)) -- Now we need to materialize dissected value at DST, and add it else vcopy(dst, a) local dst_reg = vreg(dst) if cdef.isptr(V[a].type) then vderef(dst_reg, dst_reg, V[a]) V[dst].type = V[a].const.__dissector else V[dst].type = V[a].type end emit(BPF.ALU64 + BPF[op] + BPF.K, dst_reg, 0, 0, b) end end local function ALU_REG(dst, a, b, op) -- Fold compile-time expressions if V[a].const and not (is_proxy(V[a].const) or is_proxy(V[b].const)) then assert(cdef.isimmconst(V[a]), 'VAR '..a..' must be numeric') assert(cdef.isimmconst(V[b]), 'VAR '..b..' must be numeric') if type(op) == 'string' then op = const_expr[op] end vcopy(dst, a) V[dst].const = op(V[a].const, V[b].const) else local src_reg = b and vreg(b) or 0 -- SRC is optional for unary operations if b and cdef.isptr(V[b].type) then -- We have to allocate a temporary register for dereferencing to preserve -- pointer in source variable that MUST NOT be altered reg_alloc(stackslots, 2) vderef(2, src_reg, V[b]) src_reg = 2 end vcopy(dst, a) -- DST may alias B, so copy must occur after we materialize B local dst_reg = vreg(dst) if cdef.isptr(V[a].type) then vderef(dst_reg, dst_reg, V[a]) V[dst].type = V[a].const.__dissector end emit(BPF.ALU64 + BPF[op] + BPF.X, dst_reg, src_reg, 0, 0) V[stackslots].reg = nil -- Free temporary registers end end local function ALU_IMM_NV(dst, a, b, op) -- Do DST = IMM(a) op VAR(b) where we can't invert because -- the registers are u64 but immediates are u32, so complement -- arithmetics wouldn't work vset(stackslots+1, nil, a) ALU_REG(dst, stackslots+1, b, op) end local function LD_ABS(dst, w, off) assert(off, 'LD_ABS called without offset') if w < 8 then local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0 emit(BPF.LD + BPF.ABS + const_width[w], dst_reg, 0, 0, off) if w > 1 and ffi.abi('le') then -- LD_ABS has htonl() semantics, reverse emit(BPF.ALU + BPF.END + BPF.TO_BE, dst_reg, 0, 0, w * 8) end elseif w == 8 then -- LD_ABS|IND prohibits DW, we need to do two W loads and combine them local tmp_reg = vreg(stackslots, 0, true, builtins.width_type(w)) -- Reserve R0 emit(BPF.LD + BPF.ABS + const_width[4], tmp_reg, 0, 0, off + 4) if ffi.abi('le') then -- LD_ABS has htonl() semantics, reverse emit(BPF.ALU + BPF.END + BPF.TO_BE, tmp_reg, 0, 0, 32) end ALU_IMM(stackslots, stackslots, 32, 'LSH') local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0, spill tmp variable emit(BPF.LD + BPF.ABS + const_width[4], dst_reg, 0, 0, off) if ffi.abi('le') then -- LD_ABS has htonl() semantics, reverse emit(BPF.ALU + BPF.END + BPF.TO_BE, dst_reg, 0, 0, 32) end ALU_REG(dst, dst, stackslots, 'OR') V[stackslots].reg = nil -- Free temporary registers else assert(w < 8, 'NYI: only LD_ABS of 1/2/4/8 is supported') end end local function LD_IND(dst, src, w, off) local src_reg = vreg(src) -- Must materialize first in case dst == src local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0 emit(BPF.LD + BPF.IND + const_width[w], dst_reg, src_reg, 0, off or 0) if w > 1 and ffi.abi('le') then -- LD_ABS has htonl() semantics, reverse emit(BPF.ALU + BPF.END + BPF.TO_BE, dst_reg, 0, 0, w * 8) end end local function LD_MEM(dst, src, w, off) local src_reg = vreg(src) -- Must materialize first in case dst == src local dst_reg = vreg(dst, nil, true, builtins.width_type(w)) -- Reserve R0 emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, src_reg, off or 0, 0) end -- @note: This is specific now as it expects registers reserved local function LD_IMM_X(dst_reg, src_type, imm, w) if w == 8 then -- IMM64 must be done in two instructions with imm64 = (lo(imm32), hi(imm32)) emit(BPF.LD + const_width[w], dst_reg, src_type, 0, ffi.cast('uint32_t', imm)) -- Must shift in two steps as bit.lshift supports [0..31] emit(0, 0, 0, 0, ffi.cast('uint32_t', bit.lshift(bit.lshift(imm, 16), 16))) else emit(BPF.LD + const_width[w], dst_reg, src_type, 0, imm) end end local function BUILTIN(func, ...) local builtin_export = { -- Compiler primitives (work with variable slots, emit instructions) V=V, vreg=vreg, vset=vset, vcopy=vcopy, vderef=vderef, valloc=valloc, emit=emit, reg_alloc=reg_alloc, reg_spill=reg_spill, tmpvar=stackslots, const_width=const_width, -- Extensions and helpers (use with care) LD_IMM_X = LD_IMM_X, } func(builtin_export, ...) end local function LOAD(dst, src, off, vtype) local base = V[src].const assert(base and base.__dissector, 'NYI: load() on variable that doesn\'t have dissector') assert(V[src].source, 'NYI: load() on variable with unknown source') -- Cast to different type if requested vtype = vtype or base.__dissector local w = ffi.sizeof(vtype) assert(const_width[w], 'NYI: load() supports 1/2/4/8 bytes at a time only, wanted ' .. tostring(w)) -- Packet access with a dissector (use BPF_LD) if V[src].source:find('ptr_to_pkt', 1, true) then if base.off then -- Absolute address to payload LD_ABS(dst, w, off + base.off) else -- Indirect address to payload LD_IND(dst, src, w, off) end -- Direct access to first argument (skb fields, pt regs, ...) elseif V[src].source:find('ptr_to_ctx', 1, true) then LD_MEM(dst, src, w, off) -- Direct skb access with a dissector (use BPF_MEM) elseif V[src].source:find('ptr_to_skb', 1, true) then LD_MEM(dst, src, w, off) -- Pointer to map-backed memory (use BPF_MEM) elseif V[src].source:find('ptr_to_map_value', 1, true) then LD_MEM(dst, src, w, off) -- Indirect read using probe (uprobe or kprobe, uses helper) elseif V[src].source:find('ptr_to_probe', 1, true) then BUILTIN(builtins[builtins.probe_read], nil, dst, src, vtype, off) V[dst].source = V[src].source -- Builtin handles everything else error('NYI: load() on variable from ' .. V[src].source) end V[dst].type = vtype V[dst].const = nil -- Dissected value is not constant anymore end local function CALL(a, b, d) assert(b-1 <= 1, 'NYI: CALL with >1 return values') -- Perform either compile-time, helper, or builtin local func = V[a].const -- Gather all arguments and check if they're constant local args, const, nargs = {}, true, d - 1 for i = a+1, a+d-1 do table.insert(args, V[i].const) if not V[i].const or is_proxy(V[i].const) then const = false end end local builtin = builtins[func] if not const or nargs == 0 then if builtin and type(builtin) == 'function' then args = {a} for i = a+1, a+nargs do table.insert(args, i) end BUILTIN(builtin, unpack(args)) elseif V[a+2] and V[a+2].const then -- var OP imm ALU_IMM(a, a+1, V[a+2].const, builtin) elseif nargs <= 2 then -- var OP var ALU_REG(a, a+1, V[a+2] and a+2, builtin) else error('NYI: CALL non-builtin with 3 or more arguments') end -- Call on dissector implies slice retrieval elseif type(func) == 'table' and func.__dissector then assert(nargs >= 2, 'NYI: <dissector>.slice(a, b) must have at least two arguments') assert(V[a+1].const and V[a+2].const, 'NYI: slice() arguments must be constant') local off = V[a+1].const local vtype = builtins.width_type(V[a+2].const - off) -- Access to packet via packet (use BPF_LD) if V[a].source and V[a].source:find('ptr_to_', 1, true) then LOAD(a, a, off, vtype) else error('NYI: <dissector>.slice(a, b) on non-pointer memory ' .. (V[a].source or 'unknown')) end -- Strict builtins cannot be expanded on compile-time elseif builtins_strict[func] and builtin then args = {a} for i = a+1, a+nargs do table.insert(args, i) end BUILTIN(builtin, unpack(args)) -- Attempt compile-time call expansion (expects all argument compile-time known) else assert(const, 'NYI: CALL attempted on constant arguments, but at least one argument is not constant') V[a].const = func(unpack(args)) end end local function MAP_INIT(map_var, key, imm) local map = V[map_var].const vreg(map_var, 1, true, ffi.typeof('uint64_t')) -- Reserve R1 and load ptr for process-local map fd LD_IMM_X(1, BPF.PSEUDO_MAP_FD, map.fd, ffi.sizeof(V[map_var].type)) V[map_var].reg = nil -- R1 will be invalidated after CALL, forget register allocation -- Reserve R2 and load R2 = key pointer local key_size = ffi.sizeof(map.key_type) local w = const_width[key_size] or BPF.DW local pod_type = const_width[key_size] local sp = stack_top + key_size -- Must use stack below spill slots -- Store immediate value on stack reg_alloc(stackslots, 2) -- Spill anything in R2 (unnamed tmp variable) local key_base = key and V[key].const imm = imm or key_base if imm and (not key or not is_proxy(key_base)) then assert(pod_type, 'NYI: map[const K], K width must be 1/2/4/8') emit(BPF.MEM + BPF.ST + w, 10, 0, -sp, imm) -- Key is in register, spill it elseif V[key].reg and pod_type then if cdef.isptr(V[key].type) then -- There is already pointer in register, dereference before spilling emit(BPF.MEM + BPF.LDX + w, 2, V[key].reg, 0, 0) emit(BPF.MEM + BPF.STX + w, 10, 2, -sp, 0) else -- Variable in register is POD, spill it on the stack emit(BPF.MEM + BPF.STX + w, 10, V[key].reg, -sp, 0) end -- Key is spilled from register to stack elseif V[key].spill then sp = V[key].spill -- Key is already on stack, write to base-relative address elseif key_base.__base then assert(key_size == ffi.sizeof(V[key].type), 'VAR '..key..' type incompatible with BPF map key type') sp = key_base.__base else error('VAR '..key..' is neither const-expr/register/stack/spilled') end -- If [FP+K] addressing, emit it if sp then emit(BPF.ALU64 + BPF.MOV + BPF.X, 2, 10, 0, 0) emit(BPF.ALU64 + BPF.ADD + BPF.K, 2, 0, 0, -sp) end end local function MAP_GET(dst, map_var, key, imm) local map = V[map_var].const MAP_INIT(map_var, key, imm) -- Flag as pointer type and associate dissector for map value type vreg(dst, 0, true, ffi.typeof('uint8_t *')) V[dst].const = {__dissector=map.val_type} V[dst].source = 'ptr_to_map_value_or_null' emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_lookup_elem) V[stackslots].reg = nil -- Free temporary registers end local function MAP_DEL(map_var, key, key_imm) -- Set R0, R1 (map fd, preempt R0) reg_alloc(stackslots, 0) -- Spill anything in R0 (unnamed tmp variable) MAP_INIT(map_var, key, key_imm) emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_delete_elem) V[stackslots].reg = nil -- Free temporary registers end local function MAP_SET(map_var, key, key_imm, src) local map = V[map_var].const -- Delete when setting nil if V[src].type == ffi.typeof('void') then return MAP_DEL(map_var, key, key_imm) end -- Set R0, R1 (map fd, preempt R0) reg_alloc(stackslots, 0) -- Spill anything in R0 (unnamed tmp variable) MAP_INIT(map_var, key, key_imm) reg_alloc(stackslots, 4) -- Spill anything in R4 (unnamed tmp variable) emit(BPF.ALU64 + BPF.MOV + BPF.K, 4, 0, 0, 0) -- BPF_ANY, create new element or update existing -- Reserve R3 for value pointer reg_alloc(stackslots, 3) -- Spill anything in R3 (unnamed tmp variable) local val_size = ffi.sizeof(map.val_type) local w = const_width[val_size] or BPF.DW local pod_type = const_width[val_size] -- Stack pointer must be aligned to both key/value size and have enough headroom for (key, value) local sp = stack_top + ffi.sizeof(map.key_type) + val_size sp = sp + (sp % val_size) local base = V[src].const if base and not is_proxy(base) then assert(pod_type, 'NYI: MAP[K] = imm V; V width must be 1/2/4/8') emit(BPF.MEM + BPF.ST + w, 10, 0, -sp, base) -- Value is in register, spill it elseif V[src].reg and pod_type then -- Value is a pointer, derefernce it and spill it if cdef.isptr(V[src].type) then vderef(3, V[src].reg, V[src]) emit(BPF.MEM + BPF.STX + w, 10, 3, -sp, 0) else emit(BPF.MEM + BPF.STX + w, 10, V[src].reg, -sp, 0) end -- We get a pointer to spilled register on stack elseif V[src].spill then -- If variable is a pointer, we can load it to R3 directly (save "LEA") if cdef.isptr(V[src].type) then reg_fill(src, 3) -- If variable is a stack pointer, we don't have to check it if base.__base then emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_update_elem) return end vderef(3, V[src].reg, V[src]) emit(BPF.MEM + BPF.STX + w, 10, 3, -sp, 0) else sp = V[src].spill end -- Value is already on stack, write to base-relative address elseif base.__base then if val_size ~= ffi.sizeof(V[src].type) then local err = string.format('VAR %d type (%s) incompatible with BPF map value type (%s): expected %d, got %d', src, V[src].type, map.val_type, val_size, ffi.sizeof(V[src].type)) error(err) end sp = base.__base -- Value is constant, materialize it on stack else error('VAR '.. src ..' is neither const-expr/register/stack/spilled') end emit(BPF.ALU64 + BPF.MOV + BPF.X, 3, 10, 0, 0) emit(BPF.ALU64 + BPF.ADD + BPF.K, 3, 0, 0, -sp) emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_update_elem) V[stackslots].reg = nil -- Free temporary registers end -- Finally - this table translates LuaJIT bytecode into code emitter actions. local BC = { -- Constants KNUM = function(a, _, c, _) -- KNUM if c < 2147483648 then vset(a, nil, c, ffi.typeof('int32_t')) else vset(a, nil, c, ffi.typeof('uint64_t')) end end, KSHORT = function(a, _, _, d) -- KSHORT vset(a, nil, d, ffi.typeof('int16_t')) end, KCDATA = function(a, _, c, _) -- KCDATA -- Coerce numeric types if possible local ct = ffi.typeof(c) if ffi.istype(ct, ffi.typeof('uint64_t')) or ffi.istype(ct, ffi.typeof('int64_t')) then vset(a, nil, c, ct) elseif tonumber(c) ~= nil then -- TODO: this should not be possible vset(a, nil, tonumber(c), ct) else error('NYI: cannot use CDATA constant of type ' .. ct) end end, KPRI = function(a, _, _, d) -- KPRI -- KNIL is 0, must create a special type to identify it local vtype = (d < 1) and ffi.typeof('void') or ffi.typeof('uint8_t') vset(a, nil, (d < 2) and 0 or 1, vtype) end, KSTR = function(a, _, c, _) -- KSTR vset(a, nil, c, ffi.typeof('const char[?]')) end, MOV = function(a, _, _, d) -- MOV var, var vcopy(a, d) end, -- Comparison ops -- Note: comparisons are always followed by JMP opcode, that -- will fuse following JMP to JMP+CMP instruction in BPF -- Note: we're narrowed to integers, so operand/operator inversion is legit ISLT = function(a, _, _, d) return CMP_REG(d, a, 'JGE') end, -- (a < d) (inverted) ISGE = function(a, _, _, d) return CMP_REG(a, d, 'JGE') end, -- (a >= d) ISGT = function(a, _, _, d) return CMP_REG(a, d, 'JGT') end, -- (a > d) ISEQV = function(a, _, _, d) return CMP_REG(a, d, 'JEQ') end, -- (a == d) ISNEV = function(a, _, _, d) return CMP_REG(a, d, 'JNE') end, -- (a ~= d) ISEQS = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- (a == str(c)) ISNES = function(a, _, c, _) return CMP_IMM(a, c, 'JNE') end, -- (a ~= str(c)) ISEQN = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- (a == c) ISNEN = function(a, _, c, _) return CMP_IMM(a, c, 'JNE') end, -- (a ~= c) IST = function(_, _, _, d) return CMP_IMM(d, 0, 'JNE') end, -- (d) ISF = function(_, _, _, d) return CMP_IMM(d, 0, 'JEQ') end, -- (not d) ISEQP = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- ISEQP (a == c) -- Binary operations with RHS constants ADDVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'ADD') end, SUBVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'SUB') end, MULVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'MUL') end, DIVVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'DIV') end, MODVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'MOD') end, -- Binary operations with LHS constants -- Cheat code: we're narrowed to integer arithmetic, so MUL+ADD are commutative ADDNV = function(a, b, c, _) return ALU_IMM(a, b, c, 'ADD') end, -- ADDNV MULNV = function(a, b, c, _) return ALU_IMM(a, b, c, 'MUL') end, -- MULNV SUBNV = function(a, b, c, _) return ALU_IMM_NV(a, c, b, 'SUB') end, -- SUBNV DIVNV = function(a, b, c, _) return ALU_IMM_NV(a, c, b, 'DIV') end, -- DIVNV -- Binary operations between registers ADDVV = function(a, b, _, d) return ALU_REG(a, b, d, 'ADD') end, SUBVV = function(a, b, _, d) return ALU_REG(a, b, d, 'SUB') end, MULVV = function(a, b, _, d) return ALU_REG(a, b, d, 'MUL') end, DIVVV = function(a, b, _, d) return ALU_REG(a, b, d, 'DIV') end, MODVV = function(a, b, _, d) return ALU_REG(a, b, d, 'MOD') end, -- Strings CAT = function(a, b, _, d) -- CAT A = B ~ D assert(V[b].const and V[d].const, 'NYI: CAT only works on compile-time expressions') assert(type(V[b].const) == 'string' and type(V[d].const) == 'string', 'NYI: CAT only works on compile-time strings') vset(a, nil, V[b].const .. V[d].const) end, -- Tables GGET = function (a, _, c, _) -- GGET (A = GLOBAL[c]) if env[c] ~= nil then vset(a, nil, env[c]) else error(string.format("undefined global '%s'", c)) end end, UGET = function (a, _, c, _) -- UGET (A = UPVALUE[c]) if env[c] ~= nil then vset(a, nil, env[c]) else error(string.format("undefined upvalue '%s'", c)) end end, TSETB = function (a, b, _, d) -- TSETB (B[D] = A) assert(V[b] and type(V[b].const) == 'table', 'NYI: B[D] where B is not Lua table, BPF map, or pointer') local vinfo = V[b].const if vinfo.__map then -- BPF map read (constant) return MAP_SET(b, nil, d, a) -- D is literal elseif vinfo.__dissector then assert(vinfo.__dissector, 'NYI: B[D] where B does not have a known element size') local w = ffi.sizeof(vinfo.__dissector) -- TODO: support vectorized moves larger than register width assert(const_width[w], 'B[C] = A, sizeof(A) must be 1/2/4/8') local src_reg, const = vscalar(a, w) -- If changing map value, write to absolute address + offset if V[b].source and V[b].source:find('ptr_to_map_value', 1, true) then local dst_reg = vreg(b) -- Optimization: immediate values (imm32) can be stored directly if type(const) == 'number' then emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, d, const) else emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, d, 0) end -- Table is already on stack, write to vinfo-relative address elseif vinfo.__base then -- Optimization: immediate values (imm32) can be stored directly if type(const) == 'number' then emit(BPF.MEM + BPF.ST + const_width[w], 10, 0, -vinfo.__base + (d * w), const) else emit(BPF.MEM + BPF.STX + const_width[w], 10, src_reg, -vinfo.__base + (d * w), 0) end else error('NYI: B[D] where B is not Lua table, BPF map, or pointer') end elseif vinfo and vinfo and V[a].const then vinfo[V[d].const] = V[a].const else error('NYI: B[D] where B is not Lua table, BPF map, or pointer') end end, TSETV = function (a, b, _, d) -- TSETV (B[D] = A) assert(V[b] and type(V[b].const) == 'table', 'NYI: B[D] where B is not Lua table, BPF map, or pointer') local vinfo = V[b].const if vinfo.__map then -- BPF map read (constant) return MAP_SET(b, d, nil, a) -- D is variable elseif vinfo.__dissector then assert(vinfo.__dissector, 'NYI: B[D] where B does not have a known element size') local w = ffi.sizeof(vinfo.__dissector) -- TODO: support vectorized moves larger than register width assert(const_width[w], 'B[C] = A, sizeof(A) must be 1/2/4/8') local src_reg, const = vscalar(a, w) -- If changing map value, write to absolute address + offset if V[b].source and V[b].source:find('ptr_to_map_value', 1, true) then -- Calculate variable address from two registers local tmp_var = stackslots + 1 vset(tmp_var, nil, d) ALU_REG(tmp_var, tmp_var, b, 'ADD') local dst_reg = vreg(tmp_var) V[tmp_var].reg = nil -- Only temporary allocation -- Optimization: immediate values (imm32) can be stored directly if type(const) == 'number' and w < 8 then emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, 0, const) else emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, 0, 0) end -- Table is already on stack, write to vinfo-relative address elseif vinfo.__base then -- Calculate variable address from two registers local tmp_var = stackslots + 1 vcopy(tmp_var, d) -- Element position if w > 1 then ALU_IMM(tmp_var, tmp_var, w, 'MUL') -- multiply by element size end local dst_reg = vreg(tmp_var) -- add R10 (stack pointer) emit(BPF.ALU64 + BPF.ADD + BPF.X, dst_reg, 10, 0, 0) V[tmp_var].reg = nil -- Only temporary allocation -- Optimization: immediate values (imm32) can be stored directly if type(const) == 'number' and w < 8 then emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, -vinfo.__base, const) else emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, -vinfo.__base, 0) end else error('NYI: B[D] where B is not Lua table, BPF map, or pointer') end elseif vinfo and V[d].const and V[a].const then vinfo[V[d].const] = V[a].const else error('NYI: B[D] where B is not Lua table, BPF map, or pointer') end end, TSETS = function (a, b, c, _) -- TSETS (B[C] = A) assert(V[b] and V[b].const, 'NYI: B[D] where B is not Lua table, BPF map, or pointer') local base = V[b].const if base.__dissector then local ofs,bpos = ffi.offsetof(base.__dissector, c) assert(not bpos, 'NYI: B[C] = A, where C is a bitfield') local w = builtins.sizeofattr(base.__dissector, c) -- TODO: support vectorized moves larger than register width assert(const_width[w], 'B[C] = A, sizeof(A) must be 1/2/4/8') local src_reg, const = vscalar(a, w) -- If changing map value, write to absolute address + offset if V[b].source and V[b].source:find('ptr_to_map_value', 1, true) then local dst_reg = vreg(b) -- Optimization: immediate values (imm32) can be stored directly if type(const) == 'number' and w < 8 then emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, ofs, const) else emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, ofs, 0) end -- Table is already on stack, write to base-relative address elseif base.__base then -- Optimization: immediate values (imm32) can be stored directly if type(const) == 'number' and w < 8 then emit(BPF.MEM + BPF.ST + const_width[w], 10, 0, -base.__base + ofs, const) else emit(BPF.MEM + BPF.STX + const_width[w], 10, src_reg, -base.__base + ofs, 0) end else error('NYI: B[C] where B is not Lua table, BPF map, or pointer') end elseif V[a].const then base[c] = V[a].const else error('NYI: B[C] where B is not Lua table, BPF map, or pointer') end end, TGETB = function (a, b, _, d) -- TGETB (A = B[D]) local base = V[b].const assert(type(base) == 'table', 'NYI: B[C] where C is string and B not Lua table or BPF map') if a ~= b then vset(a) end if base.__map then -- BPF map read (constant) MAP_GET(a, b, nil, d) -- Pointer access with a dissector (traditional uses BPF_LD, direct uses BPF_MEM) elseif V[b].source and V[b].source:find('ptr_to_') then local vtype = base.__dissector and base.__dissector or ffi.typeof('uint8_t') LOAD(a, b, d, vtype) -- Specialise PTR[0] as dereference operator elseif cdef.isptr(V[b].type) and d == 0 then vcopy(a, b) local dst_reg = vreg(a) vderef(dst_reg, dst_reg, V[a]) V[a].type = V[a].const.__dissector else error('NYI: A = B[D], where B is not Lua table or packet dissector or pointer dereference') end end, TGETV = function (a, b, _, d) -- TGETV (A = B[D]) local base = V[b].const assert(type(base) == 'table', 'NYI: B[C] where C is string and B not Lua table or BPF map') if a ~= b then vset(a) end if base.__map then -- BPF map read MAP_GET(a, b, d) -- Pointer access with a dissector (traditional uses BPF_LD, direct uses BPF_MEM) elseif V[b].source and V[b].source:find('ptr_to_') then local vtype = base.__dissector and base.__dissector or ffi.typeof('uint8_t') LOAD(a, b, d, vtype) -- Constant dereference elseif type(V[d].const) == 'number' then V[a].const = base[V[d].const] else error('NYI: A = B[D], where B is not Lua table or packet dissector or pointer dereference') end end, TGETS = function (a, b, c, _) -- TGETS (A = B[C]) local base = V[b].const assert(type(base) == 'table', 'NYI: B[C] where C is string and B not Lua table or BPF map') if a ~= b then vset(a) end if base.__dissector then local ofs,bpos,bsize = ffi.offsetof(base.__dissector, c) -- Resolve table key using metatable if not ofs and type(base.__dissector[c]) == 'string' then c = base.__dissector[c] ofs,bpos,bsize = ffi.offsetof(base.__dissector, c) end if not ofs and proto[c] then -- Load new dissector on given offset BUILTIN(proto[c], a, b, c) else -- Loading register from offset is a little bit tricky as there are -- several data sources and value loading modes with different restrictions -- such as checking pointer values for NULL compared to using stack. assert(ofs, tostring(base.__dissector)..'.'..c..' attribute not exists') if a ~= b then vset(a) end -- Dissected value is probably not constant anymore local new_const = nil local w, atype = builtins.sizeofattr(base.__dissector, c) -- [SP+K] addressing using R10 (stack pointer) -- Doesn't need to be checked for NULL if base.__base and base.__base > 0 then if cdef.isptr(atype) then -- If the member is pointer type, update base pointer with offset new_const = {__base = base.__base-ofs} else local dst_reg = vreg(a, nil, true) emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, 10, -base.__base+ofs, 0) end -- Pointer access with a dissector (traditional uses BPF_LD, direct uses BPF_MEM) elseif V[b].source and V[b].source:find('ptr_to_') then LOAD(a, b, ofs, atype) else error('NYI: B[C] where B is not Lua table, BPF map, or pointer') end -- Bitfield, must be further narrowed with a bitmask/shift if bpos then local mask = 0 for i=bpos+1,bpos+bsize do mask = bit.bor(mask, bit.lshift(1, w*8-i)) end emit(BPF.ALU64 + BPF.AND + BPF.K, vreg(a), 0, 0, mask) -- Free optimization: single-bit values need just boolean result if bsize > 1 then local shift = w*8-bsize-bpos if shift > 0 then emit(BPF.ALU64 + BPF.RSH + BPF.K, vreg(a), 0, 0, shift) end end end V[a].type = atype V[a].const = new_const V[a].source = V[b].source -- Track direct access to skb data -- see https://www.kernel.org/doc/Documentation/networking/filter.txt "Direct packet access" if ffi.istype(base.__dissector, ffi.typeof('struct sk_buff')) then -- Direct access to skb uses skb->data and skb->data_end -- which are encoded as u32, but are actually pointers if c == 'data' or c == 'data_end' then V[a].const = {__dissector = ffi.typeof('uint8_t')} V[a].source = 'ptr_to_skb' end end end else V[a].const = base[c] end end, -- Loops and branches CALLM = function (a, b, _, d) -- A = A(A+1, ..., A+D+MULTRES) -- NYI: Support single result only CALL(a, b, d+2) end, CALL = function (a, b, _, d) -- A = A(A+1, ..., A+D-1) CALL(a, b, d) end, JMP = function (a, _, c, _) -- JMP -- Discard unused slots after jump for i, _ in pairs(V) do if i >= a and i < stackslots then V[i] = nil end end -- Cross basic block boundary if the jump target isn't provably unreachable local val = code.fixup[c] or {} if code.seen_cmp and code.seen_cmp ~= ALWAYS then if code.seen_cmp ~= NEVER then -- Do not emit the jump or fixup -- Store previous CMP insn for reemitting after compensation code local jmpi = ffi.new('struct bpf_insn', code.insn[code.pc-1]) code.pc = code.pc - 1 -- First branch point, emit compensation code local Vcomp = Vstate[c] if not Vcomp then -- Select scratch register (R0-5) that isn't used as operand -- in the CMP instruction, as the variable may not be live, after -- the JMP, but it may be used in the JMP+CMP instruction itself local tmp_reg = 0 for reg = 0, 5 do if reg ~= jmpi.dst_reg and reg ~= jmpi.src_reg then tmp_reg = reg break end end -- Force materialization of constants at the end of BB for i, v in pairs(V) do if not v.reg and cdef.isimmconst(v) then vreg(i, tmp_reg) -- Load to TMP register (not saved) reg_spill(i) -- Spill caller-saved registers end end -- Record variable state Vstate[c] = V Vcomp = V V = table_copy(V) -- Variable state already set, emit specific compensation code else bb_end(Vcomp) end -- Record pointer NULL check from condition -- If the condition checks pointer variable against NULL, -- we can assume it will not be NULL in the fall-through block if code.seen_null_guard then local var = code.seen_null_guard -- The null guard can have two forms: -- if x == nil then goto -- if x ~= nil then goto -- First form guarantees that the variable will be non-nil on the following instruction -- Second form guarantees that the variable will be non-nil at the jump target local vinfo = code.seen_null_guard_inverse and Vcomp[var] or V[var] if vinfo.source then local pos = vinfo.source:find('_or_null', 1, true) if pos then vinfo.source = vinfo.source:sub(1, pos - 1) end end end -- Reemit CMP insn emit(jmpi.code, jmpi.dst_reg, jmpi.src_reg, jmpi.off, jmpi.imm) -- Fuse JMP into previous CMP opcode, mark JMP target for fixup -- as we don't knot the relative offset in generated code yet table.insert(val, code.pc-1) code.fixup[c] = val end code.seen_cmp = nil code.seen_null_guard = nil code.seen_null_guard_inverse = nil elseif c == code.bc_pc + 1 then -- luacheck: ignore 542 -- Eliminate jumps to next immediate instruction -- e.g. 0002 JMP 1 => 0003 else -- We need to synthesise a condition that's always true, however -- BPF prohibits pointer arithmetic to prevent pointer leaks -- so we have to clear out one register and use it for cmp that's always true local dst_reg = reg_alloc(stackslots) V[stackslots].reg = nil -- Only temporary allocation -- First branch point, emit compensation code local Vcomp = Vstate[c] if not Vcomp then -- Force materialization of constants at the end of BB for i, v in pairs(V) do if not v.reg and cdef.isimmconst(v) then vreg(i, dst_reg) -- Load to TMP register (not saved) reg_spill(i) -- Spill caller-saved registers end end -- Record variable state Vstate[c] = V V = table_copy(V) -- Variable state already set, emit specific compensation code else bb_end(Vcomp) end emit(BPF.ALU64 + BPF.MOV + BPF.K, dst_reg, 0, 0, 0) emit(BPF.JMP + BPF.JEQ + BPF.K, dst_reg, 0, 0xffff, 0) table.insert(val, code.pc-1) -- Fixup JMP target code.reachable = false -- Code following the JMP is not reachable code.fixup[c] = val end end, RET1 = function (a, _, _, _) -- RET1 -- Free optimisation: spilled variable will not be filled again for i, v in pairs(V) do if i ~= a then v.reg = nil end end if V[a].reg ~= 0 then vreg(a, 0) end -- Convenience: dereference pointer variables -- e.g. 'return map[k]' will return actual map value, not pointer if cdef.isptr(V[a].type) then vderef(0, 0, V[a]) end emit(BPF.JMP + BPF.EXIT, 0, 0, 0, 0) code.reachable = false end, RET0 = function (_, _, _, _) -- RET0 emit(BPF.ALU64 + BPF.MOV + BPF.K, 0, 0, 0, 0) emit(BPF.JMP + BPF.EXIT, 0, 0, 0, 0) code.reachable = false end, compile = function () return code end } -- Composite instructions function BC.CALLT(a, _, _, d) -- Tailcall: return A(A+1, ..., A+D-1) CALL(a, 1, d) BC.RET1(a) end -- Always initialize R6 with R1 context emit(BPF.ALU64 + BPF.MOV + BPF.X, 6, 1, 0, 0) -- Register R6 as context variable (first argument) if params and params > 0 then vset(0, 6, param_types[1] or proto.skb) assert(V[0].source == V[0].const.source) -- Propagate source annotation from typeinfo end -- Register tmpvars vset(stackslots) vset(stackslots+1) return setmetatable(BC, { __index = function (_, k, _) if type(k) == 'number' then local op_str = string.sub(require('jit.vmdef').bcnames, 6*k+1, 6*k+6) error(string.format("NYI: opcode '0x%02x' (%-04s)", k, op_str)) end end, __call = function (t, op, a, b, c, d) code.bc_pc = code.bc_pc + 1 -- Exitting BB straight through, emit compensation code if Vstate[code.bc_pc] then if code.reachable then -- Instruction is reachable from previous line -- so we must make the variable allocation consistent -- with the variable allocation at the jump source -- e.g. 0001 x:R0 = 5 -- 0002 if rand() then goto 0005 -- 0003 x:R0 -> x:stack -- 0004 y:R0 = 5 -- 0005 x:? = 10 <-- x was in R0 before jump, and stack after jump bb_end(Vstate[code.bc_pc]) else -- Instruction isn't reachable from previous line, restore variable layout -- e.g. RET or condition-less JMP on previous line V = table_copy(Vstate[code.bc_pc]) end end -- Perform fixup of jump targets -- We need to do this because the number of consumed and emitted -- bytecode instructions is different local fixup = code.fixup[code.bc_pc] if fixup ~= nil then -- Patch JMP source insn with relative offset for _,pc in ipairs(fixup) do code.insn[pc].off = code.pc - 1 - pc end code.fixup[code.bc_pc] = nil code.reachable = true end -- Execute if code.reachable then assert(t[op], string.format('NYI: instruction %s, parameters: %s,%s,%s,%s', op,a,b,c,d)) return t[op](a, b, c, d) end end, }) end -- Emitted code dump local function dump_mem(cls, ins, _, fuse) -- This is a very dense MEM instruction decoder without much explanation -- Refer to https://www.kernel.org/doc/Documentation/networking/filter.txt for instruction format local mode = bit.band(ins.code, 0xe0) if mode == BPF.XADD then cls = 5 end -- The only mode local op_1 = {'LD', 'LDX', 'ST', 'STX', '', 'XADD'} local op_2 = {[0]='W', [8]='H', [16]='B', [24]='DW'} local name = op_1[cls+1] .. op_2[bit.band(ins.code, 0x18)] local off = tonumber(ffi.cast('int16_t', ins.off)) -- Reinterpret as signed local dst = cls < 2 and 'R'..ins.dst_reg or string.format('[R%d%+d]', ins.dst_reg, off) local src = cls % 2 == 0 and '#'..ins.imm or 'R'..ins.src_reg if cls == BPF.LDX then src = string.format('[R%d%+d]', ins.src_reg, off) end if mode == BPF.ABS then src = string.format('skb[%d]', ins.imm) end if mode == BPF.IND then src = string.format('skb[R%d%+d]', ins.src_reg, ins.imm) end return string.format('%s\t%s\t%s', fuse and '' or name, fuse and '' or dst, src) end local function dump_alu(cls, ins, pc) local alu = {'ADD', 'SUB', 'MUL', 'DIV', 'OR', 'AND', 'LSH', 'RSH', 'NEG', 'MOD', 'XOR', 'MOV', 'ARSH', 'END' } local jmp = {'JA', 'JEQ', 'JGT', 'JGE', 'JSET', 'JNE', 'JSGT', 'JSGE', 'CALL', 'EXIT'} local helper = {'unspec', 'map_lookup_elem', 'map_update_elem', 'map_delete_elem', 'probe_read', 'ktime_get_ns', 'trace_printk', 'get_prandom_u32', 'get_smp_processor_id', 'skb_store_bytes', 'l3_csum_replace', 'l4_csum_replace', 'tail_call', 'clone_redirect', 'get_current_pid_tgid', 'get_current_uid_gid', 'get_current_comm', 'get_cgroup_classid', 'skb_vlan_push', 'skb_vlan_pop', 'skb_get_tunnel_key', 'skb_set_tunnel_key', 'perf_event_read', 'redirect', 'get_route_realm', 'perf_event_output', 'skb_load_bytes'} local op = 0 -- This is a very dense ALU instruction decoder without much explanation -- Refer to https://www.kernel.org/doc/Documentation/networking/filter.txt for instruction format for i = 0,13 do if 0x10 * i == bit.band(ins.code, 0xf0) then op = i + 1 break end end local name = (cls == 5) and jmp[op] or alu[op] local src = (bit.band(ins.code, 0x08) == BPF.X) and 'R'..ins.src_reg or '#'..ins.imm local target = (cls == 5 and op < 9) and string.format('\t=> %04d', pc + ins.off + 1) or '' if cls == 5 and op == 9 then target = string.format('\t; %s', helper[ins.imm + 1] or tostring(ins.imm)) end return string.format('%s\t%s\t%s%s', name, 'R'..ins.dst_reg, src, target) end local function dump_string(code, off, hide_counter) if not code then return end local cls_map = { [0] = dump_mem, [1] = dump_mem, [2] = dump_mem, [3] = dump_mem, [4] = dump_alu, [5] = dump_alu, [7] = dump_alu, } local result = {} local fused = false for i = off or 0, code.pc - 1 do local ins = code.insn[i] local cls = bit.band(ins.code, 0x07) local line = cls_map[cls](cls, ins, i, fused) if hide_counter then table.insert(result, line) else table.insert(result, string.format('%04u\t%s', i, line)) end fused = string.find(line, 'LDDW', 1) end return table.concat(result, '\n') end local function dump(code) if not code then return end print(string.format('-- BPF %s:0-%u', code.insn, code.pc)) print(dump_string(code)) end local function compile(prog, params) -- Create code emitter sandbox, include caller locals local env = { pkt=proto.pkt, eth=proto.pkt, BPF=BPF, ffi=ffi } -- Include upvalues up to 4 nested scopes back -- the narrower scope overrides broader scope for k = 5, 2, -1 do local i = 1 while true do local ok, n, v = pcall(debug.getlocal, k, i) if not ok or not n then break end env[n] = v i = i + 1 end end setmetatable(env, { __index = function (_, k) return proto[k] or builtins[k] or _G[k] end }) -- Create code emitter and compile LuaJIT bytecode if type(prog) == 'string' then prog = loadstring(prog) end -- Create error handler to print traceback local funci, pc = bytecode.funcinfo(prog), 0 local E = create_emitter(env, funci.stackslots, funci.params, params or {}) local on_err = function (e) funci = bytecode.funcinfo(prog, pc) local from, to = 0, 0 for _ = 1, funci.currentline do from = to to = string.find(funci.source, '\n', from+1, true) or 0 end print(funci.loc..':'..string.sub(funci.source, from+1, to-1)) print('error: '..e) print(debug.traceback()) end for _,op,a,b,c,d in bytecode.decoder(prog) do local ok, _, err = xpcall(E,on_err,op,a,b,c,d) if not ok then return nil, err end end return E:compile() end -- BPF map interface local bpf_map_mt = { __gc = function (map) S.close(map.fd) end, __len = function(map) return map.max_entries end, __index = function (map, k) if type(k) == 'string' then -- Return iterator if k == 'pairs' then return function(t, key) -- Get next key local next_key = ffi.new(ffi.typeof(t.key)) local cur_key if key then cur_key = t.key t.key[0] = key else cur_key = ffi.new(ffi.typeof(t.key)) end local ok, err = S.bpf_map_op(S.c.BPF_CMD.MAP_GET_NEXT_KEY, map.fd, cur_key, next_key) if not ok then return nil, err end -- Get next value assert(S.bpf_map_op(S.c.BPF_CMD.MAP_LOOKUP_ELEM, map.fd, next_key, map.val)) return next_key[0], map.val[0] end, map, nil -- Read for perf event map elseif k == 'reader' then return function (pmap, pid, cpu, event_type) -- Caller must either specify PID or CPU if not pid or pid < 0 then assert((cpu and cpu >= 0), 'NYI: creating composed reader for all CPUs') pid = -1 end -- Create BPF output reader local pe = S.t.perf_event_attr1() pe[0].type = 'software' pe[0].config = 'sw_bpf_output' pe[0].sample_type = 'raw' pe[0].sample_period = 1 pe[0].wakeup_events = 1 local reader, err = S.t.perf_reader(S.perf_event_open(pe, pid, cpu or -1)) if not reader then return nil, tostring(err) end -- Register event reader fd in BPF map assert(cpu < pmap.max_entries, string.format('BPF map smaller than read CPU %d', cpu)) pmap[cpu] = reader.fd -- Open memory map and start reading local ok, err = reader:start() assert(ok, tostring(err)) ok, err = reader:mmap() assert(ok, tostring(err)) return cdef.event_reader(reader, event_type) end -- Signalise this is a map type end return k == '__map' end -- Retrieve key map.key[0] = k local ok, err = S.bpf_map_op(S.c.BPF_CMD.MAP_LOOKUP_ELEM, map.fd, map.key, map.val) if not ok then return nil, err end return ffi.new(map.val_type, map.val[0]) end, __newindex = function (map, k, v) map.key[0] = k if v == nil then return S.bpf_map_op(map.fd, S.c.BPF_CMD.MAP_DELETE_ELEM, map.key, nil) end map.val[0] = v return S.bpf_map_op(S.c.BPF_CMD.MAP_UPDATE_ELEM, map.fd, map.key, map.val) end, } -- Linux tracing interface local function trace_check_enabled(path) path = path or '/sys/kernel/debug/tracing' if S.statfs(path) then return true end return nil, 'debugfs not accessible: "mount -t debugfs nodev /sys/kernel/debug"? missing sudo?' end -- Tracepoint interface local tracepoint_mt = { __index = { bpf = function (t, prog) if type(prog) ~= 'table' then -- Create protocol parser with source probe prog = compile(prog, {proto.type(t.type, {source='ptr_to_probe'})}) end -- Load the BPF program local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.TRACEPOINT, prog.insn, prog.pc) assert(prog_fd, tostring(err)..': '..tostring(log)) -- Open tracepoint and attach t.reader:setbpf(prog_fd:getfd()) table.insert(t.progs, prog_fd) return prog_fd end, } } -- Open tracepoint local function tracepoint_open(path, pid, cpu, group_fd) -- Open tracepoint and compile tracepoint type local tp = assert(S.perf_tracepoint('/sys/kernel/debug/tracing/events/'..path)) local tp_type = assert(cdef.tracepoint_type(path)) -- Open tracepoint reader and create interface local reader = assert(S.perf_attach_tracepoint(tp, pid, cpu, group_fd)) return setmetatable({tp=tp,type=tp_type,reader=reader,progs={}}, tracepoint_mt) end local function trace_bpf(ptype, pname, pdef, retprobe, prog, pid, cpu, group_fd) -- Load BPF program if type(prog) ~= 'table' then prog = compile(prog, {proto.pt_regs}) end local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.KPROBE, prog.insn, prog.pc) assert(prog_fd, tostring(err)..': '..tostring(log)) -- Open tracepoint and attach local tp, err = S.perf_probe(ptype, pname, pdef, retprobe) if not tp then prog_fd:close() return nil, tostring(err) end local reader, err = S.perf_attach_tracepoint(tp, pid, cpu, group_fd, {sample_type='raw, callchain'}) if not reader then prog_fd:close() S.perf_probe(ptype, pname, false) return nil, tostring(err) end local ok, err = reader:setbpf(prog_fd:getfd()) if not ok then prog_fd:close() reader:close() S.perf_probe(ptype, pname, false) return nil, tostring(err)..' (kernel version should be at least 4.1)' end -- Create GC closure for reader to close BPF program -- and detach probe in correct order ffi.gc(reader, function () prog_fd:close() reader:close() S.perf_probe(ptype, pname, false) end) return {reader=reader, prog=prog_fd, probe=pname, probe_type=ptype} end -- Module interface return setmetatable({ new = create_emitter, dump = dump, dump_string = dump_string, maps = {}, map = function (type, max_entries, key_ctype, val_ctype) if not key_ctype then key_ctype = ffi.typeof('uint32_t') end if not val_ctype then val_ctype = ffi.typeof('uint32_t') end if not max_entries then max_entries = 4096 end -- Special case for BPF_MAP_STACK_TRACE if S.c.BPF_MAP[type] == S.c.BPF_MAP.STACK_TRACE then key_ctype = ffi.typeof('int32_t') val_ctype = ffi.typeof('struct bpf_stacktrace') end local fd, err = S.bpf_map_create(S.c.BPF_MAP[type], ffi.sizeof(key_ctype), ffi.sizeof(val_ctype), max_entries) if not fd then return nil, tostring(err) end local map = setmetatable({ max_entries = max_entries, key = ffi.new(ffi.typeof('$ [1]', key_ctype)), val = ffi.new(ffi.typeof('$ [1]', val_ctype)), map_type = S.c.BPF_MAP[type], key_type = key_ctype, val_type = val_ctype, fd = fd:nogc():getfd(), }, bpf_map_mt) return map end, socket = function (sock, prog) -- Expect socket type, if sock is string then assume it's -- an interface name (e.g. 'lo'), if it's a number then typecast it as a socket local ok, err if type(sock) == 'string' then local iface = assert(S.nl.getlink())[sock] assert(iface, sock..' is not interface name') sock, err = S.socket('packet', 'raw') assert(sock, tostring(err)) ok, err = sock:bind(S.t.sockaddr_ll({protocol='all', ifindex=iface.index})) assert(ok, tostring(err)) elseif type(sock) == 'number' then sock = S.t.fd(sock):nogc() elseif ffi.istype(S.t.fd, sock) then -- luacheck: ignore -- No cast required else return nil, 'socket must either be an fd number, an interface name, or an ljsyscall socket' end -- Load program and attach it to socket if type(prog) ~= 'table' then prog = compile(prog, {proto.skb}) end local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.SOCKET_FILTER, prog.insn, prog.pc) assert(prog_fd, tostring(err)..': '..tostring(log)) assert(sock:setsockopt('socket', 'attach_bpf', prog_fd:getfd())) return prog_fd, err end, tracepoint = function(tp, prog, pid, cpu, group_fd) assert(trace_check_enabled()) -- Return tracepoint instance if no program specified -- this allows free specialisation of arg0 to tracepoint type local probe = tracepoint_open(tp, pid, cpu, group_fd) -- Load the BPF program if prog then probe:bpf(prog) end return probe end, kprobe = function(tp, prog, retprobe, pid, cpu, group_fd) assert(trace_check_enabled()) -- Open tracepoint and attach local pname, pdef = tp:match('([^:]+):(.+)') return trace_bpf('kprobe', pname, pdef, retprobe, prog, pid, cpu, group_fd) end, uprobe = function(tp, prog, retprobe, pid, cpu, group_fd) assert(trace_check_enabled()) -- Translate symbol to address local obj, sym_want = tp:match('([^:]+):(.+)') if not S.statfs(obj) then return nil, S.t.error(S.c.E.NOENT) end -- Resolve Elf object (no support for anything else) local elf = require('bpf.elf').open(obj) local sym = elf:resolve(sym_want) if not sym then return nil, 'no such symbol' end sym = sym.st_value - elf:loadaddr() local sym_addr = string.format('%x%04x', tonumber(bit.rshift(sym, 32)), tonumber(ffi.cast('uint32_t', sym))) -- Convert it to expected uprobe format local pname = string.format('%s_%s', obj:gsub('.*/', ''), sym_addr) local pdef = obj..':0x'..sym_addr return trace_bpf('uprobe', pname, pdef, retprobe, prog, pid, cpu, group_fd) end, tracelog = function(path) assert(trace_check_enabled()) path = path or '/sys/kernel/debug/tracing/trace_pipe' return io.open(path, 'r') end, ntoh = builtins.ntoh, hton = builtins.hton, }, { __call = function (_, prog) return compile(prog) end, })