--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
local ffi = require('ffi')
local BPF = ffi.typeof('struct bpf')

-- C declarations of the context structure and of the protocol headers
-- that the dissectors below navigate.
ffi.cdef [[
struct sk_buff {
  uint32_t len;
  uint32_t pkt_type;
  uint32_t mark;
  uint32_t queue_mapping;
  uint32_t protocol;
  uint32_t vlan_present;
  uint32_t vlan_tci;
  uint32_t vlan_proto;
  uint32_t priority;
  uint32_t ingress_ifindex;
  uint32_t ifindex;
  uint32_t tc_index;
  uint32_t cb[5];
  uint32_t hash;
  uint32_t tc_classid;
  uint32_t data;
  uint32_t data_end;
  uint32_t napi_id;

  /* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */
  uint32_t family;
  uint32_t remote_ip4;    /* Stored in network byte order */
  uint32_t local_ip4;     /* Stored in network byte order */
  uint32_t remote_ip6[4]; /* Stored in network byte order */
  uint32_t local_ip6[4];  /* Stored in network byte order */
  uint32_t remote_port;   /* Stored in network byte order */
  uint32_t local_port;    /* stored in host byte order */
  /* ... here. */

  uint32_t data_meta;
};

struct net_off_t {
  uint8_t ver:4;
} __attribute__((packed));

struct eth_t {
  uint8_t dst[6];
  uint8_t src[6];
  uint16_t type;
} __attribute__((packed));

struct dot1q_t {
  uint16_t pri:3;
  uint16_t cfi:1;
  uint16_t vlanid:12;
  uint16_t type;
} __attribute__((packed));

struct arp_t {
  uint16_t htype;
  uint16_t ptype;
  uint8_t hlen;
  uint8_t plen;
  uint16_t oper;
  uint8_t sha[6];
  uint32_t spa;
  uint8_t tha[6];
  uint32_t tpa;
} __attribute__((packed));

struct ip_t {
  uint8_t ver:4;
  uint8_t hlen:4;
  uint8_t tos;
  uint16_t tlen;
  uint16_t identification;
  uint16_t ffo_unused:1;
  uint16_t df:1;
  uint16_t mf:1;
  uint16_t foffset:13;
  uint8_t ttl;
  uint8_t proto;
  uint16_t hchecksum;
  uint32_t src;
  uint32_t dst;
} __attribute__((packed));

struct icmp_t {
  uint8_t type;
  uint8_t code;
  uint16_t checksum;
} __attribute__((packed));

struct ip6_t {
  uint32_t ver:4;
  uint32_t priority:8;
  uint32_t flow_label:20;
  uint16_t payload_len;
  uint8_t next_header;
  uint8_t hop_limit;
  uint64_t src_hi;
  uint64_t src_lo;
  uint64_t dst_hi;
  uint64_t dst_lo;
} __attribute__((packed));

struct ip6_opt_t {
  uint8_t next_header;
  uint8_t ext_len;
  uint8_t pad[6];
} __attribute__((packed));

struct icmp6_t {
  uint8_t type;
  uint8_t code;
  uint16_t checksum;
} __attribute__((packed));

struct udp_t {
  uint16_t src_port;
  uint16_t dst_port;
  uint16_t length;
  uint16_t crc;
} __attribute__((packed));

struct tcp_t {
  uint16_t src_port;
  uint16_t dst_port;
  uint32_t seq_num;
  uint32_t ack_num;
  uint8_t offset:4;
  uint8_t reserved:4;
  uint8_t flag_cwr:1;
  uint8_t flag_ece:1;
  uint8_t flag_urg:1;
  uint8_t flag_ack:1;
  uint8_t flag_psh:1;
  uint8_t flag_rst:1;
  uint8_t flag_syn:1;
  uint8_t flag_fin:1;
  uint16_t rcv_wnd;
  uint16_t cksum;
  uint16_t urg_ptr;
} __attribute__((packed));

struct vxlan_t {
  uint32_t rsv1:4;
  uint32_t iflag:1;
  uint32_t rsv2:3;
  uint32_t rsv3:24;
  uint32_t key:24;
  uint32_t rsv4:8;
} __attribute__((packed));
]]

-- Architecture-specific ptrace register layout
local S = require('syscall')
local arch = S.abi.arch
-- Symbolic name -> register field name; stays empty on architectures
-- other than x64, so every symbolic lookup fails there.
local parm_to_reg = {}
if arch == 'x64' then
  ffi.cdef [[
  struct pt_regs {
    unsigned long r15;
    unsigned long r14;
    unsigned long r13;
    unsigned long r12;
    unsigned long bp;
    unsigned long bx;
    unsigned long r11;
    unsigned long r10;
    unsigned long r9;
    unsigned long r8;
    unsigned long ax;
    unsigned long cx;
    unsigned long dx;
    unsigned long si;
    unsigned long di;
    unsigned long orig_ax;
    unsigned long ip;
    unsigned long cs;
    unsigned long flags;
    unsigned long sp;
    unsigned long ss;
  };]]
  parm_to_reg = {parm1='di', parm2='si', parm3='dx', parm4='cx', parm5='r8', ret='sp', fp='bp'}
else
  ffi.cdef 'struct pt_regs {};'
end
-- Map symbolic registers to architecture ABI
-- The lookup yields the register *field name* (a string) and raises
-- 'no such register: <k>' for names missing from `parm_to_reg`.
ffi.metatype('struct pt_regs', {
  __index = function (_ --[[t]],k)
    return assert(parm_to_reg[k], 'no such register: '..k)
  end,
})

local M = {}

-- Dissector interface
-- Derive variable `dst` from `src` and advance it to the next protocol layer.
--   type  : ctype that becomes the dissector of `dst`
--   e     : code generation environment (uses e.V and e.vcopy)
--   dst   : destination variable
--   src   : source (parent) variable
--   field : name of the transition looked up on the parent's dissector
local function dissector(type, e, dst, src, field)
  local parent = e.V[src].const
  -- Create new dissector variable
  e.vcopy(dst, src)
  -- Compute and materialize new dissector offset from parent
  e.V[dst].const = {off=e.V[src].const.off, __dissector=e.V[src].const.__dissector}
  -- The transition adjusts the offset of `dst` (constant or in a register)
  parent.__dissector[field](e, dst)
  e.V[dst].const.__dissector = type
end
M.dissector = dissector

-- Get current effective offset, load field value at an offset relative to it and
-- add its value to compute next effective offset (e.g. udp_off = ip_off + pkt[ip_off].hlen)
--   e     : code generation environment
--   var   : variable whose offset is advanced
--   type  : ctype of the loaded field; its size selects the load width
--   off   : offset of the field relative to the current offset (default 0)
--   mask  : optional bit mask applied to the loaded value
--   shift : optional shift count; positive shifts left, negative shifts right
local function next_offset(e, var, type, off, mask, shift)
  local d = e.V[var].const
  -- Materialize relative offset value in R0
  local dst_reg, tmp_reg
  if d.off then
    dst_reg = e.vreg(var, 0, true)
    tmp_reg = dst_reg -- Use target register to avoid copy
    -- `+` binds tighter than `or`, so the default has to be applied to `off`
    -- before the addition (previously written as `d.off + off or 0`)
    e.emit(BPF.LD + BPF.ABS + e.const_width[ffi.sizeof(type)], tmp_reg, 0, 0, d.off + (off or 0))
  else
    tmp_reg = e.vreg(e.tmpvar, 0, true, type) -- Reserve R0 for temporary relative offset
    dst_reg = e.vreg(var) -- Must rematerialize (if it was spilled by tmp var)
    e.emit(BPF.LD + BPF.IND + e.const_width[ffi.sizeof(type)], tmp_reg, dst_reg, 0, off or 0)
  end
  -- Finalize relative offset
  if mask then
    e.emit(BPF.ALU + BPF.AND + BPF.K, tmp_reg, 0, 0, mask)
  end
  if shift and shift ~= 0 then
    local op = BPF.LSH
    if shift < 0 then
      op = BPF.RSH
      shift = -shift
    end
    e.emit(BPF.ALU + op + BPF.K, tmp_reg, 0, 0, shift)
  end
  -- Add to base offset to turn it into effective address
  if dst_reg ~= tmp_reg then
    e.emit(BPF.ALU + BPF.ADD + BPF.X, dst_reg, tmp_reg, 0, 0)
  else
    e.emit(BPF.ALU + BPF.ADD + BPF.K, dst_reg, 0, 0, d.off)
  end
  -- Discard temporary allocations
  -- From here on the offset of `var` lives in a register, not in a constant
  d.off = nil
  e.V[e.tmpvar].reg = nil
end

-- Advance the offset of `var` by the constant `off`: the constant offset is
-- adjusted when it is still known, otherwise an add on the register is emitted.
local function next_skip(e, var, off)
  local d = e.V[var].const
  if not d.off then
    local dst_reg = e.vreg(var)
    e.emit(BPF.ALU64 + BPF.ADD + BPF.K, dst_reg, 0, 0, off)
  else
    d.off = d.off + off
  end
end

-- Advance the constant offset of `dst` past the Ethernet header.
local function skip_eth(e, dst)
  -- IP starts right after ETH header (fixed size)
  local d = e.V[dst].const
  d.off = d.off + ffi.sizeof('struct eth_t')
end

-- Export types
-- Build a type descriptor: `t` (or a fresh table) tagged with the ctype of
-- `typestr` in its `__dissector` field.
M.type = function(typestr, t)
  t = t or {}
  t.__dissector=ffi.typeof(typestr)
  return t
end
M.skb     = M.type('struct sk_buff', {source='ptr_to_ctx'})
M.pt_regs = M.type('struct pt_regs', {source='ptr_to_probe'})
M.pkt     = M.type('struct eth_t',   {off=0, source='ptr_to_pkt'}) -- skb needs special accessors
-- M.eth   = function (...) return dissector(ffi.typeof('struct eth_t'), ...) end
-- Dissector constructors: each forwards (e, dst, src, field) to `dissector`
-- together with the ctype of the layer being entered.
M.dot1q   = function (...) return dissector(ffi.typeof('struct dot1q_t'), ...) end
M.arp     = function (...) return dissector(ffi.typeof('struct arp_t'), ...) end
M.icmp    = function (...) return dissector(ffi.typeof('struct icmp_t'), ...) end
M.ip      = function (...) return dissector(ffi.typeof('struct ip_t'), ...) end
M.icmp6   = function (...) return dissector(ffi.typeof('struct icmp6_t'), ...) end
M.ip6     = function (...) return dissector(ffi.typeof('struct ip6_t'), ...) end
M.ip6_opt = function (...) return dissector(ffi.typeof('struct ip6_opt_t'), ...) end
M.udp     = function (...) return dissector(ffi.typeof('struct udp_t'), ...) end
M.tcp     = function (...) return dissector(ffi.typeof('struct tcp_t'), ...) end
M.vxlan   = function (...) return dissector(ffi.typeof('struct vxlan_t'), ...) end
M.data    = function (...) return dissector(ffi.typeof('uint8_t'), ...) end
M.net_off = function (...) return dissector(ffi.typeof('struct net_off_t'), ...) end

-- Metatables
-- Each __index table lists the transitions available from a layer; the
-- entries are invoked by `dissector` as parent.__dissector[field](e, dst).
ffi.metatype(ffi.typeof('struct eth_t'), {
  __index = {
    ip = skip_eth,
    ip6 = skip_eth,
    net_off = function (e, dst)
      next_skip(e, dst, BPF.NET_OFF)
    end,
  }
})

ffi.metatype(ffi.typeof('struct net_off_t'), {
  __index = {
    -- No offset adjustment is made for these transitions
    ip = function () end,
    ip6 = function () end,
  }
})

ffi.metatype(ffi.typeof('struct ip_t'), {
  __index = {
    -- Skip IP header length (stored as number of words)
    -- e.g. hlen = 5, Header Length = 5 x sizeof(u32) = 20 octets
    -- Mask first nibble and shift by 2 (multiplication by 4)
    icmp = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), 0, 0x0f, 2) end,
    udp  = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), 0, 0x0f, 2) end,
    tcp  = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), 0, 0x0f, 2) end,
  }
})

ffi.metatype(ffi.typeof('struct ip6_t'), {
  __index = {
    -- Skip fixed IPv6 header length (40 bytes)
    -- The caller must check the value of `next_header` to skip any extension headers
    icmp6   = function(e, dst) next_skip(e, dst, ffi.sizeof('struct ip6_t')) end,
    udp     = function(e, dst) next_skip(e, dst, ffi.sizeof('struct ip6_t')) end,
    tcp     = function(e, dst) next_skip(e, dst, ffi.sizeof('struct ip6_t')) end,
    ip6_opt = function(e, dst) next_skip(e, dst, ffi.sizeof('struct ip6_t')) end,
  }
})

local ip6_opt_ext_len_off = ffi.offsetof('struct ip6_opt_t', 'ext_len')
ffi.metatype(ffi.typeof('struct ip6_opt_t'), {
  __index = {
    -- Skip IPv6 extension header length (field `ext_len`)
    -- NOTE(review): the raw `ext_len` byte is added without mask or shift.
    -- RFC 8200 expresses Hdr Ext Len in 8-octet units, not counting the
    -- first 8 octets - confirm whether scaling is intended here.
    icmp6   = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), ip6_opt_ext_len_off) end,
    udp     = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), ip6_opt_ext_len_off) end,
    tcp     = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), ip6_opt_ext_len_off) end,
    ip6_opt = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), ip6_opt_ext_len_off) end,
  }
})

ffi.metatype(ffi.typeof('struct tcp_t'), {
  __index = {
    -- Skip TCP header length (stored as number of words)
    -- e.g. hlen = 5, Header Length = 5 x sizeof(u32) = 20 octets
    -- Mask the high nibble and shift right by 2 (i.e. (x >> 4) * 4)
    data = function(e, dst)
      next_offset(e, dst, ffi.typeof('uint8_t'), ffi.offsetof('struct tcp_t', 'offset'), 0xf0, -2)
    end,
  }
})

ffi.metatype(ffi.typeof('struct udp_t'), {
  __index = {
    -- Skip UDP header length (8 octets)
    data = function(e, dst)
      next_skip(e, dst, ffi.sizeof('struct udp_t'))
    end,
  }
})

-- Constants
M.c = {
  eth = { -- Constants http://standards.ieee.org/regauth/ethertype
    ip     = 0x0800, -- IP (v4) protocol
    ip6    = 0x86dd, -- IP (v6) protocol
    arp    = 0x0806, -- Address resolution protocol
    revarp = 0x8035, -- Reverse addr resolution protocol
    vlan   = 0x8100, -- IEEE 802.1Q VLAN tagging
  },
  ip = {
    -- Reserved Addresses
    addr_any         = 0x00000000, -- 0.0.0.0
    addr_broadcast   = 0xffffffff, -- 255.255.255.255
    addr_loopback    = 0x7f000001, -- 127.0.0.1
    addr_mcast_all   = 0xe0000001, -- 224.0.0.1
    addr_mcast_local = 0xe00000ff, -- 224.0.0.255
    -- Type of service (ip_tos), RFC 1349 ("obsoleted by RFC 2474")
    tos_default      = 0x00, -- default
    tos_lowdelay     = 0x10, -- low delay
    tos_throughput   = 0x08, -- high throughput
    tos_reliability  = 0x04, -- high reliability
    tos_lowcost      = 0x02, -- low monetary cost - XXX
    tos_ect          = 0x02, -- ECN-capable transport
    tos_ce           = 0x01, -- congestion experienced
    -- Fragmentation flags (ip_off)
    rf = 0x8000, -- reserved
    df = 0x4000, -- don't fragment
    mf = 0x2000, -- more fragments (not last frag)
    offmask  = 0x1fff, -- mask for fragment offset
    -- Time-to-live (ip_ttl), seconds
    ttl_default = 64,  -- default ttl, RFC 1122, RFC 1340
    ttl_max     = 255, -- maximum ttl
    -- Protocol (ip_p) - http://www.iana.org/assignments/protocol-numbers
    proto_ip      = 0,  -- dummy for IP
    proto_hopopts = 0,  -- IPv6 hop-by-hop options
    proto_icmp    = 1,  -- ICMP
    proto_igmp    = 2,  -- IGMP
    proto_ggp     = 3,  -- gateway-gateway protocol
    proto_ipip    = 4,  -- IP in IP
    proto_st      = 5,  -- ST datagram mode
    proto_tcp     = 6,  -- TCP
    proto_cbt     = 7,  -- CBT
    proto_egp     = 8,  -- exterior gateway protocol
    proto_igp     = 9,  -- interior gateway protocol
    proto_bbnrcc  = 10,  -- BBN RCC monitoring
    proto_nvp     = 11,  -- Network Voice Protocol
    proto_pup     = 12,  -- PARC universal packet
    proto_argus   = 13,  -- ARGUS
    proto_emcon   = 14,  -- EMCON
    proto_xnet    = 15,  -- Cross Net Debugger
    proto_chaos   = 16,  -- Chaos
    proto_udp     = 17,  -- UDP
    proto_mux     = 18,  -- multiplexing
    proto_dcnmeas = 19,  -- DCN measurement
    proto_hmp     = 20,  -- Host Monitoring Protocol
    proto_prm     = 21,  -- Packet Radio Measurement
    proto_idp     = 22,  -- Xerox NS IDP
    proto_trunk1  = 23,  -- Trunk-1
    proto_trunk2  = 24,  -- Trunk-2
    proto_leaf1   = 25,  -- Leaf-1
    proto_leaf2   = 26,  -- Leaf-2
    proto_rdp     = 27,  -- "Reliable Datagram" proto
    proto_irtp    = 28,  -- Inet Reliable Transaction
    proto_tp      = 29,  -- ISO TP class 4
    proto_netblt  = 30,  -- Bulk Data Transfer
    proto_mfpnsp  = 31,  -- MFE Network Services
    proto_meritinp= 32,  -- Merit Internodal Protocol
    proto_sep     = 33,  -- Sequential Exchange proto
    proto_3pc     = 34,  -- Third Party Connect proto
    proto_idpr    = 35,  -- Interdomain Policy Route
    proto_xtp     = 36,  -- Xpress Transfer Protocol
    proto_ddp     = 37,  -- Datagram Delivery Proto
    proto_cmtp    = 38,  -- IDPR Ctrl Message Trans
    proto_tppp    = 39,  -- TP++ Transport Protocol
    proto_il      = 40,  -- IL Transport Protocol
    proto_ip6     = 41,  -- IPv6
    proto_sdrp    = 42,  -- Source Demand Routing
    proto_routing = 43,  -- IPv6 routing header
    proto_fragment= 44,  -- IPv6 fragmentation header
    proto_rsvp    = 46,  -- Reservation protocol
    proto_gre     = 47,  -- General Routing Encap
    proto_mhrp    = 48,  -- Mobile Host Routing
    proto_ena     = 49,  -- ENA
    proto_esp     = 50,  -- Encap Security Payload
    proto_ah      = 51,  -- Authentication Header
    proto_inlsp   = 52,  -- Integrated Net Layer Sec
    proto_swipe   = 53,  -- SWIPE
    proto_narp    = 54,  -- NBMA Address Resolution
    proto_mobile  = 55,  -- Mobile IP, RFC 2004
    proto_tlsp    = 56,  -- Transport Layer Security
    proto_skip    = 57,  -- SKIP
    proto_icmp6   = 58,  -- ICMP for IPv6
    proto_none    = 59,  -- IPv6 no next header
    proto_dstopts = 60,  -- IPv6 destination options
    proto_anyhost = 61,  -- any host internal proto
    proto_cftp    = 62,  -- CFTP
    proto_anynet  = 63,  -- any local network
    proto_expak   = 64,  -- SATNET and Backroom EXPAK
    proto_kryptolan = 65,  -- Kryptolan
    proto_rvd     = 66,  -- MIT Remote Virtual Disk
    proto_ippc    = 67,  -- Inet Pluribus Packet Core
    proto_distfs  = 68,  -- any distributed fs
    proto_satmon  = 69,  -- SATNET Monitoring
    proto_visa    = 70,  -- VISA Protocol
    proto_ipcv    = 71,  -- Inet Packet Core Utility
    proto_cpnx    = 72,  -- Comp Proto Net Executive
    proto_cphb    = 73,  -- Comp Protocol Heart Beat
    proto_wsn     = 74,  -- Wang Span Network
    proto_pvp     = 75,  -- Packet Video Protocol
    proto_brsatmon= 76,  -- Backroom SATNET Monitor
    proto_sunnd   = 77,  -- SUN ND Protocol
    proto_wbmon   = 78,  -- WIDEBAND Monitoring
    proto_wbexpak = 79,  -- WIDEBAND EXPAK
    proto_eon     = 80,  -- ISO CNLP
    proto_vmtp    = 81,  -- Versatile Msg Transport
    proto_svmtp   = 82,  -- Secure VMTP
    proto_vines   = 83,  -- VINES
    proto_ttp     = 84,  -- TTP
    proto_nsfigp  = 85,  -- NSFNET-IGP
    proto_dgp     = 86,  -- Dissimilar Gateway Proto
    proto_tcf     = 87,  -- TCF
    proto_eigrp   = 88,  -- EIGRP
    proto_ospf    = 89,  -- Open Shortest Path First
    proto_spriterpc= 90,  -- Sprite RPC Protocol
    proto_larp    = 91,  -- Locus Address Resolution
    proto_mtp     = 92,  -- Multicast Transport Proto
    proto_ax25    = 93,  -- AX.25 Frames
    proto_ipipencap= 94,  -- yet-another IP encap
    proto_micp    = 95,  -- Mobile Internet Ctrl
    proto_sccsp   = 96,  -- Semaphore Comm Sec Proto
    proto_etherip = 97,  -- Ethernet in IPv4
    proto_encap   = 98,  -- encapsulation header
    proto_anyenc  = 99,  -- private encryption scheme
    proto_gmtp    = 100,  -- GMTP
    proto_ifmp    = 101,  -- Ipsilon Flow Mgmt Proto
    proto_pnni    = 102,  -- PNNI over IP
    proto_pim     = 103,  -- Protocol Indep Multicast
    proto_aris    = 104,  -- ARIS
    proto_scps    = 105,  -- SCPS
    proto_qnx     = 106,  -- QNX
    proto_an      = 107,  -- Active Networks
    proto_ipcomp  = 108,  -- IP Payload Compression
    proto_snp     = 109,  -- Sitara Networks Protocol
    proto_compaqpeer= 110,  -- Compaq Peer Protocol
    proto_ipxip   = 111,  -- IPX in IP
    proto_vrrp    = 112,  -- Virtual Router Redundancy
    proto_pgm     = 113,  -- PGM Reliable Transport
    proto_any0hop = 114,  -- 0-hop protocol
    proto_l2tp    = 115,  -- Layer 2 Tunneling Proto
    proto_ddx     = 116,  -- D-II Data Exchange (DDX)
    proto_iatp    = 117,  -- Interactive Agent Xfer
    proto_stp     = 118,  -- Schedule Transfer Proto
    proto_srp     = 119,  -- SpectraLink Radio Proto
    proto_uti     = 120,  -- UTI
    proto_smp     = 121,  -- Simple Message Protocol
    proto_sm      = 122,  -- SM
    proto_ptp     = 123,  -- Performance Transparency
    proto_isis    = 124,  -- ISIS over IPv4
    proto_fire    = 125,  -- FIRE
    proto_crtp    = 126,  -- Combat Radio Transport
    proto_crudp   = 127,  -- Combat Radio UDP
    proto_sscopmce= 128,  -- SSCOPMCE
    proto_iplt    = 129,  -- IPLT
    proto_sps     = 130,  -- Secure Packet Shield
    proto_pipe    = 131,  -- Private IP Encap in IP
    proto_sctp    = 132,  -- Stream Ctrl Transmission
    proto_fc      = 133,  -- Fibre Channel
    proto_rsvpign = 134,  -- RSVP-E2E-IGNORE
    proto_raw     = 255,  -- Raw IP packets
    proto_reserved= 255,  -- Reserved
  },
}

return M