From: Damien Miller Subject: bpf filtering on arbitrary sockets To: tech@openbsd.org Date: Thu, 30 Oct 2025 16:03:00 +1100 Hi, This is an idea that came up while talking with dlg@ about network daemons. Quite a few programs and daemons use SOCK_RAW to send link-level packets after pledge(). E.g. usr.sbin/relayd/check_icmp.c wants to send ICMP packets. The problem with this is that, if they get compromised, they still hold a very powerful socket that can send pretty much arbitrary packets. If one of these programs gets compromised then the attacker can pretty easily pivot through the existing raw socket. What if we allowed attaching bpf instances to sockets? On the receive side, these could be used to limit the types of messages received. Similarly, on the send side, they could restrict the ability of the socket to send arbitrary messages. E.g. for something like ping(1), a bpf program like: { 0x28, 0, 0, 0x0000000c }, { 0x15, 0, 7, 0x00000800 }, { 0x30, 0, 0, 0x00000017 }, { 0x15, 0, 5, 0x00000001 }, { 0x20, 0, 0, 0x0000001a }, { 0x15, 2, 0, src_addr }, { 0x20, 0, 0, 0x0000001e }, { 0x15, 0, 1, dst_addr }, { 0x6, 0, 0, 0x00000074 }, { 0x6, 0, 0, 0x00000000 }, could be used on the send side to drastically limit the power of the raw socket. These programs are trivial to write using `tcpdump -dd`; with a small tweak to tcpdump it might be possible to generate them from a Makefile rule. Practically, this would mean a few new setsockopt()s: 1. Attach a bpf program to a socket that filters received packets 2. Attach a bpf program to a socket that filters sent packets 3. An equivalent to BIOCLOCK to lock the bpf filters for a socket Programs would set up their SOCK_RAW sockets as usual, then attach the filters, then lock the filters, then pledge. Here's what this looks like for the output side. It's pretty simple (too simple?) - there's a SOL_SOCKET SO_OUT_BPF setsockopt that loads a bpf_program to a socket fd. Packets that fail to match the filter are denied with EPERM. 
Note that the filter is applied to just the data that is passed to send(), sendto(), etc., and not any protocol-level headers. AFAIK it can work with any datagram socket type in addition to SOCK_RAW, so could for example be used to limit the structure of UDP messages, etc., in addition to low-level SOCK_RAW control. There's also a SO_BPF_LOCK setsockopt that locks the program as you'd expect. What do you think? What would be a good daemon to try this against? I was thinking dhcpleased but that seems to use bpf for most of its sending, though that should IMO be converted to AF_FRAME now that we have it. -d Index: kern/uipc_socket.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket.c,v diff -u -p -r1.378 uipc_socket.c --- kern/uipc_socket.c 23 May 2025 23:41:46 -0000 1.378 +++ kern/uipc_socket.c 30 May 2025 06:54:45 -0000 @@ -51,6 +51,7 @@ #include #include #include +#include #ifdef DDB #include @@ -285,6 +286,7 @@ sorele(struct socket *so) so->so_proto->pr_domain->dom_dispose) (*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb); m_purge(so->so_rcv.sb_mb); + free(so->so_obpf, M_SO_BPF, so->so_obpf_len); #ifdef SOCKET_SPLICE if (so->so_sp) @@ -695,10 +697,17 @@ restart: top->m_flags |= M_ZEROIZE; mtx_leave(&so->so_snd.sb_mtx); solock_shared(so); + if (so->so_obpf != NULL && + bpf_mfilter((struct bpf_insn *)so->so_obpf, + top, top->m_pkthdr.len) <= 0) { + error = EPERM; + goto unsent; + } if (flags & MSG_OOB) error = pru_sendoob(so, top, addr, control); else error = pru_send(so, top, addr, control); +unsent: sounlock_shared(so); mtx_enter(&so->so_snd.sb_mtx); clen = 0; @@ -1369,6 +1378,10 @@ sosplice(struct socket *so, int fd, off_ } solock_pair(so, sosp); + if (sosp->so_obpf != NULL) { + error = EOPNOTSUPP; + goto release; + } if ((so->so_options & SO_ACCEPTCONN) || (sosp->so_options & SO_ACCEPTCONN)) { error = EOPNOTSUPP; @@ -1922,6 +1935,7 @@ int sosetopt(struct socket *so, int level, int optname, struct 
mbuf *m) { int error = 0; + u_char *p; if (level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput) { @@ -1950,6 +1964,11 @@ sosetopt(struct socket *so, int level, i sounlock(so); break; + case SO_BPF_LOCK: + if ((so->so_options & SO_BPF_LOCK) != 0) + return (EPERM); + /* FALLTHROUGH */ + case SO_BINDANY: if ((error = suser(curproc)) != 0) /* XXX */ return (error); @@ -2082,6 +2101,30 @@ sosetopt(struct socket *so, int level, i break; #endif /* SOCKET_SPLICE */ + case SO_OUT_BPF: + if ((so->so_options & SO_BPF_LOCK) != 0) + return (EPERM); + if ((error = suser(curproc)) != 0) /* XXX */ + return (error); + if (m == NULL || m->m_len == 0 || + (m->m_len % sizeof(struct bpf_insn)) != 0) + return (EINVAL); + if (!bpf_validate(mtod(m, struct bpf_insn *), + m->m_len / sizeof(struct bpf_insn))) + return (EINVAL); + if ((p = malloc(m->m_len, M_SO_BPF, + M_WAITOK|M_ZERO|M_CANFAIL)) == NULL) + return ENOMEM; + memcpy(p, mtod(m, void *), m->m_len); + + solock(so); + free(so->so_obpf, M_SO_BPF, so->so_obpf_len); + so->so_obpf = p; + so->so_obpf_len = m->m_len; + sounlock(so); + + break; + default: error = ENOPROTOOPT; break; @@ -2121,6 +2164,7 @@ sogetopt(struct socket *so, int level, i sounlock_shared(so); break; + case SO_BPF_LOCK: case SO_BINDANY: case SO_USELOOPBACK: case SO_DEBUG: Index: sys/malloc.h =================================================================== RCS file: /cvs/src/sys/sys/malloc.h,v diff -u -p -r1.127 malloc.h --- sys/malloc.h 5 Feb 2025 18:29:17 -0000 1.127 +++ sys/malloc.h 30 May 2025 06:54:46 -0000 @@ -65,7 +65,7 @@ #define M_FREE 0 /* should be on free list */ /* 1 - free */ #define M_DEVBUF 2 /* device driver memory */ -/* 3 - free */ +#define M_SO_BPF 3 /* socket bpf program */ #define M_PCB 4 /* protocol control blocks */ #define M_RTABLE 5 /* routing tables */ #define M_PF 6 /* packet filter structures */ Index: sys/socket.h =================================================================== RCS file: /cvs/src/sys/sys/socket.h,v diff -u -p 
-r1.107 socket.h --- sys/socket.h 19 Apr 2025 04:12:36 -0000 1.107 +++ sys/socket.h 30 May 2025 06:54:46 -0000 @@ -97,6 +97,7 @@ typedef __sa_family_t sa_family_t; /* so #define SO_TIMESTAMP 0x0800 /* timestamp received dgram traffic */ #define SO_BINDANY 0x1000 /* allow bind to any address */ #define SO_ZEROIZE 0x2000 /* zero out all mbufs sent over socket */ +#define SO_BPF_LOCK 0x4000 /* Deny changes to output bpf program */ /* * Additional options, not kept in so_options. @@ -114,6 +115,7 @@ typedef __sa_family_t sa_family_t; /* so #define SO_SPLICE 0x1023 /* splice data to other socket */ #define SO_DOMAIN 0x1024 /* get socket domain */ #define SO_PROTOCOL 0x1025 /* get socket protocol */ +#define SO_OUT_BPF 0x1026 /* output bpf program */ /* * Structure used for manipulating linger option. Index: sys/socketvar.h =================================================================== RCS file: /cvs/src/sys/sys/socketvar.h,v diff -u -p -r1.158 socketvar.h --- sys/socketvar.h 8 Apr 2025 15:31:22 -0000 1.158 +++ sys/socketvar.h 30 May 2025 06:54:46 -0000 @@ -159,6 +159,9 @@ struct socket { struct sockbuf so_rcv; struct sockbuf so_snd; + u_char *so_obpf; /* [s] output bpf program */ + u_int so_obpf_len; /* [s] length of output bpf program */ + void (*so_upcall)(struct socket *, caddr_t, int); /* [s] */ caddr_t so_upcallarg; /* [s] Arg for above */ uid_t so_euid; /* [I] who opened the socket */ @@ -191,6 +194,7 @@ struct socket { #define SS_ISDISCONNECTED 0x800 /* socket disconnected from peer */ #define SS_PRIV 0x080 /* privileged for broadcast, raw... */ +#define SS_BPF_LOCK 0x100 /* so_obpf protected against change */ #define SS_CONNECTOUT 0x1000 /* connect, not accept, at this end */ #define SS_ISSENDING 0x2000 /* hint for lower layer */ #define SS_DNS 0x4000 /* created using SOCK_DNS socket(2) */