From: Damien Miller <djm@mindrot.org>
Subject: bpf filtering on arbitrary sockets
To: tech@openbsd.org
Date: Thu, 30 Oct 2025 16:03:00 +1100

Hi,

This is an idea that came up while talking with dlg@ about network
daemons.

Quite a few programs and daemons use SOCK_RAW to send link-level packets
after pledge(). E.g. usr.sbin/relayd/check_icmp.c wants to send ICMP
packets.

The problem with this is that, if they get compromised, they still hold
a very powerful socket that can send pretty much arbitrary packets. If
one of these programs gets compromised then the attacker can pretty
easily pivot through the existing raw socket.

What if we allowed attaching bpf programs to sockets? On the
receive side, these could be used to limit the types of messages
received. Similarly, on the send side, they could restrict the ability
of the socket to send arbitrary messages.

E.g. for something like ping(1), a bpf program like:

    { 0x28, 0, 0, 0x0000000c },
    { 0x15, 0, 7, 0x00000800 },
    { 0x30, 0, 0, 0x00000017 },
    { 0x15, 0, 5, 0x00000001 },
    { 0x20, 0, 0, 0x0000001a },
    { 0x15, 2, 0, src_addr },
    { 0x20, 0, 0, 0x0000001e },
    { 0x15, 0, 1, dst_addr },
    { 0x6, 0, 0, 0x00000074 },
    { 0x6, 0, 0, 0x00000000 },

could be used on the send side to drastically limit the power of the raw
socket. These programs are trivial to write using `tcpdump -dd`; with a
small tweak to tcpdump it might be possible to generate them from a
Makefile rule.

Practically, this would mean a few new setsockopt()s:

1. Attach a bpf program to a socket that filters received packets
2. Attach a bpf program to a socket that filters sent packets
3. An equivalent to BIOCLOCK to lock the bpf filters for a socket

Programs would set up their SOCK_RAW sockets as usual, then attach
the filters, then lock the filters, then pledge.

Here's what this looks like for the output side.

It's pretty simple (too simple?) - there's a SOL_SOCKET SO_OUT_BPF
setsockopt that loads a bpf program onto a socket fd. Packets that fail
to match the filter are denied with EPERM.

Note that the filter is applied to just the data that is passed to
send(), sendto(), etc., and not any protocol-level headers. AFAIK it
can work with any datagram socket type in addition to SOCK_RAW, so
could for example be used to limit the structure of UDP messages,
etc., in addition to low-level SOCK_RAW control.

There's also a SO_BPF_LOCK setsockopt that locks the program as you'd
expect.

What do you think? What would be a good daemon to try this against?
I was thinking dhcpleased but that seems to use bpf for most of its
sending, though that should IMO be converted to AF_FRAME now that we
have it.

-d

Index: kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
diff -u -p -r1.378 uipc_socket.c
--- kern/uipc_socket.c	23 May 2025 23:41:46 -0000	1.378
+++ kern/uipc_socket.c	30 May 2025 06:54:45 -0000
@@ -51,6 +51,7 @@
 #include <sys/rwlock.h>
 #include <sys/time.h>
 #include <sys/refcnt.h>
+#include <net/bpf.h>
 
 #ifdef DDB
 #include <machine/db_machdep.h>
@@ -285,6 +286,7 @@ sorele(struct socket *so)
 	    so->so_proto->pr_domain->dom_dispose)
 		(*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
 	m_purge(so->so_rcv.sb_mb);
+	free(so->so_obpf, M_SO_BPF, so->so_obpf_len);
 
 #ifdef SOCKET_SPLICE
 	if (so->so_sp)
@@ -695,10 +697,17 @@ restart:
 				top->m_flags |= M_ZEROIZE;
 			mtx_leave(&so->so_snd.sb_mtx);
 			solock_shared(so);
+			if (so->so_obpf != NULL &&
+			    bpf_mfilter((struct bpf_insn *)so->so_obpf,
+			    top, top->m_pkthdr.len) <= 0) {
+				error = EPERM;
+				goto unsent;
+			}
 			if (flags & MSG_OOB)
 				error = pru_sendoob(so, top, addr, control);
 			else
 				error = pru_send(so, top, addr, control);
+unsent:
 			sounlock_shared(so);
 			mtx_enter(&so->so_snd.sb_mtx);
 			clen = 0;
@@ -1369,6 +1378,10 @@ sosplice(struct socket *so, int fd, off_
 	}
 	solock_pair(so, sosp);
 
+	if (sosp->so_obpf != NULL) {
+		error = EOPNOTSUPP;
+		goto release;
+	}
 	if ((so->so_options & SO_ACCEPTCONN) ||
 	    (sosp->so_options & SO_ACCEPTCONN)) {
 		error = EOPNOTSUPP;
@@ -1922,6 +1935,7 @@ int
 sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
 {
 	int error = 0;
+	u_char *p;
 
 	if (level != SOL_SOCKET) {
 		if (so->so_proto->pr_ctloutput) {
@@ -1950,6 +1964,11 @@ sosetopt(struct socket *so, int level, i
 			sounlock(so);
 
 			break;
+		case SO_BPF_LOCK:
+			if ((so->so_options & SO_BPF_LOCK) != 0)
+				return (EPERM);
+			/* FALLTHROUGH */
+
 		case SO_BINDANY:
 			if ((error = suser(curproc)) != 0)	/* XXX */
 				return (error);
@@ -2082,6 +2101,30 @@ sosetopt(struct socket *so, int level, i
 			break;
 #endif /* SOCKET_SPLICE */
 
+		case SO_OUT_BPF:
+			if ((so->so_options & SO_BPF_LOCK) != 0)
+				return (EPERM);
+			if ((error = suser(curproc)) != 0)	/* XXX */
+				return (error);
+			if (m == NULL || m->m_len == 0 ||
+			    (m->m_len % sizeof(struct bpf_insn)) != 0)
+				return (EINVAL);
+			if (!bpf_validate(mtod(m, struct bpf_insn *),
+			    m->m_len / sizeof(struct bpf_insn)))
+				return (EINVAL);
+			if ((p = malloc(m->m_len, M_SO_BPF,
+			    M_WAITOK|M_ZERO|M_CANFAIL)) == NULL)
+				return ENOMEM;
+			memcpy(p, mtod(m, void *), m->m_len);
+
+			solock(so);
+			free(so->so_obpf, M_SO_BPF, so->so_obpf_len);
+			so->so_obpf = p;
+			so->so_obpf_len = m->m_len;
+			sounlock(so);
+
+			break;
+
 		default:
 			error = ENOPROTOOPT;
 			break;
@@ -2121,6 +2164,7 @@ sogetopt(struct socket *so, int level, i
 			sounlock_shared(so);
 			break;
 
+		case SO_BPF_LOCK:
 		case SO_BINDANY:
 		case SO_USELOOPBACK:
 		case SO_DEBUG:
Index: sys/malloc.h
===================================================================
RCS file: /cvs/src/sys/sys/malloc.h,v
diff -u -p -r1.127 malloc.h
--- sys/malloc.h	5 Feb 2025 18:29:17 -0000	1.127
+++ sys/malloc.h	30 May 2025 06:54:46 -0000
@@ -65,7 +65,7 @@
 #define	M_FREE		0	/* should be on free list */
 /* 1 - free */
 #define	M_DEVBUF	2	/* device driver memory */
-/* 3 - free */
+#define M_SO_BPF	3	/* socket bpf program */
 #define	M_PCB		4	/* protocol control blocks */
 #define	M_RTABLE	5	/* routing tables */
 #define	M_PF		6	/* packet filter structures */
Index: sys/socket.h
===================================================================
RCS file: /cvs/src/sys/sys/socket.h,v
diff -u -p -r1.107 socket.h
--- sys/socket.h	19 Apr 2025 04:12:36 -0000	1.107
+++ sys/socket.h	30 May 2025 06:54:46 -0000
@@ -97,6 +97,7 @@ typedef	__sa_family_t	sa_family_t;	/* so
 #define SO_TIMESTAMP	0x0800		/* timestamp received dgram traffic */
 #define SO_BINDANY	0x1000		/* allow bind to any address */
 #define SO_ZEROIZE	0x2000		/* zero out all mbufs sent over socket */
+#define SO_BPF_LOCK	0x4000		/* Deny changes to output bpf program */
 
 /*
  * Additional options, not kept in so_options.
@@ -114,6 +115,7 @@ typedef	__sa_family_t	sa_family_t;	/* so
 #define	SO_SPLICE	0x1023		/* splice data to other socket */
 #define	SO_DOMAIN	0x1024		/* get socket domain */
 #define	SO_PROTOCOL	0x1025		/* get socket protocol */
+#define SO_OUT_BPF	0x1026		/* output bpf program */
 
 /*
  * Structure used for manipulating linger option.
Index: sys/socketvar.h
===================================================================
RCS file: /cvs/src/sys/sys/socketvar.h,v
diff -u -p -r1.158 socketvar.h
--- sys/socketvar.h	8 Apr 2025 15:31:22 -0000	1.158
+++ sys/socketvar.h	30 May 2025 06:54:46 -0000
@@ -159,6 +159,9 @@ struct socket {
 	struct sockbuf so_rcv;
 	struct sockbuf so_snd;
 
+	u_char	 *so_obpf;		/* [s] output bpf program */
+	u_int	so_obpf_len;		/* [s] length of output bpf program */
+
 	void	(*so_upcall)(struct socket *, caddr_t, int); /* [s] */
 	caddr_t	so_upcallarg;		/* [s] Arg for above */
 	uid_t	so_euid;		/* [I] who opened the socket */
@@ -191,6 +194,7 @@ struct socket {
 #define	SS_ISDISCONNECTED	0x800	/* socket disconnected from peer */
 
 #define	SS_PRIV			0x080	/* privileged for broadcast, raw... */
+#define SS_BPF_LOCK		0x100	/* so_obpf protected against change */
 #define	SS_CONNECTOUT		0x1000	/* connect, not accept, at this end */
 #define	SS_ISSENDING		0x2000	/* hint for lower layer */
 #define	SS_DNS			0x4000	/* created using SOCK_DNS socket(2) */