From: Otto Moerbeek <otto@drijf.net>
Subject: Re: bpf filtering on arbitrary sockets
To: Damien Miller <djm@mindrot.org>
Cc: tech@openbsd.org
Date: Thu, 30 Oct 2025 07:56:58 +0100

On Thu, Oct 30, 2025 at 04:03:00PM +1100, Damien Miller wrote:

> Hi,
> 
> This is an idea that came up while talking with dlg@ about network
> daemons.
> 
> Quite a few programs and daemons use SOCK_RAW to send link-level packets
> after pledge(). E.g. usr.sbin/relayd/check_icmp.c wants to send ICMP
> packets.
> 
> The problem with this is that, if they get compromised, they still hold
> a very powerful socket that can send pretty much arbitrary packets. If
> one of these programs gets compromised then the attacker can pretty
> easily pivot through the existing raw socket.
> 
> What if we allowed attaching bpf instances to sockets? On the
> receive side, these could be used to limit the types of messages
> received. Similarly, on the send side, they could restrict the ability
> of the socket to send arbitrary messages.
> 
> E.g. for something like ping(1), a bpf program like:
> 
>     { 0x28, 0, 0, 0x0000000c },
>     { 0x15, 0, 7, 0x00000800 },
>     { 0x30, 0, 0, 0x00000017 },
>     { 0x15, 0, 5, 0x00000001 },
>     { 0x20, 0, 0, 0x0000001a },
>     { 0x15, 2, 0, src_addr },
>     { 0x20, 0, 0, 0x0000001e },
>     { 0x15, 0, 1, dst_addr },
>     { 0x6, 0, 0, 0x00000074 },
>     { 0x6, 0, 0, 0x00000000 },
> 
> could be used on the send side to drastically limit the power of the raw
> socket. These programs are trivial to write using `tcpdump -dd`; with a
> small tweak to tcpdump it might be possible to generate them from a
> Makefile rule.
> 
> Practically, this would mean a few new setsockopt()s:
> 
> 1. Attach a bpf program to a socket that filters received packets
> 2. Attach a bpf program to a socket that filters sent packets
> 3. An equivalent to BIOCLOCK to lock the bpf filters for a socket
> 
> Programs would set up their SOCK_RAW sockets as usual, then attach
> the filters, then lock the filters, then pledge.
> 
> Here's what this looks like for the output side.
> 
> It's pretty simple (too simple?) - there's a SOL_SOCKET SO_OUT_BPF
> setsockopt that loads a bpf_program to a socket fd. Packets that fail
> to match the filter are denied with EPERM.
> 
> Note that the filter is applied to just the data that is passed to
> send(), sendto(), etc and not any protocol-level headers. AFAIK it
> can work with any datagram socket type in addition to SOCK_RAW, so
> could for example be used to limit the structure of UDP messages,
> etc in addition to low-level SOCK_RAW control.
> 
> There's also a SO_BPF_LOCK setsockopt that locks the program as you'd
> expect.
> 
> What do you think? What would be a good daemon to try this against?
> I was thinking dhcpleased but that seems to use bpf for most of its
> sending, though that should IMO be converted to AF_FRAME now that we
> have it.

Linux has something very similar to this (for the receiving side at
least). By coincidence I came across this the other day:

https://natanyellin.com/posts/ebpf-filtering-done-right/

So would it make sense to make the interface the same? It would also
be nice to be able to avoid the race described and have a raw socket
variant that receives nothing until you set a filter.

I like the idea of filtering sending very much.

	-Otto


> 
> -d
> 
> Index: kern/uipc_socket.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> diff -u -p -r1.378 uipc_socket.c
> --- kern/uipc_socket.c	23 May 2025 23:41:46 -0000	1.378
> +++ kern/uipc_socket.c	30 May 2025 06:54:45 -0000
> @@ -51,6 +51,7 @@
>  #include <sys/rwlock.h>
>  #include <sys/time.h>
>  #include <sys/refcnt.h>
> +#include <net/bpf.h>
>  
>  #ifdef DDB
>  #include <machine/db_machdep.h>
> @@ -285,6 +286,7 @@ sorele(struct socket *so)
>  	    so->so_proto->pr_domain->dom_dispose)
>  		(*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
>  	m_purge(so->so_rcv.sb_mb);
> +	free(so->so_obpf, M_SO_BPF, so->so_obpf_len);
>  
>  #ifdef SOCKET_SPLICE
>  	if (so->so_sp)
> @@ -695,10 +697,17 @@ restart:
>  				top->m_flags |= M_ZEROIZE;
>  			mtx_leave(&so->so_snd.sb_mtx);
>  			solock_shared(so);
> +			if (so->so_obpf != NULL &&
> +			    bpf_mfilter((struct bpf_insn *)so->so_obpf,
> +			    top, top->m_pkthdr.len) <= 0) {
> +				error = EPERM;
> +				goto unsent;
> +			}
>  			if (flags & MSG_OOB)
>  				error = pru_sendoob(so, top, addr, control);
>  			else
>  				error = pru_send(so, top, addr, control);
> +unsent:
>  			sounlock_shared(so);
>  			mtx_enter(&so->so_snd.sb_mtx);
>  			clen = 0;
> @@ -1369,6 +1378,10 @@ sosplice(struct socket *so, int fd, off_
>  	}
>  	solock_pair(so, sosp);
>  
> +	if (sosp->so_obpf != NULL) {
> +		error = EOPNOTSUPP;
> +		goto release;
> +	}
>  	if ((so->so_options & SO_ACCEPTCONN) ||
>  	    (sosp->so_options & SO_ACCEPTCONN)) {
>  		error = EOPNOTSUPP;
> @@ -1922,6 +1935,7 @@ int
>  sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
>  {
>  	int error = 0;
> +	u_char *p;
>  
>  	if (level != SOL_SOCKET) {
>  		if (so->so_proto->pr_ctloutput) {
> @@ -1950,6 +1964,11 @@ sosetopt(struct socket *so, int level, i
>  			sounlock(so);
>  
>  			break;
> +		case SO_BPF_LOCK:
> +			if ((so->so_options & SO_BPF_LOCK) != 0)
> +				return (EPERM);
> +			/* FALLTHROUGH */
> +
>  		case SO_BINDANY:
>  			if ((error = suser(curproc)) != 0)	/* XXX */
>  				return (error);
> @@ -2082,6 +2101,30 @@ sosetopt(struct socket *so, int level, i
>  			break;
>  #endif /* SOCKET_SPLICE */
>  
> +		case SO_OUT_BPF:
> +			if ((so->so_options & SO_BPF_LOCK) != 0)
> +				return (EPERM);
> +			if ((error = suser(curproc)) != 0)	/* XXX */
> +				return (error);
> +			if (m == NULL || m->m_len == 0 ||
> +			    (m->m_len % sizeof(struct bpf_insn)) != 0)
> +				return (EINVAL);
> +			if (!bpf_validate(mtod(m, struct bpf_insn *),
> +			    m->m_len / sizeof(struct bpf_insn)))
> +				return (EINVAL);
> +			if ((p = malloc(m->m_len, M_SO_BPF,
> +			    M_WAITOK|M_ZERO|M_CANFAIL)) == NULL)
> +				return ENOMEM;
> +			memcpy(p, mtod(m, void *), m->m_len);
> +
> +			solock(so);
> +			free(so->so_obpf, M_SO_BPF, so->so_obpf_len);
> +			so->so_obpf = p;
> +			so->so_obpf_len = m->m_len;
> +			sounlock(so);
> +
> +			break;
> +
>  		default:
>  			error = ENOPROTOOPT;
>  			break;
> @@ -2121,6 +2164,7 @@ sogetopt(struct socket *so, int level, i
>  			sounlock_shared(so);
>  			break;
>  
> +		case SO_BPF_LOCK:
>  		case SO_BINDANY:
>  		case SO_USELOOPBACK:
>  		case SO_DEBUG:
> Index: sys/malloc.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/malloc.h,v
> diff -u -p -r1.127 malloc.h
> --- sys/malloc.h	5 Feb 2025 18:29:17 -0000	1.127
> +++ sys/malloc.h	30 May 2025 06:54:46 -0000
> @@ -65,7 +65,7 @@
>  #define	M_FREE		0	/* should be on free list */
>  /* 1 - free */
>  #define	M_DEVBUF	2	/* device driver memory */
> -/* 3 - free */
> +#define M_SO_BPF	3	/* socket bpf program */
>  #define	M_PCB		4	/* protocol control blocks */
>  #define	M_RTABLE	5	/* routing tables */
>  #define	M_PF		6	/* packet filter structures */
> Index: sys/socket.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/socket.h,v
> diff -u -p -r1.107 socket.h
> --- sys/socket.h	19 Apr 2025 04:12:36 -0000	1.107
> +++ sys/socket.h	30 May 2025 06:54:46 -0000
> @@ -97,6 +97,7 @@ typedef	__sa_family_t	sa_family_t;	/* so
>  #define SO_TIMESTAMP	0x0800		/* timestamp received dgram traffic */
>  #define SO_BINDANY	0x1000		/* allow bind to any address */
>  #define SO_ZEROIZE	0x2000		/* zero out all mbufs sent over socket */
> +#define SO_BPF_LOCK	0x4000		/* Deny changes to output bpf program */
>  
>  /*
>   * Additional options, not kept in so_options.
> @@ -114,6 +115,7 @@ typedef	__sa_family_t	sa_family_t;	/* so
>  #define	SO_SPLICE	0x1023		/* splice data to other socket */
>  #define	SO_DOMAIN	0x1024		/* get socket domain */
>  #define	SO_PROTOCOL	0x1025		/* get socket protocol */
> +#define SO_OUT_BPF	0x1026		/* output bpf program */
>  
>  /*
>   * Structure used for manipulating linger option.
> Index: sys/socketvar.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/socketvar.h,v
> diff -u -p -r1.158 socketvar.h
> --- sys/socketvar.h	8 Apr 2025 15:31:22 -0000	1.158
> +++ sys/socketvar.h	30 May 2025 06:54:46 -0000
> @@ -159,6 +159,9 @@ struct socket {
>  	struct sockbuf so_rcv;
>  	struct sockbuf so_snd;
>  
> +	u_char	 *so_obpf;		/* [s] output bpf program */
> +	u_int	so_obpf_len;		/* [s] length of output bpf program */
> +
>  	void	(*so_upcall)(struct socket *, caddr_t, int); /* [s] */
>  	caddr_t	so_upcallarg;		/* [s] Arg for above */
>  	uid_t	so_euid;		/* [I] who opened the socket */
> @@ -191,6 +194,7 @@ struct socket {
>  #define	SS_ISDISCONNECTED	0x800	/* socket disconnected from peer */
>  
>  #define	SS_PRIV			0x080	/* privileged for broadcast, raw... */
> +#define SS_BPF_LOCK		0x100	/* so_obpf protected against change */
>  #define	SS_CONNECTOUT		0x1000	/* connect, not accept, at this end */
>  #define	SS_ISSENDING		0x2000	/* hint for lower layer */
>  #define	SS_DNS			0x4000	/* created using SOCK_DNS socket(2) */
>