Download raw body.
bpf filtering on arbitrary sockets
On Thu, Oct 30, 2025 at 04:03:00PM +1100, Damien Miller wrote:
> Hi,
>
> This is an idea that came up while talking with dlg@ about network
> daemons.
>
> Quite a few programs and daemons use SOCK_RAW to send link-level packets
> after pledge(). E.g. usr.sbin/relayd/check_icmp.c wants to send ICMP
> packets.
>
> The problem with this is that, if they get compromised, they still hold
> a very powerful socket that can send pretty much arbitrary packets. If
> one of these programs gets compromised then the attacker can pretty
> easily pivot through the existing raw socket.
>
> What if we allowed attaching bpf instances to sockets? On the
> receive side, these could be used to limit the types of messages
> received. Similarly, on the send side, they could restrict the ability
> of the socket to send arbitrary messages.
>
> E.g. for something like ping(1), a bpf program like:
>
> { 0x28, 0, 0, 0x0000000c },
> { 0x15, 0, 7, 0x00000800 },
> { 0x30, 0, 0, 0x00000017 },
> { 0x15, 0, 5, 0x00000001 },
> { 0x20, 0, 0, 0x0000001a },
> { 0x15, 2, 0, src_addr },
> { 0x20, 0, 0, 0x0000001e },
> { 0x15, 0, 1, dst_addr },
> { 0x6, 0, 0, 0x00000074 },
> { 0x6, 0, 0, 0x00000000 },
>
> could be used on the send side to drastically limit the power of the raw
> socket. These programs are trivial to write using `tcpdump -dd`; with a
> small tweak to tcpdump it might be possible to generate them from a
> Makefile rule.
>
> Practically, this would mean a few new setsockopt()s:
>
> 1. Attach a bpf program to a socket that filters received packets
> 2. Attach a bpf program to a socket that filters sent packets
> 3. An equivalent to BIOCLOCK to lock the bpf filters for a socket
>
> Programs would set up their SOCK_RAW sockets as usual, then attach
> the filters, then lock the filters, then pledge.
>
> Here's what this looks like for the output side.
>
> It's pretty simple (too simple?) - there's a SOL_SOCKET SO_OUT_BPF
> setsockopt that loads a bpf_program to a socket fd. Packets that fail
> to match the filter are denied with EPERM.
>
> Note that the filter is applied to just the data that is passed to
> send(), sendto(), etc and not any protocol-level headers. AFAIK it
> can work with any datagram socket type in addition to SOCK_RAW, so
> could for example be used to limit the structure of UDP messages,
> etc in addition to low-level SOCK_RAW control.
>
> There's also a SO_BPF_LOCK setsockopt that locks the program as you'd
> expect.
>
> What do you think? What would be a good daemon to try this against?
> I was thinking dhcpleased but that seems to use bpf for most of its
> sending, though that should IMO be converted to AF_FRAME now that we
> have it.
Linux has something very similar to this (for the receiving side at
least). By coincidence I came across this the other day:
https://natanyellin.com/posts/ebpf-filtering-done-right/
So would it make sense to make the interface the same? It would also
be nice to be able to avoid the race described and have a raw socket
variant that receives nothing until you set a filter.
I like the idea of filtering sending very much.
-Otto
>
> -d
>
> Index: kern/uipc_socket.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> diff -u -p -r1.378 uipc_socket.c
> --- kern/uipc_socket.c 23 May 2025 23:41:46 -0000 1.378
> +++ kern/uipc_socket.c 30 May 2025 06:54:45 -0000
> @@ -51,6 +51,7 @@
> #include <sys/rwlock.h>
> #include <sys/time.h>
> #include <sys/refcnt.h>
> +#include <net/bpf.h>
>
> #ifdef DDB
> #include <machine/db_machdep.h>
> @@ -285,6 +286,7 @@ sorele(struct socket *so)
> so->so_proto->pr_domain->dom_dispose)
> (*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
> m_purge(so->so_rcv.sb_mb);
> + free(so->so_obpf, M_SO_BPF, so->so_obpf_len);
>
> #ifdef SOCKET_SPLICE
> if (so->so_sp)
> @@ -695,10 +697,17 @@ restart:
> top->m_flags |= M_ZEROIZE;
> mtx_leave(&so->so_snd.sb_mtx);
> solock_shared(so);
> + if (so->so_obpf != NULL &&
> + bpf_mfilter((struct bpf_insn *)so->so_obpf,
> + top, top->m_pkthdr.len) <= 0) {
> + error = EPERM;
> + goto unsent;
> + }
> if (flags & MSG_OOB)
> error = pru_sendoob(so, top, addr, control);
> else
> error = pru_send(so, top, addr, control);
> +unsent:
> sounlock_shared(so);
> mtx_enter(&so->so_snd.sb_mtx);
> clen = 0;
> @@ -1369,6 +1378,10 @@ sosplice(struct socket *so, int fd, off_
> }
> solock_pair(so, sosp);
>
> + if (sosp->so_obpf != NULL) {
> + error = EOPNOTSUPP;
> + goto release;
> + }
> if ((so->so_options & SO_ACCEPTCONN) ||
> (sosp->so_options & SO_ACCEPTCONN)) {
> error = EOPNOTSUPP;
> @@ -1922,6 +1935,7 @@ int
> sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
> {
> int error = 0;
> + u_char *p;
>
> if (level != SOL_SOCKET) {
> if (so->so_proto->pr_ctloutput) {
> @@ -1950,6 +1964,11 @@ sosetopt(struct socket *so, int level, i
> sounlock(so);
>
> break;
> + case SO_BPF_LOCK:
> + if ((so->so_options & SO_BPF_LOCK) != 0)
> + return (EPERM);
> + /* FALLTHROUGH */
> +
> case SO_BINDANY:
> if ((error = suser(curproc)) != 0) /* XXX */
> return (error);
> @@ -2082,6 +2101,30 @@ sosetopt(struct socket *so, int level, i
> break;
> #endif /* SOCKET_SPLICE */
>
> + case SO_OUT_BPF:
> + if ((so->so_options & SO_BPF_LOCK) != 0)
> + return (EPERM);
> + if ((error = suser(curproc)) != 0) /* XXX */
> + return (error);
> + if (m == NULL || m->m_len == 0 ||
> + (m->m_len % sizeof(struct bpf_insn)) != 0)
> + return (EINVAL);
> + if (!bpf_validate(mtod(m, struct bpf_insn *),
> + m->m_len / sizeof(struct bpf_insn)))
> + return (EINVAL);
> + if ((p = malloc(m->m_len, M_SO_BPF,
> + M_WAITOK|M_ZERO|M_CANFAIL)) == NULL)
> + return ENOMEM;
> + memcpy(p, mtod(m, void *), m->m_len);
> +
> + solock(so);
> + free(so->so_obpf, M_SO_BPF, so->so_obpf_len);
> + so->so_obpf = p;
> + so->so_obpf_len = m->m_len;
> + sounlock(so);
> +
> + break;
> +
> default:
> error = ENOPROTOOPT;
> break;
> @@ -2121,6 +2164,7 @@ sogetopt(struct socket *so, int level, i
> sounlock_shared(so);
> break;
>
> + case SO_BPF_LOCK:
> case SO_BINDANY:
> case SO_USELOOPBACK:
> case SO_DEBUG:
> Index: sys/malloc.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/malloc.h,v
> diff -u -p -r1.127 malloc.h
> --- sys/malloc.h 5 Feb 2025 18:29:17 -0000 1.127
> +++ sys/malloc.h 30 May 2025 06:54:46 -0000
> @@ -65,7 +65,7 @@
> #define M_FREE 0 /* should be on free list */
> /* 1 - free */
> #define M_DEVBUF 2 /* device driver memory */
> -/* 3 - free */
> +#define M_SO_BPF 3 /* socket bpf program */
> #define M_PCB 4 /* protocol control blocks */
> #define M_RTABLE 5 /* routing tables */
> #define M_PF 6 /* packet filter structures */
> Index: sys/socket.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/socket.h,v
> diff -u -p -r1.107 socket.h
> --- sys/socket.h 19 Apr 2025 04:12:36 -0000 1.107
> +++ sys/socket.h 30 May 2025 06:54:46 -0000
> @@ -97,6 +97,7 @@ typedef __sa_family_t sa_family_t; /* so
> #define SO_TIMESTAMP 0x0800 /* timestamp received dgram traffic */
> #define SO_BINDANY 0x1000 /* allow bind to any address */
> #define SO_ZEROIZE 0x2000 /* zero out all mbufs sent over socket */
> +#define SO_BPF_LOCK 0x4000 /* Deny changes to output bpf program */
>
> /*
> * Additional options, not kept in so_options.
> @@ -114,6 +115,7 @@ typedef __sa_family_t sa_family_t; /* so
> #define SO_SPLICE 0x1023 /* splice data to other socket */
> #define SO_DOMAIN 0x1024 /* get socket domain */
> #define SO_PROTOCOL 0x1025 /* get socket protocol */
> +#define SO_OUT_BPF 0x1026 /* output bpf program */
>
> /*
> * Structure used for manipulating linger option.
> Index: sys/socketvar.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/socketvar.h,v
> diff -u -p -r1.158 socketvar.h
> --- sys/socketvar.h 8 Apr 2025 15:31:22 -0000 1.158
> +++ sys/socketvar.h 30 May 2025 06:54:46 -0000
> @@ -159,6 +159,9 @@ struct socket {
> struct sockbuf so_rcv;
> struct sockbuf so_snd;
>
> + u_char *so_obpf; /* [s] output bpf program */
> + u_int so_obpf_len; /* [s] length of output bpf program */
> +
> void (*so_upcall)(struct socket *, caddr_t, int); /* [s] */
> caddr_t so_upcallarg; /* [s] Arg for above */
> uid_t so_euid; /* [I] who opened the socket */
> @@ -191,6 +194,7 @@ struct socket {
> #define SS_ISDISCONNECTED 0x800 /* socket disconnected from peer */
>
> #define SS_PRIV 0x080 /* privileged for broadcast, raw... */
> +#define SS_BPF_LOCK 0x100 /* so_obpf protected against change */
> #define SS_CONNECTOUT 0x1000 /* connect, not accept, at this end */
> #define SS_ISSENDING 0x2000 /* hint for lower layer */
> #define SS_DNS 0x4000 /* created using SOCK_DNS socket(2) */
>
bpf filtering on arbitrary sockets