Index | Thread | Search

From:
Jan Klemkow <jan@openbsd.org>
Subject:
Re: vmd: add checksum offload for guests
To:
Mike Larkin <mlarkin@nested.page>
Cc:
David Gwynne <david@gwynne.id.au>, Dave Voutila <dv@sisu.io>, Klemens Nanni <kn@openbsd.org>, Alexander Bluhm <bluhm@openbsd.org>, tech@openbsd.org
Date:
Fri, 16 Jan 2026 19:38:16 +0100

Download raw body.

Thread
On Thu, Jan 15, 2026 at 02:08:43PM -0800, Mike Larkin wrote:
> Does this "just work" no matter what guests I run? That's really all I care
> about.

Here is my current diff for checksum offloading in vmd(8).

I tested the following combination of features:

 - Debian/Linux and OpenBSD-current guests
 - OpenBSD-current vio(4) w/o all offloading features
 - Linux, OpenBSD and the host system via veb(4) and vlan(4)
 - IPv4 and IPv6 with tcpbench(1)
 - local interface locked lladdr
 - local interface dhcp

Further tests are welcome!

ok?

Thanks,
Jan

Index: sys/kern/kern_pledge.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_pledge.c,v
diff -u -p -r1.335 kern_pledge.c
--- sys/kern/kern_pledge.c	13 Nov 2025 20:59:14 -0000	1.335
+++ sys/kern/kern_pledge.c	16 Jan 2026 18:24:49 -0000
@@ -46,6 +46,7 @@
 #include <net/route.h>
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_tun.h>
 #include <netinet/in.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/nd6.h>
@@ -1337,6 +1338,12 @@ pledge_ioctl(struct proc *p, long com, s
 		    cdevsw[major(vp->v_rdev)].d_open == vmmopen) {
 			error = pledge_ioctl_vmm(p, com);
 			if (error == 0)
+				return 0;
+		}
+		if ((fp->f_type == DTYPE_VNODE) &&
+		    (vp->v_type == VCHR) &&
+		    (cdevsw[major(vp->v_rdev)].d_open == tapopen)) {
+			if (com == TUNSCAP)
 				return 0;
 		}
 	}
Index: usr.sbin/vmd/vionet.c
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/vionet.c,v
diff -u -p -r1.29 vionet.c
--- usr.sbin/vmd/vionet.c	14 Jan 2026 03:09:05 -0000	1.29
+++ usr.sbin/vmd/vionet.c	16 Jan 2026 18:24:50 -0000
@@ -22,7 +22,12 @@
 #include <dev/pv/virtioreg.h>
 
 #include <net/if.h>
+#include <net/if_tun.h>
 #include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
 #include <netinet/if_ether.h>
 
 #include <errno.h>
@@ -50,6 +55,7 @@
 
 #define VIRTIO_NET_CONFIG_MAC		 0 /*  8 bit x 6 byte */
 
+#define VIRTIO_NET_F_GUEST_CSUM	(1 << 1)
 #define VIRTIO_NET_F_MAC	(1 << 5)
 #define RXQ	0
 #define TXQ	1
@@ -65,7 +71,7 @@ static void *rx_run_loop(void *);
 static void *tx_run_loop(void *);
 static int vionet_rx(struct virtio_dev *, int);
 static ssize_t vionet_rx_copy(struct vionet_dev *, int, const struct iovec *,
-    int, size_t);
+    int, size_t, struct tun_hdr *th);
 static ssize_t vionet_rx_zerocopy(struct vionet_dev *, int,
     const struct iovec *, int);
 static void vionet_rx_event(int, short, void *);
@@ -84,6 +90,10 @@ static void read_pipe_rx(int, short, voi
 static void read_pipe_tx(int, short, void *);
 static void vionet_assert_pic_irq(struct virtio_dev *);
 static void vionet_deassert_pic_irq(struct virtio_dev *);
+static void vhdr2thdr(struct virtio_net_hdr *, struct tun_hdr *,
+    const struct iovec *, int);
+static void thdr2vhdr(struct tun_hdr *, struct virtio_net_hdr *,
+    const struct iovec *, int);
 
 /* Device Globals */
 struct event ev_tap;
@@ -300,6 +310,30 @@ fail:
 }
 
 /*
+ * Report the checksum offloads the guest negotiated by sending a
+ * VIODEV_MSG_TUNSCAP message on the device's async imsg channel.  msg.data
+ * holds the IFCAP_CSUM_* bits, or 0 without VIRTIO_NET_F_GUEST_CSUM.
+ */
+static void
+vionet_update_offload(struct virtio_dev *dev)
+{
+	struct viodev_msg	msg;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.irq = dev->irq;
+	msg.type = VIODEV_MSG_TUNSCAP;
+
+	if (dev->driver_feature & VIRTIO_NET_F_GUEST_CSUM) {
+		msg.data |= IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4;
+		msg.data |= IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
+	}
+
+	if (imsg_compose_event2(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
+	    &msg, sizeof(msg), ev_base_main) == -1)
+		log_warnx("%s: failed to send TUNSCAP message", __func__);
+}
+
+/*
  * vionet_rx
  *
  * Pull packet from the provided fd and fill the receive-side virtqueue. We
@@ -321,6 +355,7 @@ vionet_rx(struct virtio_dev *dev, int fd
 	struct virtio_net_hdr *hdr = NULL;
 	struct virtio_vq_info *vq_info;
 	struct iovec *iov;
+	struct tun_hdr th;
 	int notify = 0;
 	ssize_t sz;
 	uint8_t status = 0;
@@ -351,8 +386,8 @@ vionet_rx(struct virtio_dev *dev, int fd
 			goto reset;
 		}
 
-		iov = &iov_rx[0];
-		iov_cnt = 1;
+		iov = &iov_rx[1];
+		iov_cnt = 2;
 
 		/*
 		 * First descriptor should be at least as large as the
@@ -373,7 +408,6 @@ vionet_rx(struct virtio_dev *dev, int fd
 		if (iov->iov_base == NULL)
 			goto reset;
 		hdr = iov->iov_base;
-		memset(hdr, 0, sizeof(struct virtio_net_hdr));
 
 		/* Tweak the iovec to account for the virtio_net_hdr. */
 		iov->iov_len -= sizeof(struct virtio_net_hdr);
@@ -418,15 +452,15 @@ vionet_rx(struct virtio_dev *dev, int fd
 			goto reset;
 		}
 
-		hdr->num_buffers = iov_cnt;
-
 		/*
 		 * If we're enforcing hardware address or handling an injected
 		 * packet, we need to use a copy-based approach.
 		 */
+		iov_rx[0].iov_base = &th;
+		iov_rx[0].iov_len = sizeof(th);
 		if (vionet->lockedmac || fd != vionet->data_fd)
-			sz = vionet_rx_copy(vionet, fd, iov_rx, iov_cnt,
-			    chain_len);
+			sz = vionet_rx_copy(vionet, fd, iov_rx + 1, iov_cnt - 1,
+			    chain_len, &th);
 		else
 			sz = vionet_rx_zerocopy(vionet, fd, iov_rx, iov_cnt);
 		if (sz == -1)
@@ -434,6 +468,9 @@ vionet_rx(struct virtio_dev *dev, int fd
 		if (sz == 0)	/* No packets, so bail out for now. */
 			break;
 
+		thdr2vhdr(&th, hdr, iov_rx + 1, iov_cnt - 1);
+		hdr->num_buffers = iov_cnt - 1;
+
 		/*
 		 * Account for the prefixed header since it wasn't included
 		 * in the copy or zerocopy operations.
@@ -473,9 +510,9 @@ reset:
  */
 ssize_t
 vionet_rx_copy(struct vionet_dev *dev, int fd, const struct iovec *iov,
-    int iov_cnt, size_t chain_len)
+    int iov_cnt, size_t chain_len, struct tun_hdr *th)
 {
-	static uint8_t		 buf[VIONET_HARD_MTU];
+	static uint8_t		 buf[sizeof(struct tun_hdr) + VIONET_HARD_MTU];
 	struct packet		*pkt = NULL;
 	struct ether_header	*eh = NULL;
 	uint8_t			*payload = buf;
@@ -483,9 +520,10 @@ vionet_rx_copy(struct vionet_dev *dev, i
 	ssize_t			 sz;
 
 	/* If reading from the tap(4), try to right-size the read. */
-	if (fd == dev->data_fd)
-		nbytes = MIN(chain_len, VIONET_HARD_MTU);
-	else if (fd == pipe_inject[READ])
+	if (fd == dev->data_fd) {
+		nbytes = sizeof(struct tun_hdr) +
+		    MIN(chain_len, VIONET_HARD_MTU);
+	} else if (fd == pipe_inject[READ])
 		nbytes = sizeof(struct packet);
 	else {
 		log_warnx("%s: invalid fd: %d", __func__, fd);
@@ -504,10 +542,20 @@ vionet_rx_copy(struct vionet_dev *dev, i
 			return (-1);
 		}
 		return (0);
-	} else if (fd == dev->data_fd && sz < VIONET_MIN_TXLEN) {
+	} else if (fd == dev->data_fd) {
+		if ((size_t)sz < sizeof(struct tun_hdr)) {
+			log_warnx("%s: short tun_hdr", __func__);
+			return (0);
+		}
+		memcpy(th, payload, sizeof *th);
+		payload += sizeof(struct tun_hdr);
+		sz -= sizeof(struct tun_hdr);
+
 		/* If reading the tap(4), we should get valid ethernet. */
-		log_warnx("%s: invalid packet size", __func__);
-		return (0);
+		if (sz < VIONET_MIN_TXLEN) {
+			log_warnx("%s: invalid packet size", __func__);
+			return (0);
+		}
 	} else if (fd == pipe_inject[READ] && sz != sizeof(struct packet)) {
 		log_warnx("%s: invalid injected packet object (sz=%ld)",
 		    __func__, sz);
@@ -526,6 +574,7 @@ vionet_rx_copy(struct vionet_dev *dev, i
 			log_warnx("%s: invalid injected packet size", __func__);
 			goto drop;
 		}
+		memset(th, 0, sizeof *th);
 		payload = pkt->buf;
 		sz = (ssize_t)pkt->len;
 	}
@@ -585,6 +634,12 @@ vionet_rx_zerocopy(struct vionet_dev *de
 	sz = readv(fd, iov, iov_cnt);
 	if (sz == -1 && errno == EAGAIN)
 		return (0);
+
+	if ((size_t)sz < sizeof(struct tun_hdr))
+		return (0);
+
+	sz -= sizeof(struct tun_hdr);
+
 	return (sz);
 }
 
@@ -666,6 +721,8 @@ vionet_tx(struct virtio_dev *dev)
 	struct iovec *iov;
 	struct packet pkt;
 	uint8_t status = 0;
+	struct virtio_net_hdr *vhp;
+	struct tun_hdr th;
 
 	status = dev->status & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK;
 	if (status != VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) {
@@ -692,8 +749,10 @@ vionet_tx(struct virtio_dev *dev)
 			goto reset;
 		}
 
-		iov = &iov_tx[0];
-		iov_cnt = 0;
+		/* the 0th slot will be used by the tun_hdr */
+
+		iov = &iov_tx[1];
+		iov_cnt = 1;
 		chain_len = 0;
 
 		/*
@@ -704,13 +763,16 @@ vionet_tx(struct virtio_dev *dev)
 			log_warnx("%s: invalid descriptor length", __func__);
 			goto reset;
 		}
-		iov->iov_len = desc->len;
 
-		if (iov->iov_len > sizeof(struct virtio_net_hdr)) {
-			/* Chop off the virtio header, leaving packet data. */
-			iov->iov_len -= sizeof(struct virtio_net_hdr);
-			iov->iov_base = hvaddr_mem(desc->addr +
-			    sizeof(struct virtio_net_hdr), iov->iov_len);
+		/* Chop the virtio net header off */
+		vhp = hvaddr_mem(desc->addr, sizeof(*vhp));
+		if (vhp == NULL)
+			goto reset;
+
+		iov->iov_len = desc->len - sizeof(*vhp);
+		if (iov->iov_len > 0) {
+			iov->iov_base = hvaddr_mem(desc->addr + sizeof(*vhp),
+			    iov->iov_len);
 			if (iov->iov_base == NULL)
 				goto reset;
 
@@ -758,7 +820,7 @@ vionet_tx(struct virtio_dev *dev)
 		 * descriptor with packet data contains a large enough buffer
 		 * for this inspection.
 		 */
-		iov = &iov_tx[0];
+		iov = &iov_tx[1];
 		if (vionet->lockedmac) {
 			if (iov->iov_len < ETHER_HDR_LEN) {
 				log_warnx("%s: insufficient header data",
@@ -784,6 +846,15 @@ vionet_tx(struct virtio_dev *dev)
 			}
 		}
 
+		/*
+		 * if we look at more of vhp we might need to copy
+		 * it so it's aligned properly
+		 */
+		vhdr2thdr(vhp, &th, iov_tx + 1, iov_cnt - 1);
+
+		iov_tx[0].iov_base = &th;
+		iov_tx[0].iov_len = sizeof(th);
+
 		/* Write our packet to the tap(4). */
 		sz = writev(vionet->data_fd, iov_tx, iov_cnt);
 		if (sz == -1 && errno != ENOBUFS) {
@@ -1114,6 +1185,7 @@ vionet_cfg_write(struct virtio_dev *dev,
 		dev->driver_feature &= dev->device_feature;
 		DPRINTF("%s: driver features 0x%llx", __func__,
 		    dev->driver_feature);
+		vionet_update_offload(dev);
 		break;
 	case VIO1_PCI_CONFIG_MSIX_VECTOR:
 		/* Ignore until we support MSIX. */
@@ -1555,6 +1627,155 @@ vionet_assert_pic_irq(struct virtio_dev 
 	    &msg, sizeof(msg), ev_base_main);
 	if (ret == -1)
 		log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
+}
+
+/*
+ * Copy len bytes, starting at byte offset off, out of the buffer scattered
+ * across iov[0..iovcnt) into buf.  Returns 0 on success, or -1 when the
+ * iovecs end first (buf may then have been partially written).
+ */
+static int
+memcpyv(void *buf, size_t len, size_t off, const struct iovec *iov, int iovcnt)
+{
+	uint8_t *dst = buf;
+	size_t l;
+
+	/* Skip the iovecs that lie entirely before off. */
+	for (;;) {
+		if (iovcnt <= 0)
+			return (-1);
+
+		if (off < iov->iov_len)
+			break;
+
+		off -= iov->iov_len;
+		iov++;
+		iovcnt--;
+	}
+
+	/* Copy whatever this iovec holds from off onwards. */
+	l = iov->iov_len - off;
+	if (l > len)
+		l = len;
+	memcpy(dst, (const uint8_t *)iov->iov_base + off, l);
+	dst += l;
+	len -= l;
+
+	/*
+	 * The rest comes from the start of each following iovec, so step
+	 * past the one just consumed before copying again.
+	 */
+	while (len > 0) {
+		iov++;
+		iovcnt--;
+		if (iovcnt <= 0)
+			return (-1);
+
+		l = len;
+		if (l > iov->iov_len)
+			l = iov->iov_len;
+		memcpy(dst, (const uint8_t *)iov->iov_base, l);
+		dst += l;
+		len -= l;
+	}
+	return (0);
+}
+
+/*
+ * Parse the Ethernet (optionally VLAN tagged) and IPv4/IPv6 headers in iov.
+ * On success *proto is the IP protocol (or IPv6 next header) and *off the
+ * offset just past the IP header; both are 0 for short or non-IP frames.
+ */
+static void
+hdr_extract(const struct iovec *iov, int iovcnt, size_t *off, uint8_t *proto)
+{
+	size_t		hlen = sizeof(struct ether_header);
+	uint16_t	etype;
+	uint8_t		p, hl;
+
+	*off = 0;
+	*proto = 0;
+
+	if (memcpyv(&etype, sizeof(etype),
+	    offsetof(struct ether_header, ether_type), iov, iovcnt) == -1)
+		return;
+	if (etype == htons(ETHERTYPE_VLAN)) {
+		if (memcpyv(&etype, sizeof(etype),
+		    offsetof(struct ether_vlan_header, evl_proto),
+		    iov, iovcnt) == -1)
+			return;
+		hlen = sizeof(struct ether_vlan_header);
+	}
+
+	if (etype == htons(ETHERTYPE_IP)) {
+		/* Protocol, then the header length nibble (32-bit words). */
+		if (memcpyv(&p, sizeof(p), hlen + offsetof(struct ip, ip_p),
+		    iov, iovcnt) == -1 ||
+		    memcpyv(&hl, sizeof(hl), hlen, iov, iovcnt) == -1)
+			return;
+		hlen += (hl & 0x0f) << 2;
+	} else if (etype == htons(ETHERTYPE_IPV6)) {
+		/* Fixed header only; extension headers are not walked. */
+		if (memcpyv(&p, sizeof(p),
+		    hlen + offsetof(struct ip6_hdr, ip6_nxt),
+		    iov, iovcnt) == -1)
+			return;
+		hlen += sizeof(struct ip6_hdr);
+	} else
+		return;
+	*off = hlen;
+	*proto = p;
+}
+
+/* Build the tun_hdr for a guest frame: NEEDS_CSUM becomes TUN_H_*_CSUM. */
+static void
+vhdr2thdr(struct virtio_net_hdr *vh, struct tun_hdr *th,
+    const struct iovec *iov, int iovcnt)
+{
+	size_t	off = 0;
+	uint8_t	proto = 0;	/* 0 unless hdr_extract() finds a protocol */
+
+	memset(th, 0, sizeof(*th));
+
+	if (!(vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM))
+		return;
+
+	hdr_extract(iov, iovcnt, &off, &proto);
+	switch (proto) {
+	case IPPROTO_TCP:
+		th->th_flags |= TUN_H_TCP_CSUM;
+		break;
+	case IPPROTO_UDP:
+		th->th_flags |= TUN_H_UDP_CSUM;
+		break;
+	}
+}
+
+/* Build the guest's virtio_net_hdr from the tun_hdr that came with a frame. */
+static void
+thdr2vhdr(struct tun_hdr *th, struct virtio_net_hdr *vh,
+    const struct iovec *iov, int iovcnt)
+{
+	size_t	off = 0;
+	uint8_t	proto = 0;	/* 0 unless hdr_extract() finds a protocol */
+
+	memset(vh, 0, sizeof(*vh));
+	if (!(th->th_flags & (TUN_H_TCP_CSUM | TUN_H_UDP_CSUM)))
+		return;
+	hdr_extract(iov, iovcnt, &off, &proto);
+	/* Request a checksum only when the TCP/UDP header was located. */
+	switch (proto) {
+	case IPPROTO_TCP:
+		vh->csum_offset = offsetof(struct tcphdr, th_sum);
+		break;
+	case IPPROTO_UDP:
+		vh->csum_offset = offsetof(struct udphdr, uh_sum);
+		break;
+	default:
+		return;
+	}
+	vh->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
+	vh->csum_start = off;
+}
 
 /*
Index: usr.sbin/vmd/virtio.c
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v
diff -u -p -r1.134 virtio.c
--- usr.sbin/vmd/virtio.c	14 Jan 2026 03:09:05 -0000	1.134
+++ usr.sbin/vmd/virtio.c	16 Jan 2026 18:24:50 -0000
@@ -19,6 +19,7 @@
 #include <sys/param.h>	/* PAGE_SIZE */
 #include <sys/socket.h>
 #include <sys/wait.h>
+#include <sys/ioctl.h>
 
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcidevs.h>
@@ -28,6 +29,7 @@
 #include <dev/vmm/vmm.h>
 
 #include <net/if.h>
+#include <net/if_tun.h>
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
 
@@ -64,6 +66,8 @@ SLIST_HEAD(virtio_dev_head, virtio_dev) 
 
 #define MAXPHYS	(64 * 1024)	/* max raw I/O transfer size */
 
+#define VIRTIO_NET_F_CSUM	(1<<0)
+#define VIRTIO_NET_F_GUEST_CSUM	(1<<1)
 #define VIRTIO_NET_F_MAC	(1<<5)
 
 #define VMMCI_F_TIMESYNC	(1<<0)
@@ -1020,6 +1024,8 @@ virtio_init(struct vmd_vm *vm, int child
 	/* Virtio 1.x Network Devices */
 	if (vmc->vmc_nnics > 0) {
 		for (i = 0; i < vmc->vmc_nnics; i++) {
+			struct tun_capabilities	tcap;
+
 			dev = malloc(sizeof(struct virtio_dev));
 			if (dev == NULL) {
 				log_warn("calloc failure allocating vionet");
@@ -1034,7 +1040,8 @@ virtio_init(struct vmd_vm *vm, int child
 			}
 			virtio_dev_init(vm, dev, id, VIONET_QUEUE_SIZE_DEFAULT,
 			    VIRTIO_NET_QUEUES,
-			    (VIRTIO_NET_F_MAC | VIRTIO_F_VERSION_1));
+			    (VIRTIO_NET_F_MAC | VIRTIO_NET_F_CSUM |
+				VIRTIO_NET_F_GUEST_CSUM | VIRTIO_F_VERSION_1));
 
 			if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
 			    dev) == -1) {
@@ -1056,6 +1063,14 @@ virtio_init(struct vmd_vm *vm, int child
 			dev->vmm_id = vm->vm_vmmid;
 			dev->vionet.data_fd = child_taps[i];
 
+			/*
+			 * IFCAPs are tweaked after feature negotiation with
+			 * the guest later.
+			 */
+			memset(&tcap, 0, sizeof(tcap));
+			if (ioctl(dev->vionet.data_fd, TUNSCAP, &tcap) == -1)
+				fatal("tap(4) TUNSCAP");
+
 			/* MAC address has been assigned by the parent */
 			memcpy(&dev->vionet.mac, &vmc->vmc_macs[i], 6);
 			dev->vionet.lockedmac =
@@ -1532,10 +1547,12 @@ virtio_dev_launch(struct vmd_vm *vm, str
 		}
 
 		/* Close data fds. Only the child device needs them now. */
-		if (virtio_dev_closefds(dev) == -1) {
-			log_warnx("%s: failed to close device data fds",
-			    __func__);
-			goto err;
+		if (dev->dev_type != VMD_DEVTYPE_NET) {
+			if (virtio_dev_closefds(dev) == -1) {
+				log_warnx("%s: failed to close device data fds",
+				    __func__);
+				goto err;
+			}
 		}
 
 		/* 2. Send over details on the VM (including memory fds). */
@@ -1758,6 +1775,18 @@ handle_dev_msg(struct viodev_msg *msg, s
 	case VIODEV_MSG_ERROR:
 		log_warnx("%s: device reported error", __func__);
 		break;
+	case VIODEV_MSG_TUNSCAP:
+	{
+		struct tun_capabilities	tcap;
+
+		memset(&tcap, 0, sizeof(tcap));
+		tcap.tun_if_capabilities = msg->data;
+
+		if (ioctl(gdev->vionet.data_fd, TUNSCAP, &tcap) == -1)
+			fatal("%s: tap(4) TUNSCAP", __func__);
+
+		break;
+	}
 	case VIODEV_MSG_INVALID:
 	case VIODEV_MSG_IO_READ:
 	case VIODEV_MSG_IO_WRITE:
Index: usr.sbin/vmd/virtio.h
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/virtio.h,v
diff -u -p -r1.60 virtio.h
--- usr.sbin/vmd/virtio.h	14 Jan 2026 03:09:05 -0000	1.60
+++ usr.sbin/vmd/virtio.h	16 Jan 2026 18:24:50 -0000
@@ -134,6 +134,7 @@ struct viodev_msg {
 #define VIODEV_MSG_IO_WRITE	5
 #define VIODEV_MSG_DUMP		6
 #define VIODEV_MSG_SHUTDOWN	7
+#define VIODEV_MSG_TUNSCAP	8
 
 	uint16_t reg;		/* VirtIO register */
 	uint8_t io_sz;		/* IO instruction size */
@@ -309,6 +310,9 @@ struct virtio_net_hdr {
 	uint16_t padding_reserved;
 	*/
 };
+
+#define VIRTIO_NET_HDR_F_NEEDS_CSUM	1 /* flags */
+#define VIRTIO_NET_HDR_F_DATA_VALID	2 /* flags */
 
 enum vmmci_cmd {
 	VMMCI_NONE = 0,