Download raw body.
vmd: add checksum offload for guests
On Fri, Jan 16, 2026 at 07:38:16PM +0100, Jan Klemkow wrote:
> On Thu, Jan 15, 2026 at 02:08:43PM -0800, Mike Larkin wrote:
> > Does this "just work" no matter what guests I run? That's really all I care
> > about.
>
> Here is my current diff for checksum offloading in vmd(8).
>
> I tested the following combination of features:
>
> - Debian/Linux and OpenBSD-current guests
> - OpenBSD-current vio(4) w/o all offloading features
> - Linux, OpenBSD and Hostsystem via veb(4) and vlan(4)
> - IPv4 and IPv6 with tcpbench(1)
> - local interface locked lladdr
> - local interface dhcp
>
> Further tests are welcome!
>
> ok?
>
> Thanks,
> Jan
>
Not sure about dv@, but I can't really review this. it's hundreds of lines
of changes in vmd vionet that require a level of understanding of tap(4) and
in virtio/vionet (and the network stack in general) that I don't have.
When I did the original vionet in vmd years ago it was pretty straightforward
since the spec (for *all* virtio) was only like 20 pages. I was able to write
that code in a weekend. now that we have bolted on all this other stuff, I
don't feel comfortable giving oks in this area anymore since there is no way
I can look at this and know if it's right or not. I think you need a network
stack person to ok this, *and* explain what the ramifications are for vmd
in general. It looks like vmd is doing inspection of every packet now? I
dont think we want that.
-ml
> Index: sys/kern/kern_pledge.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_pledge.c,v
> diff -u -p -r1.335 kern_pledge.c
> --- sys/kern/kern_pledge.c 13 Nov 2025 20:59:14 -0000 1.335
> +++ sys/kern/kern_pledge.c 16 Jan 2026 18:24:49 -0000
> @@ -46,6 +46,7 @@
> #include <net/route.h>
> #include <net/if.h>
> #include <net/if_var.h>
> +#include <net/if_tun.h>
> #include <netinet/in.h>
> #include <netinet6/in6_var.h>
> #include <netinet6/nd6.h>
> @@ -1337,6 +1338,12 @@ pledge_ioctl(struct proc *p, long com, s
> cdevsw[major(vp->v_rdev)].d_open == vmmopen) {
> error = pledge_ioctl_vmm(p, com);
> if (error == 0)
> + return 0;
> + }
> + if ((fp->f_type == DTYPE_VNODE) &&
> + (vp->v_type == VCHR) &&
> + (cdevsw[major(vp->v_rdev)].d_open == tapopen)) {
> + if (com == TUNSCAP)
> return 0;
> }
> }
> Index: usr.sbin/vmd/vionet.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/vmd/vionet.c,v
> diff -u -p -r1.29 vionet.c
> --- usr.sbin/vmd/vionet.c 14 Jan 2026 03:09:05 -0000 1.29
> +++ usr.sbin/vmd/vionet.c 16 Jan 2026 18:24:50 -0000
> @@ -22,7 +22,12 @@
> #include <dev/pv/virtioreg.h>
>
> #include <net/if.h>
> +#include <net/if_tun.h>
> #include <netinet/in.h>
> +#include <netinet/ip.h>
> +#include <netinet/ip6.h>
> +#include <netinet/tcp.h>
> +#include <netinet/udp.h>
> #include <netinet/if_ether.h>
>
> #include <errno.h>
> @@ -50,6 +55,7 @@
>
> #define VIRTIO_NET_CONFIG_MAC 0 /* 8 bit x 6 byte */
>
> +#define VIRTIO_NET_F_GUEST_CSUM (1 << 1)
> #define VIRTIO_NET_F_MAC (1 << 5)
> #define RXQ 0
> #define TXQ 1
> @@ -65,7 +71,7 @@ static void *rx_run_loop(void *);
> static void *tx_run_loop(void *);
> static int vionet_rx(struct virtio_dev *, int);
> static ssize_t vionet_rx_copy(struct vionet_dev *, int, const struct iovec *,
> - int, size_t);
> + int, size_t, struct tun_hdr *th);
> static ssize_t vionet_rx_zerocopy(struct vionet_dev *, int,
> const struct iovec *, int);
> static void vionet_rx_event(int, short, void *);
> @@ -84,6 +90,10 @@ static void read_pipe_rx(int, short, voi
> static void read_pipe_tx(int, short, void *);
> static void vionet_assert_pic_irq(struct virtio_dev *);
> static void vionet_deassert_pic_irq(struct virtio_dev *);
> +static void vhdr2thdr(struct virtio_net_hdr *, struct tun_hdr *,
> + const struct iovec *, int);
> +static void thdr2vhdr(struct tun_hdr *, struct virtio_net_hdr *,
> + const struct iovec *, int);
>
> /* Device Globals */
> struct event ev_tap;
> @@ -300,6 +310,30 @@ fail:
> }
>
> /*
> + * Update and sync offload features with tap(4).
> + */
> +static void
> +vionet_update_offload(struct virtio_dev *dev)
> +{
> + struct viodev_msg msg;
> + int ret;
> +
> + memset(&msg, 0, sizeof(msg));
> + msg.irq = dev->irq;
> + msg.type = VIODEV_MSG_TUNSCAP;
> +
> + if (dev->driver_feature & VIRTIO_NET_F_GUEST_CSUM) {
> + msg.data |= IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4;
> + msg.data |= IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
> + }
> +
> + ret = imsg_compose_event2(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
> + &msg, sizeof(msg), ev_base_main);
> + if (ret == -1)
> + log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
> +}
> +
> +/*
> * vionet_rx
> *
> * Pull packet from the provided fd and fill the receive-side virtqueue. We
> @@ -321,6 +355,7 @@ vionet_rx(struct virtio_dev *dev, int fd
> struct virtio_net_hdr *hdr = NULL;
> struct virtio_vq_info *vq_info;
> struct iovec *iov;
> + struct tun_hdr th;
> int notify = 0;
> ssize_t sz;
> uint8_t status = 0;
> @@ -351,8 +386,8 @@ vionet_rx(struct virtio_dev *dev, int fd
> goto reset;
> }
>
> - iov = &iov_rx[0];
> - iov_cnt = 1;
> + iov = &iov_rx[1];
> + iov_cnt = 2;
>
> /*
> * First descriptor should be at least as large as the
> @@ -373,7 +408,6 @@ vionet_rx(struct virtio_dev *dev, int fd
> if (iov->iov_base == NULL)
> goto reset;
> hdr = iov->iov_base;
> - memset(hdr, 0, sizeof(struct virtio_net_hdr));
>
> /* Tweak the iovec to account for the virtio_net_hdr. */
> iov->iov_len -= sizeof(struct virtio_net_hdr);
> @@ -418,15 +452,15 @@ vionet_rx(struct virtio_dev *dev, int fd
> goto reset;
> }
>
> - hdr->num_buffers = iov_cnt;
> -
> /*
> * If we're enforcing hardware address or handling an injected
> * packet, we need to use a copy-based approach.
> */
> + iov_rx[0].iov_base = &th;
> + iov_rx[0].iov_len = sizeof(th);
> if (vionet->lockedmac || fd != vionet->data_fd)
> - sz = vionet_rx_copy(vionet, fd, iov_rx, iov_cnt,
> - chain_len);
> + sz = vionet_rx_copy(vionet, fd, iov_rx + 1, iov_cnt - 1,
> + chain_len, &th);
> else
> sz = vionet_rx_zerocopy(vionet, fd, iov_rx, iov_cnt);
> if (sz == -1)
> @@ -434,6 +468,9 @@ vionet_rx(struct virtio_dev *dev, int fd
> if (sz == 0) /* No packets, so bail out for now. */
> break;
>
> + thdr2vhdr(&th, hdr, iov_rx + 1, iov_cnt - 1);
> + hdr->num_buffers = iov_cnt - 1;
> +
> /*
> * Account for the prefixed header since it wasn't included
> * in the copy or zerocopy operations.
> @@ -473,9 +510,9 @@ reset:
> */
> ssize_t
> vionet_rx_copy(struct vionet_dev *dev, int fd, const struct iovec *iov,
> - int iov_cnt, size_t chain_len)
> + int iov_cnt, size_t chain_len, struct tun_hdr *th)
> {
> - static uint8_t buf[VIONET_HARD_MTU];
> + static uint8_t buf[sizeof(struct tun_hdr) + VIONET_HARD_MTU];
> struct packet *pkt = NULL;
> struct ether_header *eh = NULL;
> uint8_t *payload = buf;
> @@ -483,9 +520,10 @@ vionet_rx_copy(struct vionet_dev *dev, i
> ssize_t sz;
>
> /* If reading from the tap(4), try to right-size the read. */
> - if (fd == dev->data_fd)
> - nbytes = MIN(chain_len, VIONET_HARD_MTU);
> - else if (fd == pipe_inject[READ])
> + if (fd == dev->data_fd) {
> + nbytes = sizeof(struct tun_hdr) +
> + MIN(chain_len, VIONET_HARD_MTU);
> + } else if (fd == pipe_inject[READ])
> nbytes = sizeof(struct packet);
> else {
> log_warnx("%s: invalid fd: %d", __func__, fd);
> @@ -504,10 +542,20 @@ vionet_rx_copy(struct vionet_dev *dev, i
> return (-1);
> }
> return (0);
> - } else if (fd == dev->data_fd && sz < VIONET_MIN_TXLEN) {
> + } else if (fd == dev->data_fd) {
> + if ((size_t)sz < sizeof(struct tun_hdr)) {
> + log_warnx("%s: short tun_hdr", __func__);
> + return (0);
> + }
> + memcpy(th, payload, sizeof *th);
> + payload += sizeof(struct tun_hdr);
> + sz -= sizeof(struct tun_hdr);
> +
> /* If reading the tap(4), we should get valid ethernet. */
> - log_warnx("%s: invalid packet size", __func__);
> - return (0);
> + if (sz < VIONET_MIN_TXLEN) {
> + log_warnx("%s: invalid packet size", __func__);
> + return (0);
> + }
> } else if (fd == pipe_inject[READ] && sz != sizeof(struct packet)) {
> log_warnx("%s: invalid injected packet object (sz=%ld)",
> __func__, sz);
> @@ -526,6 +574,7 @@ vionet_rx_copy(struct vionet_dev *dev, i
> log_warnx("%s: invalid injected packet size", __func__);
> goto drop;
> }
> + memset(th, 0, sizeof *th);
> payload = pkt->buf;
> sz = (ssize_t)pkt->len;
> }
> @@ -585,6 +634,12 @@ vionet_rx_zerocopy(struct vionet_dev *de
> sz = readv(fd, iov, iov_cnt);
> if (sz == -1 && errno == EAGAIN)
> return (0);
> +
> + if ((size_t)sz < sizeof(struct tun_hdr))
> + return (0);
> +
> + sz -= sizeof(struct tun_hdr);
> +
> return (sz);
> }
>
> @@ -666,6 +721,8 @@ vionet_tx(struct virtio_dev *dev)
> struct iovec *iov;
> struct packet pkt;
> uint8_t status = 0;
> + struct virtio_net_hdr *vhp;
> + struct tun_hdr th;
>
> status = dev->status & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK;
> if (status != VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) {
> @@ -692,8 +749,10 @@ vionet_tx(struct virtio_dev *dev)
> goto reset;
> }
>
> - iov = &iov_tx[0];
> - iov_cnt = 0;
> + /* the 0th slot will by used by the tun_hdr */
> +
> + iov = &iov_tx[1];
> + iov_cnt = 1;
> chain_len = 0;
>
> /*
> @@ -704,13 +763,16 @@ vionet_tx(struct virtio_dev *dev)
> log_warnx("%s: invalid descriptor length", __func__);
> goto reset;
> }
> - iov->iov_len = desc->len;
>
> - if (iov->iov_len > sizeof(struct virtio_net_hdr)) {
> - /* Chop off the virtio header, leaving packet data. */
> - iov->iov_len -= sizeof(struct virtio_net_hdr);
> - iov->iov_base = hvaddr_mem(desc->addr +
> - sizeof(struct virtio_net_hdr), iov->iov_len);
> + /* Chop the virtio net header off */
> + vhp = hvaddr_mem(desc->addr, sizeof(*vhp));
> + if (vhp == NULL)
> + goto reset;
> +
> + iov->iov_len = desc->len - sizeof(*vhp);
> + if (iov->iov_len > 0) {
> + iov->iov_base = hvaddr_mem(desc->addr + sizeof(*vhp),
> + iov->iov_len);
> if (iov->iov_base == NULL)
> goto reset;
>
> @@ -758,7 +820,7 @@ vionet_tx(struct virtio_dev *dev)
> * descriptor with packet data contains a large enough buffer
> * for this inspection.
> */
> - iov = &iov_tx[0];
> + iov = &iov_tx[1];
> if (vionet->lockedmac) {
> if (iov->iov_len < ETHER_HDR_LEN) {
> log_warnx("%s: insufficient header data",
> @@ -784,6 +846,15 @@ vionet_tx(struct virtio_dev *dev)
> }
> }
>
> + /*
> + * if we look at more of vhp we might need to copy
> + * it so it's aligned properly
> + */
> + vhdr2thdr(vhp, &th, iov_tx + 1, iov_cnt - 1);
> +
> + iov_tx[0].iov_base = &th;
> + iov_tx[0].iov_len = sizeof(th);
> +
> /* Write our packet to the tap(4). */
> sz = writev(vionet->data_fd, iov_tx, iov_cnt);
> if (sz == -1 && errno != ENOBUFS) {
> @@ -1114,6 +1185,7 @@ vionet_cfg_write(struct virtio_dev *dev,
> dev->driver_feature &= dev->device_feature;
> DPRINTF("%s: driver features 0x%llx", __func__,
> dev->driver_feature);
> + vionet_update_offload(dev);
> break;
> case VIO1_PCI_CONFIG_MSIX_VECTOR:
> /* Ignore until we support MSIX. */
> @@ -1555,6 +1627,155 @@ vionet_assert_pic_irq(struct virtio_dev
> &msg, sizeof(msg), ev_base_main);
> if (ret == -1)
> log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
> +}
> +
> +static int
> +memcpyv(void *buf, size_t len, size_t off, const struct iovec *iov, int iovcnt)
> +{
> + uint8_t *dst = buf;
> + size_t l;
> +
> + for (;;) {
> + if (iovcnt == 0)
> + return (-1);
> +
> + if (off < iov->iov_len)
> + break;
> +
> + off -= iov->iov_len;
> + iov++;
> + iovcnt--;
> + }
> +
> + l = off + len;
> + if (l > iov->iov_len)
> + l = iov->iov_len;
> + l -= off;
> +
> + memcpy(dst, (const uint8_t *)iov->iov_base + off, l);
> + dst += l;
> + len -= l;
> +
> + if (len == 0)
> + return (0);
> +
> + for (;;) {
> + if (iovcnt == 0)
> + return (-1);
> +
> + l = len;
> + if (l > iov->iov_len)
> + l = iov->iov_len;
> +
> + memcpy(dst, (const uint8_t *)iov->iov_base, l);
> + dst += l;
> + len -= l;
> +
> + if (len == 0)
> + break;
> +
> + iov++;
> + iovcnt--;
> + }
> +
> + return (0);
> +}
> +
> +static void
> +hdr_extract(const struct iovec *iov, int iovcnt, size_t *off, uint8_t *proto)
> +{
> + size_t offs;
> + uint16_t etype;
> +
> + if (memcpyv(&etype, sizeof(etype),
> + offsetof(struct ether_header, ether_type),
> + iov, iovcnt) == -1)
> + return;
> +
> + *off = sizeof(struct ether_header);
> +
> + if (etype == htons(ETHERTYPE_VLAN)) {
> + if (memcpyv(&etype, sizeof(etype),
> + offsetof(struct ether_vlan_header, evl_proto),
> + iov, iovcnt) == -1)
> + return;
> +
> + *off = sizeof(struct ether_vlan_header);
> + }
> +
> + if (etype == htons(ETHERTYPE_IP)) {
> + uint8_t hl;
> +
> + /* Get ipproto field from IP header. */
> + offs = *off + offsetof(struct ip, ip_p);
> + if (memcpyv(proto, sizeof(*proto), offs, iov, iovcnt) == -1)
> + return;
> +
> + /* Get IP header length field from IP header. */
> + offs = *off;
> + if (memcpyv(&hl, sizeof(hl), offs, iov, iovcnt) == -1)
> + return;
> +
> + *off += (hl & 0x0f) << 2;
> + } else if (etype == htons(ETHERTYPE_IPV6)) {
> + /* Get next header field from IP header. */
> + offs = *off + offsetof(struct ip6_hdr, ip6_nxt);
> + if (memcpyv(proto, sizeof(*proto), offs, iov, iovcnt) == -1)
> + return;
> +
> + *off += sizeof(struct ip6_hdr);
> + }
> +}
> +
> +static void
> +vhdr2thdr(struct virtio_net_hdr *vh, struct tun_hdr *th,
> + const struct iovec *iov, int iovcnt)
> +{
> + memset(th, 0, sizeof(*th));
> +
> + if (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
> + size_t off;
> + uint8_t proto;
> +
> + hdr_extract(iov, iovcnt, &off, &proto);
> +
> + switch (proto) {
> + case IPPROTO_TCP:
> + th->th_flags |= TUN_H_TCP_CSUM;
> + break;
> +
> + case IPPROTO_UDP:
> + th->th_flags |= TUN_H_UDP_CSUM;
> + break;
> + }
> + }
> +}
> +
> +static void
> +thdr2vhdr(struct tun_hdr *th, struct virtio_net_hdr *vh,
> + const struct iovec *iov, int iovcnt)
> +{
> + size_t off;
> + uint8_t proto;
> +
> + memset(vh, 0, sizeof(*vh));
> +
> + if (th->th_flags & (TUN_H_TCP_CSUM | TUN_H_UDP_CSUM)) {
> + hdr_extract(iov, iovcnt, &off, &proto);
> +
> + vh->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
> + vh->csum_start = off;
> +
> + switch (proto) {
> + case IPPROTO_TCP:
> + vh->csum_offset = offsetof(struct tcphdr, th_sum);
> + break;
> +
> + case IPPROTO_UDP:
> + vh->csum_offset = offsetof(struct udphdr, uh_sum);
> + break;
> + }
> + }
> }
>
> /*
> Index: usr.sbin/vmd/virtio.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v
> diff -u -p -r1.134 virtio.c
> --- usr.sbin/vmd/virtio.c 14 Jan 2026 03:09:05 -0000 1.134
> +++ usr.sbin/vmd/virtio.c 16 Jan 2026 18:24:50 -0000
> @@ -19,6 +19,7 @@
> #include <sys/param.h> /* PAGE_SIZE */
> #include <sys/socket.h>
> #include <sys/wait.h>
> +#include <sys/ioctl.h>
>
> #include <dev/pci/pcireg.h>
> #include <dev/pci/pcidevs.h>
> @@ -28,6 +29,7 @@
> #include <dev/vmm/vmm.h>
>
> #include <net/if.h>
> +#include <net/if_tun.h>
> #include <netinet/in.h>
> #include <netinet/if_ether.h>
>
> @@ -64,6 +66,8 @@ SLIST_HEAD(virtio_dev_head, virtio_dev)
>
> #define MAXPHYS (64 * 1024) /* max raw I/O transfer size */
>
> +#define VIRTIO_NET_F_CSUM (1<<0)
> +#define VIRTIO_NET_F_GUEST_CSUM (1<<1)
> #define VIRTIO_NET_F_MAC (1<<5)
>
> #define VMMCI_F_TIMESYNC (1<<0)
> @@ -1020,6 +1024,8 @@ virtio_init(struct vmd_vm *vm, int child
> /* Virtio 1.x Network Devices */
> if (vmc->vmc_nnics > 0) {
> for (i = 0; i < vmc->vmc_nnics; i++) {
> + struct tun_capabilities tcap;
> +
> dev = malloc(sizeof(struct virtio_dev));
> if (dev == NULL) {
> log_warn("calloc failure allocating vionet");
> @@ -1034,7 +1040,8 @@ virtio_init(struct vmd_vm *vm, int child
> }
> virtio_dev_init(vm, dev, id, VIONET_QUEUE_SIZE_DEFAULT,
> VIRTIO_NET_QUEUES,
> - (VIRTIO_NET_F_MAC | VIRTIO_F_VERSION_1));
> + (VIRTIO_NET_F_MAC | VIRTIO_NET_F_CSUM |
> + VIRTIO_NET_F_GUEST_CSUM | VIRTIO_F_VERSION_1));
>
> if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
> dev) == -1) {
> @@ -1056,6 +1063,14 @@ virtio_init(struct vmd_vm *vm, int child
> dev->vmm_id = vm->vm_vmmid;
> dev->vionet.data_fd = child_taps[i];
>
> + /*
> + * IFCAPs are tweaked after feature negotiation with
> + * the guest later.
> + */
> + memset(&tcap, 0, sizeof(tcap));
> + if (ioctl(dev->vionet.data_fd, TUNSCAP, &tcap) == -1)
> + fatal("tap(4) TUNSCAP");
> +
> /* MAC address has been assigned by the parent */
> memcpy(&dev->vionet.mac, &vmc->vmc_macs[i], 6);
> dev->vionet.lockedmac =
> @@ -1532,10 +1547,12 @@ virtio_dev_launch(struct vmd_vm *vm, str
> }
>
> /* Close data fds. Only the child device needs them now. */
> - if (virtio_dev_closefds(dev) == -1) {
> - log_warnx("%s: failed to close device data fds",
> - __func__);
> - goto err;
> + if (dev->dev_type != VMD_DEVTYPE_NET) {
> + if (virtio_dev_closefds(dev) == -1) {
> + log_warnx("%s: failed to close device data fds",
> + __func__);
> + goto err;
> + }
> }
>
> /* 2. Send over details on the VM (including memory fds). */
> @@ -1758,6 +1775,18 @@ handle_dev_msg(struct viodev_msg *msg, s
> case VIODEV_MSG_ERROR:
> log_warnx("%s: device reported error", __func__);
> break;
> + case VIODEV_MSG_TUNSCAP:
> + {
> + struct tun_capabilities tcap;
> +
> + memset(&tcap, 0, sizeof(tcap));
> + tcap.tun_if_capabilities = msg->data;
> +
> + if (ioctl(gdev->vionet.data_fd, TUNSCAP, &tcap) == -1)
> + fatal("%s: tap(4) TUNSCAP", __func__);
> +
> + break;
> + }
> case VIODEV_MSG_INVALID:
> case VIODEV_MSG_IO_READ:
> case VIODEV_MSG_IO_WRITE:
> Index: usr.sbin/vmd/virtio.h
> ===================================================================
> RCS file: /cvs/src/usr.sbin/vmd/virtio.h,v
> diff -u -p -r1.60 virtio.h
> --- usr.sbin/vmd/virtio.h 14 Jan 2026 03:09:05 -0000 1.60
> +++ usr.sbin/vmd/virtio.h 16 Jan 2026 18:24:50 -0000
> @@ -134,6 +134,7 @@ struct viodev_msg {
> #define VIODEV_MSG_IO_WRITE 5
> #define VIODEV_MSG_DUMP 6
> #define VIODEV_MSG_SHUTDOWN 7
> +#define VIODEV_MSG_TUNSCAP 8
>
> uint16_t reg; /* VirtIO register */
> uint8_t io_sz; /* IO instruction size */
> @@ -309,6 +310,9 @@ struct virtio_net_hdr {
> uint16_t padding_reserved;
> */
> };
> +
> +#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* flags */
> +#define VIRTIO_NET_HDR_F_DATA_VALID 2 /* flags */
>
> enum vmmci_cmd {
> VMMCI_NONE = 0,
vmd: add checksum offload for guests