From: Jan Klemkow Subject: Re: vmd: add checksum offload for guests To: Mike Larkin Cc: David Gwynne, Dave Voutila, Klemens Nanni, Alexander Bluhm, tech@openbsd.org Date: Fri, 16 Jan 2026 19:38:16 +0100 On Thu, Jan 15, 2026 at 02:08:43PM -0800, Mike Larkin wrote: > Does this "just work" no matter what guests I run? That's really all I care > about. Here is my current diff for checksum offloading in vmd(8). I tested the following combinations of features: - Debian/Linux and OpenBSD-current guests - OpenBSD-current vio(4) w/o all offloading features - Linux, OpenBSD and host system via veb(4) and vlan(4) - IPv4 and IPv6 with tcpbench(1) - local interface with locked lladdr - local interface with DHCP Further tests are welcome! ok? Thanks, Jan Index: sys/kern/kern_pledge.c =================================================================== RCS file: /cvs/src/sys/kern/kern_pledge.c,v diff -u -p -r1.335 kern_pledge.c --- sys/kern/kern_pledge.c 13 Nov 2025 20:59:14 -0000 1.335 +++ sys/kern/kern_pledge.c 16 Jan 2026 18:24:49 -0000 @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -1337,6 +1338,12 @@ pledge_ioctl(struct proc *p, long com, s cdevsw[major(vp->v_rdev)].d_open == vmmopen) { error = pledge_ioctl_vmm(p, com); if (error == 0) + return 0; + } + if ((fp->f_type == DTYPE_VNODE) && + (vp->v_type == VCHR) && + (cdevsw[major(vp->v_rdev)].d_open == tapopen)) { + if (com == TUNSCAP) return 0; } } Index: usr.sbin/vmd/vionet.c =================================================================== RCS file: /cvs/src/usr.sbin/vmd/vionet.c,v diff -u -p -r1.29 vionet.c --- usr.sbin/vmd/vionet.c 14 Jan 2026 03:09:05 -0000 1.29 +++ usr.sbin/vmd/vionet.c 16 Jan 2026 18:24:50 -0000 @@ -22,7 +22,12 @@ #include #include +#include #include +#include +#include +#include +#include #include #include @@ -50,6 +55,7 @@ #define VIRTIO_NET_CONFIG_MAC 0 /* 8 bit x 6 byte */ +#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) #define VIRTIO_NET_F_MAC (1 << 5) #define RXQ 0 
#define TXQ 1 @@ -65,7 +71,7 @@ static void *rx_run_loop(void *); static void *tx_run_loop(void *); static int vionet_rx(struct virtio_dev *, int); static ssize_t vionet_rx_copy(struct vionet_dev *, int, const struct iovec *, - int, size_t); + int, size_t, struct tun_hdr *th); static ssize_t vionet_rx_zerocopy(struct vionet_dev *, int, const struct iovec *, int); static void vionet_rx_event(int, short, void *); @@ -84,6 +90,10 @@ static void read_pipe_rx(int, short, voi static void read_pipe_tx(int, short, void *); static void vionet_assert_pic_irq(struct virtio_dev *); static void vionet_deassert_pic_irq(struct virtio_dev *); +static void vhdr2thdr(struct virtio_net_hdr *, struct tun_hdr *, + const struct iovec *, int); +static void thdr2vhdr(struct tun_hdr *, struct virtio_net_hdr *, + const struct iovec *, int); /* Device Globals */ struct event ev_tap; @@ -300,6 +310,30 @@ fail: } /* + * Update and sync offload features with tap(4). + */ +static void +vionet_update_offload(struct virtio_dev *dev) +{ + struct viodev_msg msg; + int ret; + + memset(&msg, 0, sizeof(msg)); + msg.irq = dev->irq; + msg.type = VIODEV_MSG_TUNSCAP; + + if (dev->driver_feature & VIRTIO_NET_F_GUEST_CSUM) { + msg.data |= IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4; + msg.data |= IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6; + } + + ret = imsg_compose_event2(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1, + &msg, sizeof(msg), ev_base_main); + if (ret == -1) + log_warnx("%s: failed to assert irq %d", __func__, dev->irq); +} + +/* * vionet_rx * * Pull packet from the provided fd and fill the receive-side virtqueue. 
We @@ -321,6 +355,7 @@ vionet_rx(struct virtio_dev *dev, int fd struct virtio_net_hdr *hdr = NULL; struct virtio_vq_info *vq_info; struct iovec *iov; + struct tun_hdr th; int notify = 0; ssize_t sz; uint8_t status = 0; @@ -351,8 +386,8 @@ vionet_rx(struct virtio_dev *dev, int fd goto reset; } - iov = &iov_rx[0]; - iov_cnt = 1; + iov = &iov_rx[1]; + iov_cnt = 2; /* * First descriptor should be at least as large as the @@ -373,7 +408,6 @@ vionet_rx(struct virtio_dev *dev, int fd if (iov->iov_base == NULL) goto reset; hdr = iov->iov_base; - memset(hdr, 0, sizeof(struct virtio_net_hdr)); /* Tweak the iovec to account for the virtio_net_hdr. */ iov->iov_len -= sizeof(struct virtio_net_hdr); @@ -418,15 +452,15 @@ vionet_rx(struct virtio_dev *dev, int fd goto reset; } - hdr->num_buffers = iov_cnt; - /* * If we're enforcing hardware address or handling an injected * packet, we need to use a copy-based approach. */ + iov_rx[0].iov_base = &th; + iov_rx[0].iov_len = sizeof(th); if (vionet->lockedmac || fd != vionet->data_fd) - sz = vionet_rx_copy(vionet, fd, iov_rx, iov_cnt, - chain_len); + sz = vionet_rx_copy(vionet, fd, iov_rx + 1, iov_cnt - 1, + chain_len, &th); else sz = vionet_rx_zerocopy(vionet, fd, iov_rx, iov_cnt); if (sz == -1) @@ -434,6 +468,9 @@ vionet_rx(struct virtio_dev *dev, int fd if (sz == 0) /* No packets, so bail out for now. */ break; + thdr2vhdr(&th, hdr, iov_rx + 1, iov_cnt - 1); + hdr->num_buffers = iov_cnt - 1; + /* * Account for the prefixed header since it wasn't included * in the copy or zerocopy operations. 
@@ -473,9 +510,9 @@ reset: */ ssize_t vionet_rx_copy(struct vionet_dev *dev, int fd, const struct iovec *iov, - int iov_cnt, size_t chain_len) + int iov_cnt, size_t chain_len, struct tun_hdr *th) { - static uint8_t buf[VIONET_HARD_MTU]; + static uint8_t buf[sizeof(struct tun_hdr) + VIONET_HARD_MTU]; struct packet *pkt = NULL; struct ether_header *eh = NULL; uint8_t *payload = buf; @@ -483,9 +520,10 @@ vionet_rx_copy(struct vionet_dev *dev, i ssize_t sz; /* If reading from the tap(4), try to right-size the read. */ - if (fd == dev->data_fd) - nbytes = MIN(chain_len, VIONET_HARD_MTU); - else if (fd == pipe_inject[READ]) + if (fd == dev->data_fd) { + nbytes = sizeof(struct tun_hdr) + + MIN(chain_len, VIONET_HARD_MTU); + } else if (fd == pipe_inject[READ]) nbytes = sizeof(struct packet); else { log_warnx("%s: invalid fd: %d", __func__, fd); @@ -504,10 +542,20 @@ vionet_rx_copy(struct vionet_dev *dev, i return (-1); } return (0); - } else if (fd == dev->data_fd && sz < VIONET_MIN_TXLEN) { + } else if (fd == dev->data_fd) { + if ((size_t)sz < sizeof(struct tun_hdr)) { + log_warnx("%s: short tun_hdr", __func__); + return (0); + } + memcpy(th, payload, sizeof *th); + payload += sizeof(struct tun_hdr); + sz -= sizeof(struct tun_hdr); + /* If reading the tap(4), we should get valid ethernet. 
*/ - log_warnx("%s: invalid packet size", __func__); - return (0); + if (sz < VIONET_MIN_TXLEN) { + log_warnx("%s: invalid packet size", __func__); + return (0); + } } else if (fd == pipe_inject[READ] && sz != sizeof(struct packet)) { log_warnx("%s: invalid injected packet object (sz=%ld)", __func__, sz); @@ -526,6 +574,7 @@ vionet_rx_copy(struct vionet_dev *dev, i log_warnx("%s: invalid injected packet size", __func__); goto drop; } + memset(th, 0, sizeof *th); payload = pkt->buf; sz = (ssize_t)pkt->len; } @@ -585,6 +634,12 @@ vionet_rx_zerocopy(struct vionet_dev *de sz = readv(fd, iov, iov_cnt); if (sz == -1 && errno == EAGAIN) return (0); + + if ((size_t)sz < sizeof(struct tun_hdr)) + return (0); + + sz -= sizeof(struct tun_hdr); + return (sz); } @@ -666,6 +721,8 @@ vionet_tx(struct virtio_dev *dev) struct iovec *iov; struct packet pkt; uint8_t status = 0; + struct virtio_net_hdr *vhp; + struct tun_hdr th; status = dev->status & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK; if (status != VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) { @@ -692,8 +749,10 @@ vionet_tx(struct virtio_dev *dev) goto reset; } - iov = &iov_tx[0]; - iov_cnt = 0; + /* the 0th slot will by used by the tun_hdr */ + + iov = &iov_tx[1]; + iov_cnt = 1; chain_len = 0; /* @@ -704,13 +763,16 @@ vionet_tx(struct virtio_dev *dev) log_warnx("%s: invalid descriptor length", __func__); goto reset; } - iov->iov_len = desc->len; - if (iov->iov_len > sizeof(struct virtio_net_hdr)) { - /* Chop off the virtio header, leaving packet data. 
*/ - iov->iov_len -= sizeof(struct virtio_net_hdr); - iov->iov_base = hvaddr_mem(desc->addr + - sizeof(struct virtio_net_hdr), iov->iov_len); + /* Chop the virtio net header off */ + vhp = hvaddr_mem(desc->addr, sizeof(*vhp)); + if (vhp == NULL) + goto reset; + + iov->iov_len = desc->len - sizeof(*vhp); + if (iov->iov_len > 0) { + iov->iov_base = hvaddr_mem(desc->addr + sizeof(*vhp), + iov->iov_len); if (iov->iov_base == NULL) goto reset; @@ -758,7 +820,7 @@ vionet_tx(struct virtio_dev *dev) * descriptor with packet data contains a large enough buffer * for this inspection. */ - iov = &iov_tx[0]; + iov = &iov_tx[1]; if (vionet->lockedmac) { if (iov->iov_len < ETHER_HDR_LEN) { log_warnx("%s: insufficient header data", @@ -784,6 +846,15 @@ vionet_tx(struct virtio_dev *dev) } } + /* + * if we look at more of vhp we might need to copy + * it so it's aligned properly + */ + vhdr2thdr(vhp, &th, iov_tx + 1, iov_cnt - 1); + + iov_tx[0].iov_base = &th; + iov_tx[0].iov_len = sizeof(th); + /* Write our packet to the tap(4). */ sz = writev(vionet->data_fd, iov_tx, iov_cnt); if (sz == -1 && errno != ENOBUFS) { @@ -1114,6 +1185,7 @@ vionet_cfg_write(struct virtio_dev *dev, dev->driver_feature &= dev->device_feature; DPRINTF("%s: driver features 0x%llx", __func__, dev->driver_feature); + vionet_update_offload(dev); break; case VIO1_PCI_CONFIG_MSIX_VECTOR: /* Ignore until we support MSIX. 
*/ @@ -1555,6 +1627,155 @@ vionet_assert_pic_irq(struct virtio_dev &msg, sizeof(msg), ev_base_main); if (ret == -1) log_warnx("%s: failed to assert irq %d", __func__, dev->irq); +} + +static int +memcpyv(void *buf, size_t len, size_t off, const struct iovec *iov, int iovcnt) +{ + uint8_t *dst = buf; + size_t l; + + for (;;) { + if (iovcnt == 0) + return (-1); + + if (off < iov->iov_len) + break; + + off -= iov->iov_len; + iov++; + iovcnt--; + } + + l = off + len; + if (l > iov->iov_len) + l = iov->iov_len; + l -= off; + + memcpy(dst, (const uint8_t *)iov->iov_base + off, l); + dst += l; + len -= l; + + if (len == 0) + return (0); + + for (;;) { + if (iovcnt == 0) + return (-1); + + l = len; + if (l > iov->iov_len) + l = iov->iov_len; + + memcpy(dst, (const uint8_t *)iov->iov_base, l); + dst += l; + len -= l; + + if (len == 0) + break; + + iov++; + iovcnt--; + } + + return (0); +} + +static void +hdr_extract(const struct iovec *iov, int iovcnt, size_t *off, uint8_t *proto) +{ + size_t offs; + uint16_t etype; + + if (memcpyv(&etype, sizeof(etype), + offsetof(struct ether_header, ether_type), + iov, iovcnt) == -1) + return; + + *off = sizeof(struct ether_header); + + if (etype == htons(ETHERTYPE_VLAN)) { + if (memcpyv(&etype, sizeof(etype), + offsetof(struct ether_vlan_header, evl_proto), + iov, iovcnt) == -1) + return; + + *off = sizeof(struct ether_vlan_header); + } + + if (etype == htons(ETHERTYPE_IP)) { + uint8_t hl; + + /* Get ipproto field from IP header. */ + offs = *off + offsetof(struct ip, ip_p); + if (memcpyv(proto, sizeof(*proto), offs, iov, iovcnt) == -1) + return; + + /* Get IP header length field from IP header. */ + offs = *off; + if (memcpyv(&hl, sizeof(hl), offs, iov, iovcnt) == -1) + return; + + *off += (hl & 0x0f) << 2; + } else if (etype == htons(ETHERTYPE_IPV6)) { + /* Get next header field from IP header. 
*/ + offs = *off + offsetof(struct ip6_hdr, ip6_nxt); + if (memcpyv(proto, sizeof(*proto), offs, iov, iovcnt) == -1) + return; + + *off += sizeof(struct ip6_hdr); + } +} + +static void +vhdr2thdr(struct virtio_net_hdr *vh, struct tun_hdr *th, + const struct iovec *iov, int iovcnt) +{ + memset(th, 0, sizeof(*th)); + + if (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { + size_t off; + uint8_t proto; + + hdr_extract(iov, iovcnt, &off, &proto); + + switch (proto) { + case IPPROTO_TCP: + th->th_flags |= TUN_H_TCP_CSUM; + break; + + case IPPROTO_UDP: + th->th_flags |= TUN_H_UDP_CSUM; + break; + } + } +} + +static void +thdr2vhdr(struct tun_hdr *th, struct virtio_net_hdr *vh, + const struct iovec *iov, int iovcnt) +{ + size_t off; + uint8_t proto; + + memset(vh, 0, sizeof(*vh)); + + if (th->th_flags & (TUN_H_TCP_CSUM | TUN_H_UDP_CSUM)) { + hdr_extract(iov, iovcnt, &off, &proto); + + vh->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM; + vh->csum_start = off; + + switch (proto) { + case IPPROTO_TCP: + vh->csum_offset = offsetof(struct tcphdr, th_sum); + break; + + case IPPROTO_UDP: + vh->csum_offset = offsetof(struct udphdr, uh_sum); + break; + } + } } /* Index: usr.sbin/vmd/virtio.c =================================================================== RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v diff -u -p -r1.134 virtio.c --- usr.sbin/vmd/virtio.c 14 Jan 2026 03:09:05 -0000 1.134 +++ usr.sbin/vmd/virtio.c 16 Jan 2026 18:24:50 -0000 @@ -19,6 +19,7 @@ #include /* PAGE_SIZE */ #include #include +#include #include #include @@ -28,6 +29,7 @@ #include #include +#include #include #include @@ -64,6 +66,8 @@ SLIST_HEAD(virtio_dev_head, virtio_dev) #define MAXPHYS (64 * 1024) /* max raw I/O transfer size */ +#define VIRTIO_NET_F_CSUM (1<<0) +#define VIRTIO_NET_F_GUEST_CSUM (1<<1) #define VIRTIO_NET_F_MAC (1<<5) #define VMMCI_F_TIMESYNC (1<<0) @@ -1020,6 +1024,8 @@ virtio_init(struct vmd_vm *vm, int child /* Virtio 1.x Network Devices */ if (vmc->vmc_nnics > 0) { for (i = 0; i < vmc->vmc_nnics; i++) 
{ + struct tun_capabilities tcap; + dev = malloc(sizeof(struct virtio_dev)); if (dev == NULL) { log_warn("calloc failure allocating vionet"); @@ -1034,7 +1040,8 @@ virtio_init(struct vmd_vm *vm, int child } virtio_dev_init(vm, dev, id, VIONET_QUEUE_SIZE_DEFAULT, VIRTIO_NET_QUEUES, - (VIRTIO_NET_F_MAC | VIRTIO_F_VERSION_1)); + (VIRTIO_NET_F_MAC | VIRTIO_NET_F_CSUM | + VIRTIO_NET_F_GUEST_CSUM | VIRTIO_F_VERSION_1)); if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io, dev) == -1) { @@ -1056,6 +1063,14 @@ virtio_init(struct vmd_vm *vm, int child dev->vmm_id = vm->vm_vmmid; dev->vionet.data_fd = child_taps[i]; + /* + * IFCAPs are tweaked after feature negotiation with + * the guest later. + */ + memset(&tcap, 0, sizeof(tcap)); + if (ioctl(dev->vionet.data_fd, TUNSCAP, &tcap) == -1) + fatal("tap(4) TUNSCAP"); + /* MAC address has been assigned by the parent */ memcpy(&dev->vionet.mac, &vmc->vmc_macs[i], 6); dev->vionet.lockedmac = @@ -1532,10 +1547,12 @@ virtio_dev_launch(struct vmd_vm *vm, str } /* Close data fds. Only the child device needs them now. */ - if (virtio_dev_closefds(dev) == -1) { - log_warnx("%s: failed to close device data fds", - __func__); - goto err; + if (dev->dev_type != VMD_DEVTYPE_NET) { + if (virtio_dev_closefds(dev) == -1) { + log_warnx("%s: failed to close device data fds", + __func__); + goto err; + } } /* 2. Send over details on the VM (including memory fds). 
*/ @@ -1758,6 +1775,18 @@ handle_dev_msg(struct viodev_msg *msg, s case VIODEV_MSG_ERROR: log_warnx("%s: device reported error", __func__); break; + case VIODEV_MSG_TUNSCAP: + { + struct tun_capabilities tcap; + + memset(&tcap, 0, sizeof(tcap)); + tcap.tun_if_capabilities = msg->data; + + if (ioctl(gdev->vionet.data_fd, TUNSCAP, &tcap) == -1) + fatal("%s: tap(4) TUNSCAP", __func__); + + break; + } case VIODEV_MSG_INVALID: case VIODEV_MSG_IO_READ: case VIODEV_MSG_IO_WRITE: Index: usr.sbin/vmd/virtio.h =================================================================== RCS file: /cvs/src/usr.sbin/vmd/virtio.h,v diff -u -p -r1.60 virtio.h --- usr.sbin/vmd/virtio.h 14 Jan 2026 03:09:05 -0000 1.60 +++ usr.sbin/vmd/virtio.h 16 Jan 2026 18:24:50 -0000 @@ -134,6 +134,7 @@ struct viodev_msg { #define VIODEV_MSG_IO_WRITE 5 #define VIODEV_MSG_DUMP 6 #define VIODEV_MSG_SHUTDOWN 7 +#define VIODEV_MSG_TUNSCAP 8 uint16_t reg; /* VirtIO register */ uint8_t io_sz; /* IO instruction size */ @@ -309,6 +310,9 @@ struct virtio_net_hdr { uint16_t padding_reserved; */ }; + +#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* flags */ +#define VIRTIO_NET_HDR_F_DATA_VALID 2 /* flags */ enum vmmci_cmd { VMMCI_NONE = 0,