Index | Thread | Search

From:
Dave Voutila <dv@sisu.io>
Subject:
Re: vmd: add checksum offload for guests
To:
Jan Klemkow <jan@openbsd.org>
Cc:
Klemens Nanni <kn@openbsd.org>, Alexander Bluhm <bluhm@openbsd.org>, tech@openbsd.org
Date:
Wed, 14 Jan 2026 14:21:18 -0500

Download raw body.

Thread
Jan Klemkow <jan@openbsd.org> writes:

> On Sat, May 24, 2025 at 06:14:38AM +0000, Klemens Nanni wrote:
>> 24.05.2025 06:33, Jan Klemkow пишет:
>> Still breaks:
>>
>> May 24 09:12:25 atar vmd[44493]: vionet_tx: bad source address 22:8d:47:b5:88:f6
>> May 24 09:12:56 atar last message repeated 25 time
>>
>> Linux VM is completely offline.
>
> There was a bug in the csum_start and csum_offset calculation which is fixed in
> the following diff.  I tested it successfully with Debian/Linux and OpenBSD
> guests.
>
> This diff introduces optional checksum offloading for VMM guests.
>
> Tests are welcome.
>
> ok?
>

Questions in line about pledge changes.

Other question is broader: why the need for memory copying with this
offload feature? I don't know how this offload works, but adding in more
memcpy never seems like an improvement for efficiency. Does the tun
header stuff not have some requirement to not be fragmented across
buffers?

> Thanks,
> Jan
>
> Index: sys/kern/kern_pledge.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_pledge.c,v
> diff -u -p -r1.335 kern_pledge.c
> --- sys/kern/kern_pledge.c	13 Nov 2025 20:59:14 -0000	1.335
> +++ sys/kern/kern_pledge.c	14 Jan 2026 17:25:57 -0000
> @@ -46,6 +46,7 @@
>  #include <net/route.h>
>  #include <net/if.h>
>  #include <net/if_var.h>
> +#include <net/if_tun.h>
>  #include <netinet/in.h>
>  #include <netinet6/in6_var.h>
>  #include <netinet6/nd6.h>
> @@ -1337,6 +1338,12 @@ pledge_ioctl(struct proc *p, long com, s
>  		    cdevsw[major(vp->v_rdev)].d_open == vmmopen) {
>  			error = pledge_ioctl_vmm(p, com);
>  			if (error == 0)
> +				return 0;
> +		}
> +		if ((fp->f_type == DTYPE_VNODE) &&
> +		    (vp->v_type == VCHR) &&
> +		    (cdevsw[major(vp->v_rdev)].d_open == tapopen)) {
> +			if (com == TUNSCAP)
>  				return 0;
>  		}

The diff scope here isn't showing the actual logic change: this adds
capabilities to the "vmm" pledge. Can this be something specific to
tap/tun devices or something related? See below for more commentary.

>  	}
> Index: usr.sbin/vmd/vionet.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/vmd/vionet.c,v
> diff -u -p -r1.29 vionet.c
> --- usr.sbin/vmd/vionet.c	14 Jan 2026 03:09:05 -0000	1.29
> +++ usr.sbin/vmd/vionet.c	14 Jan 2026 18:02:33 -0000
> @@ -17,12 +17,18 @@
>   * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
>   */
>  #include <sys/types.h>
> +#include <sys/ioctl.h>
>
>  #include <dev/pci/virtio_pcireg.h>
>  #include <dev/pv/virtioreg.h>
>
>  #include <net/if.h>
> +#include <net/if_tun.h>
>  #include <netinet/in.h>
> +#include <netinet/ip.h>
> +#include <netinet/ip6.h>
> +#include <netinet/tcp.h>
> +#include <netinet/udp.h>
>  #include <netinet/if_ether.h>
>
>  #include <errno.h>
> @@ -50,6 +56,7 @@
>
>  #define VIRTIO_NET_CONFIG_MAC		 0 /*  8 bit x 6 byte */
>
> +#define VIRTIO_NET_F_GUEST_CSUM	(1 << 1)
>  #define VIRTIO_NET_F_MAC	(1 << 5)
>  #define RXQ	0
>  #define TXQ	1
> @@ -65,7 +72,7 @@ static void *rx_run_loop(void *);
>  static void *tx_run_loop(void *);
>  static int vionet_rx(struct virtio_dev *, int);
>  static ssize_t vionet_rx_copy(struct vionet_dev *, int, const struct iovec *,
> -    int, size_t);
> +    int, size_t, struct tun_hdr *th);
>  static ssize_t vionet_rx_zerocopy(struct vionet_dev *, int,
>      const struct iovec *, int);
>  static void vionet_rx_event(int, short, void *);
> @@ -84,6 +91,10 @@ static void read_pipe_rx(int, short, voi
>  static void read_pipe_tx(int, short, void *);
>  static void vionet_assert_pic_irq(struct virtio_dev *);
>  static void vionet_deassert_pic_irq(struct virtio_dev *);
> +static void vhdr2thdr(struct virtio_net_hdr *, struct tun_hdr *,
> +    const struct iovec *, int);
> +static void thdr2vhdr(struct tun_hdr *, struct virtio_net_hdr *,
> +    const struct iovec *, int);
>
>  /* Device Globals */
>  struct event ev_tap;
> @@ -112,6 +123,7 @@ vionet_main(int fd, int fd_vmm)
>  	struct vionet_dev	*vionet = NULL;
>  	struct viodev_msg 	 msg;
>  	struct vmd_vm	 	 vm;
> +	struct tun_capabilities	 tcap;
>  	ssize_t			 sz;
>  	int			 ret;
>
> @@ -145,6 +157,13 @@ vionet_main(int fd, int fd_vmm)
>  	    ", vmm fd = %d", __func__, vionet->data_fd, dev.sync_fd,
>  	    dev.async_fd, fd_vmm);
>
> +	/*
> +	 * IFCAPs are tweaked after feature negotiation with the guest later.
> +	 */
> +	memset(&tcap, 0, sizeof(tcap));
> +	if (ioctl(vionet->data_fd, TUNSCAP, &tcap) == -1)
> +		fatal("tap(4) TUNSCAP");
> +
>  	/* Receive our vm information from the vm process. */
>  	memset(&vm, 0, sizeof(vm));
>  	sz = atomicio(read, dev.sync_fd, &vm, sizeof(vm));
> @@ -168,7 +187,7 @@ vionet_main(int fd, int fd_vmm)
>  	 * We no longer need /dev/vmm access.
>  	 */
>  	close_fd(fd_vmm);
> -	if (pledge("stdio", NULL) == -1)
> +	if (pledge("stdio vmm", NULL) == -1)

The "vmm" pledge allows some capabilities this emulated device process
doesn't need nor should ever have.

While there should never be an open file descriptor for /dev/vmm for
this process and "stdio" doesn't allow open(2), I'm not a fan of this.

Since it appears the ioctl(2) against the tap fd needs to be called at
runtime based on feature negotiation, it's clear there's some allowance
that needs to be made.

I just don't think "vmm" is it. To me this should be scoped to the
tun/tap device.

I may be overly paranoid...but the goal of splitting out these devices
was to restrict capabilities as much as possible. vionet is a super
scary device given it processes network data.

>  		fatal("pledge2");
>
>  	/* Initialize our packet injection pipe. */
> @@ -300,6 +319,25 @@ fail:
>  }
>
>  /*
> + * Update and sync offload features with tap(4).
> + */
> +static void
> +vionet_update_offload(struct virtio_dev *dev)
> +{
> +	struct tun_capabilities	 tcap;
> +
> +	memset(&tcap, 0, sizeof(tcap));
> +
> +	if (dev->driver_feature & VIRTIO_NET_F_GUEST_CSUM) {
> +		tcap.tun_if_capabilities |= IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4;
> +		tcap.tun_if_capabilities |= IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
> +	}
> +
> +	if (ioctl(dev->vionet.data_fd, TUNSCAP, &tcap) == -1)
> +		fatal("tap(4) TUNSCAP");
> +}
> +
> +/*
>   * vionet_rx
>   *
>   * Pull packet from the provided fd and fill the receive-side virtqueue. We
> @@ -321,6 +359,7 @@ vionet_rx(struct virtio_dev *dev, int fd
>  	struct virtio_net_hdr *hdr = NULL;
>  	struct virtio_vq_info *vq_info;
>  	struct iovec *iov;
> +	struct tun_hdr th;
>  	int notify = 0;
>  	ssize_t sz;
>  	uint8_t status = 0;
> @@ -351,8 +390,8 @@ vionet_rx(struct virtio_dev *dev, int fd
>  			goto reset;
>  		}
>
> -		iov = &iov_rx[0];
> -		iov_cnt = 1;
> +		iov = &iov_rx[1];
> +		iov_cnt = 2;
>
>  		/*
>  		 * First descriptor should be at least as large as the
> @@ -373,7 +412,6 @@ vionet_rx(struct virtio_dev *dev, int fd
>  		if (iov->iov_base == NULL)
>  			goto reset;
>  		hdr = iov->iov_base;
> -		memset(hdr, 0, sizeof(struct virtio_net_hdr));
>
>  		/* Tweak the iovec to account for the virtio_net_hdr. */
>  		iov->iov_len -= sizeof(struct virtio_net_hdr);
> @@ -418,22 +456,26 @@ vionet_rx(struct virtio_dev *dev, int fd
>  			goto reset;
>  		}
>
> -		hdr->num_buffers = iov_cnt;
> -
>  		/*
>  		 * If we're enforcing hardware address or handling an injected
>  		 * packet, we need to use a copy-based approach.
>  		 */
>  		if (vionet->lockedmac || fd != vionet->data_fd)
>  			sz = vionet_rx_copy(vionet, fd, iov_rx, iov_cnt,
> -			    chain_len);
> -		else
> +			    chain_len, &th);
> +		else {
> +			iov_rx[0].iov_base = &th;
> +			iov_rx[0].iov_len = sizeof(th);
>  			sz = vionet_rx_zerocopy(vionet, fd, iov_rx, iov_cnt);
> +		}
>  		if (sz == -1)
>  			goto reset;
>  		if (sz == 0)	/* No packets, so bail out for now. */
>  			break;
>
> +		thdr2vhdr(&th, hdr, iov_rx + 1, iov_cnt - 1);
> +		hdr->num_buffers = iov_cnt - 1;
> +
>  		/*
>  		 * Account for the prefixed header since it wasn't included
>  		 * in the copy or zerocopy operations.
> @@ -473,9 +515,9 @@ reset:
>   */
>  ssize_t
>  vionet_rx_copy(struct vionet_dev *dev, int fd, const struct iovec *iov,
> -    int iov_cnt, size_t chain_len)
> +    int iov_cnt, size_t chain_len, struct tun_hdr *th)
>  {
> -	static uint8_t		 buf[VIONET_HARD_MTU];
> +	static uint8_t		 buf[sizeof(struct tun_hdr) + VIONET_HARD_MTU];
>  	struct packet		*pkt = NULL;
>  	struct ether_header	*eh = NULL;
>  	uint8_t			*payload = buf;
> @@ -483,9 +525,10 @@ vionet_rx_copy(struct vionet_dev *dev, i
>  	ssize_t			 sz;
>
>  	/* If reading from the tap(4), try to right-size the read. */
> -	if (fd == dev->data_fd)
> -		nbytes = MIN(chain_len, VIONET_HARD_MTU);
> -	else if (fd == pipe_inject[READ])
> +	if (fd == dev->data_fd) {
> +		nbytes = sizeof(struct tun_hdr) +
> +		    MIN(chain_len, VIONET_HARD_MTU);
> +	} else if (fd == pipe_inject[READ])
>  		nbytes = sizeof(struct packet);
>  	else {
>  		log_warnx("%s: invalid fd: %d", __func__, fd);
> @@ -504,10 +547,20 @@ vionet_rx_copy(struct vionet_dev *dev, i
>  			return (-1);
>  		}
>  		return (0);
> -	} else if (fd == dev->data_fd && sz < VIONET_MIN_TXLEN) {
> +	} else if (fd == dev->data_fd) {
> +		if ((size_t)sz < sizeof(struct tun_hdr)) {
> +			log_warnx("%s: short tun_hdr", __func__);
> +			return (0);
> +		}
> +		memcpy(th, payload, sizeof *th);
> +		payload += sizeof(struct tun_hdr);
> +		sz -= sizeof(struct tun_hdr);
> +
>  		/* If reading the tap(4), we should get valid ethernet. */
> -		log_warnx("%s: invalid packet size", __func__);
> -		return (0);
> +		if (sz < VIONET_MIN_TXLEN) {
> +			log_warnx("%s: invalid packet size", __func__);
> +			return (0);
> +		}
>  	} else if (fd == pipe_inject[READ] && sz != sizeof(struct packet)) {
>  		log_warnx("%s: invalid injected packet object (sz=%ld)",
>  		    __func__, sz);
> @@ -585,6 +638,12 @@ vionet_rx_zerocopy(struct vionet_dev *de
>  	sz = readv(fd, iov, iov_cnt);
>  	if (sz == -1 && errno == EAGAIN)
>  		return (0);
> +
> +	if ((size_t)sz < sizeof(struct tun_hdr))
> +		return (0);
> +
> +	sz -= sizeof(struct tun_hdr);
> +
>  	return (sz);
>  }
>
> @@ -666,6 +725,8 @@ vionet_tx(struct virtio_dev *dev)
>  	struct iovec *iov;
>  	struct packet pkt;
>  	uint8_t status = 0;
> +	struct virtio_net_hdr *vhp;
> +	struct tun_hdr th;
>
>  	status = dev->status & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK;
>  	if (status != VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) {
> @@ -692,8 +753,10 @@ vionet_tx(struct virtio_dev *dev)
>  			goto reset;
>  		}
>
> -		iov = &iov_tx[0];
> -		iov_cnt = 0;
> +		/* the 0th slot will by used by the tun_hdr */
> +
> +		iov = &iov_tx[1];
> +		iov_cnt = 1;
>  		chain_len = 0;
>
>  		/*
> @@ -704,13 +767,16 @@ vionet_tx(struct virtio_dev *dev)
>  			log_warnx("%s: invalid descriptor length", __func__);
>  			goto reset;
>  		}
> -		iov->iov_len = desc->len;
>
> -		if (iov->iov_len > sizeof(struct virtio_net_hdr)) {
> -			/* Chop off the virtio header, leaving packet data. */
> -			iov->iov_len -= sizeof(struct virtio_net_hdr);
> -			iov->iov_base = hvaddr_mem(desc->addr +
> -			    sizeof(struct virtio_net_hdr), iov->iov_len);
> +		/* Chop the virtio net header off */
> +		vhp = hvaddr_mem(desc->addr, sizeof(*vhp));
> +		if (vhp == NULL)
> +			goto reset;
> +
> +		iov->iov_len = desc->len - sizeof(*vhp);
> +		if (iov->iov_len > 0) {
> +			iov->iov_base = hvaddr_mem(desc->addr + sizeof(*vhp),
> +			    iov->iov_len);
>  			if (iov->iov_base == NULL)
>  				goto reset;
>
> @@ -758,7 +824,7 @@ vionet_tx(struct virtio_dev *dev)
>  		 * descriptor with packet data contains a large enough buffer
>  		 * for this inspection.
>  		 */
> -		iov = &iov_tx[0];
> +		iov = &iov_tx[1];
>  		if (vionet->lockedmac) {
>  			if (iov->iov_len < ETHER_HDR_LEN) {
>  				log_warnx("%s: insufficient header data",
> @@ -784,6 +850,15 @@ vionet_tx(struct virtio_dev *dev)
>  			}
>  		}
>
> +		/*
> +		 * if we look at more of vhp we might need to copy
> +		 * it so it's aligned properly
> +		 */
> +		vhdr2thdr(vhp, &th, iov_tx + 1, iov_cnt - 1);
> +
> +		iov_tx[0].iov_base = &th;
> +		iov_tx[0].iov_len = sizeof(th);
> +
>  		/* Write our packet to the tap(4). */
>  		sz = writev(vionet->data_fd, iov_tx, iov_cnt);
>  		if (sz == -1 && errno != ENOBUFS) {
> @@ -1114,6 +1189,7 @@ vionet_cfg_write(struct virtio_dev *dev,
>  		dev->driver_feature &= dev->device_feature;
>  		DPRINTF("%s: driver features 0x%llx", __func__,
>  		    dev->driver_feature);
> +		vionet_update_offload(dev);
>  		break;
>  	case VIO1_PCI_CONFIG_MSIX_VECTOR:
>  		/* Ignore until we support MSIX. */
> @@ -1555,6 +1631,155 @@ vionet_assert_pic_irq(struct virtio_dev
>  	    &msg, sizeof(msg), ev_base_main);
>  	if (ret == -1)
>  		log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
> +}
> +
> +static int
> +memcpyv(void *buf, size_t len, size_t off, const struct iovec *iov, int iovcnt)
> +{
> +	uint8_t *dst = buf;
> +	size_t l;
> +
> +	for (;;) {
> +		if (iovcnt == 0)
> +			return (-1);
> +
> +		if (off < iov->iov_len)
> +			break;
> +
> +		off -= iov->iov_len;
> +		iov++;
> +		iovcnt--;
> +	}
> +
> +	l = off + len;
> +	if (l > iov->iov_len)
> +		l = iov->iov_len;
> +	l -= off;
> +
> +	memcpy(dst, (const uint8_t *)iov->iov_base + off, l);
> +	dst += l;
> +	len -= l;
> +
> +	if (len == 0)
> +		return (0);
> +
> +	for (;;) {
> +		if (iovcnt == 0)
> +			return (-1);
> +
> +		l = len;
> +		if (l > iov->iov_len)
> +			l = iov->iov_len;
> +
> +		memcpy(dst, (const uint8_t *)iov->iov_base, l);
> +		dst += l;
> +		len -= l;
> +
> +		if (len == 0)
> +			break;
> +
> +		iov++;
> +		iovcnt--;
> +	}
> +
> +	return (0);
> +}
> +
> +static void
> +hdr_extract(const struct iovec *iov, int iovcnt, size_t *off, uint8_t *proto)
> +{
> +	size_t		offs;
> +	uint16_t	etype;
> +
> +	if (memcpyv(&etype, sizeof(etype),
> +	    offsetof(struct ether_header, ether_type),
> +	    iov, iovcnt) == -1)
> +		return;
> +
> +	*off = sizeof(struct ether_header);
> +
> +	if (etype == htons(ETHERTYPE_VLAN)) {
> +		if (memcpyv(&etype, sizeof(etype),
> +		    offsetof(struct ether_vlan_header, evl_proto),
> +		    iov, iovcnt) == -1)
> +			return;
> +
> +		*off = sizeof(struct ether_vlan_header);
> +	}
> +
> +	if (etype == htons(ETHERTYPE_IP)) {
> +		uint8_t hl;
> +
> +		/* Get ipproto field from IP header. */
> +		offs = *off + offsetof(struct ip, ip_p);
> +		if (memcpyv(proto, sizeof(*proto), offs, iov, iovcnt) == -1)
> +			return;
> +
> +		/* Get IP header length field from IP header. */
> +		offs = *off;
> +		if (memcpyv(&hl, sizeof(hl), offs, iov, iovcnt) == -1)
> +			return;
> +
> +		*off += (hl & 0x0f) << 2;
> +	} else if (etype == htons(ETHERTYPE_IPV6)) {
> +		/* Get next header field from IP header. */
> +		offs = *off + offsetof(struct ip6_hdr, ip6_nxt);
> +		if (memcpyv(proto, sizeof(*proto), offs, iov, iovcnt) == -1)
> +			return;
> +
> +		*off += sizeof(struct ip6_hdr);
> +	}
> +}
> +
> +static void
> +vhdr2thdr(struct virtio_net_hdr *vh, struct tun_hdr *th,
> +    const struct iovec *iov, int iovcnt)
> +{
> +	memset(th, 0, sizeof(*th));
> +
> +	if (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
> +		size_t	off;
> +		uint8_t	proto;
> +
> +		hdr_extract(iov, iovcnt, &off, &proto);
> +
> +		switch (proto) {
> +		case IPPROTO_TCP:
> +			th->th_flags |= TUN_H_TCP_CSUM;
> +			break;
> +
> +		case IPPROTO_UDP:
> +			th->th_flags |= TUN_H_UDP_CSUM;
> +			break;
> +		}
> +	}
> +}
> +
> +static void
> +thdr2vhdr(struct tun_hdr *th, struct virtio_net_hdr *vh,
> +    const struct iovec *iov, int iovcnt)
> +{
> +	size_t	off;
> +	uint8_t	proto;
> +
> +	memset(vh, 0, sizeof(*vh));
> +
> +	if (th->th_flags & (TUN_H_TCP_CSUM | TUN_H_UDP_CSUM)) {
> +		hdr_extract(iov, iovcnt, &off, &proto);
> +
> +		vh->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
> +		vh->csum_start = off;
> +
> +		switch (proto) {
> +		case IPPROTO_TCP:
> +			vh->csum_offset = offsetof(struct tcphdr, th_sum);
> +			break;
> +
> +		case IPPROTO_UDP:
> +			vh->csum_offset = offsetof(struct udphdr, uh_sum);
> +			break;
> +		}
> +	}
>  }
>
>  /*
> Index: usr.sbin/vmd/virtio.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v
> diff -u -p -r1.134 virtio.c
> --- usr.sbin/vmd/virtio.c	14 Jan 2026 03:09:05 -0000	1.134
> +++ usr.sbin/vmd/virtio.c	14 Jan 2026 18:02:33 -0000
> @@ -64,6 +64,8 @@ SLIST_HEAD(virtio_dev_head, virtio_dev)
>
>  #define MAXPHYS	(64 * 1024)	/* max raw I/O transfer size */
>
> +#define VIRTIO_NET_F_CSUM	(1<<0)
> +#define VIRTIO_NET_F_GUEST_CSUM	(1<<1)
>  #define VIRTIO_NET_F_MAC	(1<<5)
>
>  #define VMMCI_F_TIMESYNC	(1<<0)
> @@ -1034,7 +1036,8 @@ virtio_init(struct vmd_vm *vm, int child
>  			}
>  			virtio_dev_init(vm, dev, id, VIONET_QUEUE_SIZE_DEFAULT,
>  			    VIRTIO_NET_QUEUES,
> -			    (VIRTIO_NET_F_MAC | VIRTIO_F_VERSION_1));
> +			    (VIRTIO_NET_F_MAC | VIRTIO_NET_F_CSUM |
> +				VIRTIO_NET_F_GUEST_CSUM | VIRTIO_F_VERSION_1));
>
>  			if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
>  			    dev) == -1) {
> Index: usr.sbin/vmd/virtio.h
> ===================================================================
> RCS file: /cvs/src/usr.sbin/vmd/virtio.h,v
> diff -u -p -r1.60 virtio.h
> --- usr.sbin/vmd/virtio.h	14 Jan 2026 03:09:05 -0000	1.60
> +++ usr.sbin/vmd/virtio.h	14 Jan 2026 17:25:57 -0000
> @@ -310,6 +310,9 @@ struct virtio_net_hdr {
>  	*/
>  };
>
> +#define VIRTIO_NET_HDR_F_NEEDS_CSUM	1 /* flags */
> +#define VIRTIO_NET_HDR_F_DATA_VALID	2 /* flags */
> +
>  enum vmmci_cmd {
>  	VMMCI_NONE = 0,
>  	VMMCI_SHUTDOWN,