From: Alexander Bluhm <bluhm@openbsd.org>
Subject: Re: vmd: add checksum offload for guests
To: Jan Klemkow <jan@openbsd.org>
Cc: tech@openbsd.org
Date: Thu, 22 May 2025 11:14:12 +0900

On Wed, May 21, 2025 at 03:38:16AM +0200, Jan Klemkow wrote:
> Hi,
> 
> This diff uses the tun(4) offload capabilities in the virtio
> implementation of vmd(8).  vmd(8) translates the virtio_net_hdr
> structures from the guest's packets to tun_hdr for tun(4) and vice
> versa.  The offload feature bits are negotiated dynamically with the
> guest, so legacy guests keep working.  The pledge part was discussed
> with deraadt.
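> 
> For illustration only (not part of the diff): once offload
> capabilities are set with TUNSCAP, packets on the tap(4) fd are
> prefixed with a struct tun_hdr.  A minimal sketch of the tx side,
> using the flag and field names from the diff below, error handling
> mostly omitted:
> 
> /*
>  * Illustrative sketch only: announce checksum offloads via TUNSCAP
>  * (the diff does this at feature negotiation time) and write one
>  * packet with a prepended tun_hdr asking the kernel to fill in the
>  * TCP checksum.
>  */
> #include <sys/types.h>
> #include <sys/ioctl.h>
> #include <sys/uio.h>
> 
> #include <net/if.h>
> #include <net/if_tun.h>
> 
> #include <string.h>
> 
> static ssize_t
> tap_tx_tcp_csum(int tapfd, void *pkt, size_t len)
> {
> 	struct tun_capabilities	 tcap;
> 	struct tun_hdr		 th;
> 	struct iovec		 iov[2];
> 
> 	tcap.tun_if_capabilities = IFCAP_CSUM_TCPv4 | IFCAP_CSUM_TCPv6;
> 	if (ioctl(tapfd, TUNSCAP, &tcap) == -1)
> 		return (-1);
> 
> 	/* Reads and writes now carry a struct tun_hdr up front. */
> 	memset(&th, 0, sizeof(th));
> 	th.th_flags = TUN_H_TCP_CSUM;
> 
> 	iov[0].iov_base = &th;
> 	iov[0].iov_len = sizeof(th);
> 	iov[1].iov_base = pkt;
> 	iov[1].iov_len = len;
> 
> 	return (writev(tapfd, iov, 2));
> }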
> 
> ok?

With this diff, Linux VM guests running on vmd(8) cannot use the
network anymore.

bluhm

> Index: sys/kern/kern_pledge.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_pledge.c,v
> diff -u -p -r1.323 kern_pledge.c
> --- sys/kern/kern_pledge.c	12 Feb 2025 14:11:26 -0000	1.323
> +++ sys/kern/kern_pledge.c	21 May 2025 01:25:04 -0000
> @@ -46,6 +46,7 @@
>  #include <net/route.h>
>  #include <net/if.h>
>  #include <net/if_var.h>
> +#include <net/if_tun.h>
>  #include <netinet/in.h>
>  #include <netinet6/in6_var.h>
>  #include <netinet6/nd6.h>
> @@ -1342,6 +1343,12 @@ pledge_ioctl(struct proc *p, long com, s
>  		    (cdevsw[major(vp->v_rdev)].d_open == vmmopen)) {
>  			error = pledge_ioctl_vmm(p, com);
>  			if (error == 0)
> +				return 0;
> +		}
> +		if ((fp->f_type == DTYPE_VNODE) &&
> +		    (vp->v_type == VCHR) &&
> +		    (cdevsw[major(vp->v_rdev)].d_open == tapopen)) {
> +			if (com == TUNSCAP)
>  				return 0;
>  		}
>  	}
> Index: usr.sbin/vmd/vionet.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/vmd/vionet.c,v
> diff -u -p -r1.23 vionet.c
> --- usr.sbin/vmd/vionet.c	12 May 2025 17:17:42 -0000	1.23
> +++ usr.sbin/vmd/vionet.c	21 May 2025 01:25:04 -0000
> @@ -17,12 +17,18 @@
>   * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
>   */
>  #include <sys/types.h>
> +#include <sys/ioctl.h>
>  
>  #include <dev/pci/virtio_pcireg.h>
>  #include <dev/pv/virtioreg.h>
>  
>  #include <net/if.h>
> +#include <net/if_tun.h>
>  #include <netinet/in.h>
> +#include <netinet/ip.h>
> +#include <netinet/ip6.h>
> +#include <netinet/tcp.h>
> +#include <netinet/udp.h>
>  #include <netinet/if_ether.h>
>  
>  #include <errno.h>
> @@ -33,11 +39,14 @@
>  #include <stdlib.h>
>  #include <string.h>
>  #include <unistd.h>
> +#include <stddef.h>
>  
>  #include "atomicio.h"
>  #include "virtio.h"
>  #include "vmd.h"
>  
> +#define VIRTIO_NET_F_CSUM	(1 << 0)
> +#define VIRTIO_NET_F_GUEST_CSUM	(1 << 1)
>  #define VIRTIO_NET_F_MAC	(1 << 5)
>  #define RXQ	0
>  #define TXQ	1
> @@ -54,7 +63,7 @@ static void *rx_run_loop(void *);
>  static void *tx_run_loop(void *);
>  static int vionet_rx(struct vionet_dev *, int);
>  static ssize_t vionet_rx_copy(struct vionet_dev *, int, const struct iovec *,
> -    int, size_t);
> +    int, size_t, struct tun_hdr *th);
>  static ssize_t vionet_rx_zerocopy(struct vionet_dev *, int,
>      const struct iovec *, int);
>  static void vionet_rx_event(int, short, void *);
> @@ -70,6 +79,10 @@ static void read_pipe_rx(int, short, voi
>  static void read_pipe_tx(int, short, void *);
>  static void vionet_assert_pic_irq(struct virtio_dev *);
>  static void vionet_deassert_pic_irq(struct virtio_dev *);
> +static void vhdr2thdr(struct virtio_net_hdr *, struct tun_hdr *,
> +    const struct iovec *, int);
> +static void thdr2vhdr(struct tun_hdr *, struct virtio_net_hdr *,
> +    const struct iovec *, int);
>  
>  /* Device Globals */
>  struct event ev_tap;
> @@ -101,6 +114,7 @@ vionet_main(int fd, int fd_vmm)
>  	struct vm_create_params	*vcp;
>  	ssize_t			 sz;
>  	int			 ret;
> +	struct tun_capabilities	 tcap;
>  
>  	/*
>  	 * stdio - needed for read/write to disk fds and channels to the vm.
> @@ -132,6 +146,14 @@ vionet_main(int fd, int fd_vmm)
>  	    ", vmm fd = %d", __func__, vionet->data_fd, dev.sync_fd,
>  	    dev.async_fd, fd_vmm);
>  
> +	/*
> +	 * IFCAPs should be tweaked based on feature negotiation
> +	 * with the guest.
> +	 */
> +	tcap.tun_if_capabilities = 0;
> +	if (ioctl(vionet->data_fd, TUNSCAP, &tcap) == -1)
> +		fatal("tap(4) TUNSCAP");
> +
>  	/* Receive our vm information from the vm process. */
>  	memset(&vm, 0, sizeof(vm));
>  	sz = atomicio(read, dev.sync_fd, &vm, sizeof(vm));
> @@ -156,7 +178,7 @@ vionet_main(int fd, int fd_vmm)
>  	 * We no longer need /dev/vmm access.
>  	 */
>  	close_fd(fd_vmm);
> -	if (pledge("stdio", NULL) == -1)
> +	if (pledge("stdio vmm", NULL) == -1)
>  		fatal("pledge2");
>  
>  	/* If we're restoring hardware, re-initialize virtqueue hva's. */
> @@ -315,6 +337,25 @@ fail:
>  }
>  
>  /*
> + * Update and sync offload features with tap(4).
> + */
> +static void
> +vionet_update_offload(struct vionet_dev *dev)
> +{
> +	struct tun_capabilities	 tcap;
> +
> +	tcap.tun_if_capabilities = 0;
> +
> +	if (dev->cfg.guest_feature & VIRTIO_NET_F_GUEST_CSUM) {
> +		tcap.tun_if_capabilities |= IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4;
> +		tcap.tun_if_capabilities |= IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
> +	}
> +
> +	if (ioctl(dev->data_fd, TUNSCAP, &tcap) == -1)
> +		fatal("tap(4) TUNSCAP");
> +}
> +
> +/*
>   * Update the gpa and hva of the virtqueue.
>   */
>  static void
> @@ -383,7 +424,9 @@ vionet_rx(struct vionet_dev *dev, int fd
>  	struct vring_avail *avail;
>  	struct vring_used *used;
>  	struct virtio_vq_info *vq_info;
> +	struct virtio_net_hdr *vhp;
>  	struct iovec *iov;
> +	struct tun_hdr th;
>  	int notify = 0;
>  	ssize_t sz;
>  	uint8_t status = 0;
> @@ -414,7 +457,7 @@ vionet_rx(struct vionet_dev *dev, int fd
>  			goto reset;
>  		}
>  
> -		iov = &iov_rx[0];
> +		iov = &iov_rx[1];
>  		iov_cnt = 1;
>  
>  		/*
> @@ -436,6 +479,7 @@ vionet_rx(struct vionet_dev *dev, int fd
>  		if (iov->iov_base == NULL)
>  			goto reset;
>  		memset(iov->iov_base, 0, sizeof(struct virtio_net_hdr));
> +		vhp = iov->iov_base;
>  
>  		/* Tweak the iovec to account for the virtio_net_hdr. */
>  		iov->iov_len -= sizeof(struct virtio_net_hdr);
> @@ -486,14 +530,19 @@ vionet_rx(struct vionet_dev *dev, int fd
>  		 */
>  		if (dev->lockedmac || fd != dev->data_fd)
>  			sz = vionet_rx_copy(dev, fd, iov_rx, iov_cnt,
> -			    chain_len);
> -		else
> +			    chain_len, &th);
> +		else {
> +			iov_rx[0].iov_base = &th;
> +			iov_rx[0].iov_len = sizeof(th);
>  			sz = vionet_rx_zerocopy(dev, fd, iov_rx, iov_cnt);
> +		}
>  		if (sz == -1)
>  			goto reset;
>  		if (sz == 0)	/* No packets, so bail out for now. */
>  			break;
>  
> +		thdr2vhdr(&th, vhp, iov_rx + 1, iov_cnt - 1);
> +
>  		/*
>  		 * Account for the prefixed header since it wasn't included
>  		 * in the copy or zerocopy operations.
> @@ -533,9 +582,9 @@ reset:
>   */
>  ssize_t
>  vionet_rx_copy(struct vionet_dev *dev, int fd, const struct iovec *iov,
> -    int iov_cnt, size_t chain_len)
> +    int iov_cnt, size_t chain_len, struct tun_hdr *th)
>  {
> -	static uint8_t		 buf[VIONET_HARD_MTU];
> +	static uint8_t		 buf[sizeof(struct tun_hdr) + VIONET_HARD_MTU];
>  	struct packet		*pkt = NULL;
>  	struct ether_header	*eh = NULL;
>  	uint8_t			*payload = buf;
> @@ -543,9 +592,10 @@ vionet_rx_copy(struct vionet_dev *dev, i
>  	ssize_t			 sz;
>  
>  	/* If reading from the tap(4), try to right-size the read. */
> -	if (fd == dev->data_fd)
> -		nbytes = MIN(chain_len, VIONET_HARD_MTU);
> -	else if (fd == pipe_inject[READ])
> +	if (fd == dev->data_fd) {
> +		nbytes = sizeof(struct tun_hdr) +
> +		    MIN(chain_len, VIONET_HARD_MTU);
> +	} else if (fd == pipe_inject[READ])
>  		nbytes = sizeof(struct packet);
>  	else {
>  		log_warnx("%s: invalid fd: %d", __func__, fd);
> @@ -564,10 +614,20 @@ vionet_rx_copy(struct vionet_dev *dev, i
>  			return (-1);
>  		}
>  		return (0);
> -	} else if (fd == dev->data_fd && sz < VIONET_MIN_TXLEN) {
> +	} else if (fd == dev->data_fd) {
> +		if ((size_t)sz < sizeof(struct tun_hdr)) {
> +			log_warnx("%s: short tun_hdr", __func__);
> +			return (0);
> +		}
> +		memcpy(th, payload, sizeof *th);
> +		payload += sizeof(struct tun_hdr);
> +		sz -= sizeof(struct tun_hdr);
> +
>  		/* If reading the tap(4), we should get valid ethernet. */
> -		log_warnx("%s: invalid packet size", __func__);
> -		return (0);
> +		if (sz < VIONET_MIN_TXLEN) {
> +			log_warnx("%s: invalid packet size", __func__);
> +			return (0);
> +		}
>  	} else if (fd == pipe_inject[READ] && sz != sizeof(struct packet)) {
>  		log_warnx("%s: invalid injected packet object (sz=%ld)",
>  		    __func__, sz);
> @@ -729,6 +789,8 @@ vionet_tx(struct virtio_dev *dev)
>  	struct iovec *iov;
>  	struct packet pkt;
>  	uint8_t status = 0;
> +	struct virtio_net_hdr *vhp;
> +	struct tun_hdr th;
>  
>  	status = vionet->cfg.device_status
>  	    & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK;
> @@ -756,8 +818,10 @@ vionet_tx(struct virtio_dev *dev)
>  			goto reset;
>  		}
>  
> -		iov = &iov_tx[0];
> -		iov_cnt = 0;
> +		/* The 0th slot will be used by the tun_hdr. */
> +
> +		iov = &iov_tx[1];
> +		iov_cnt = 1;
>  		chain_len = 0;
>  
>  		/*
> @@ -765,18 +829,23 @@ vionet_tx(struct virtio_dev *dev)
>  		 * descriptor sized to the virtio_net_hdr. However, the framing
>  		 * is not guaranteed, so check for packet data.
>  		 */
> -		iov->iov_len = desc->len;
> -		if (iov->iov_len < sizeof(struct virtio_net_hdr)) {
> +		if (desc->len < sizeof(*vhp)) {
>  			log_warnx("%s: invalid descriptor length", __func__);
>  			goto reset;
> -		} else if (iov->iov_len > sizeof(struct virtio_net_hdr)) {
> -			/* Chop off the virtio header, leaving packet data. */
> -			iov->iov_len -= sizeof(struct virtio_net_hdr);
> -			chain_len += iov->iov_len;
> -			iov->iov_base = hvaddr_mem(desc->addr +
> -			    sizeof(struct virtio_net_hdr), iov->iov_len);
> +		}
> +
> +		/* Chop the virtio net header off */
> +		vhp = hvaddr_mem(desc->addr, sizeof(*vhp));
> +		if (vhp == NULL)
> +			goto reset;
> +
> +		iov->iov_len = desc->len - sizeof(*vhp);
> +		if (iov->iov_len > 0) {
> +			iov->iov_base = hvaddr_mem(desc->addr + sizeof(*vhp),
> +			    iov->iov_len);
>  			if (iov->iov_base == NULL)
>  				goto reset;
> +			chain_len += iov->iov_len;
>  			iov_cnt++;
>  		}
>  
> @@ -820,7 +889,7 @@ vionet_tx(struct virtio_dev *dev)
>  		 * descriptor with packet data contains a large enough buffer
>  		 * for this inspection.
>  		 */
> -		iov = &iov_tx[0];
> +		iov = &iov_tx[1];
>  		if (vionet->lockedmac) {
>  			if (iov->iov_len < ETHER_HDR_LEN) {
>  				log_warnx("%s: insufficient header data",
> @@ -846,6 +915,17 @@ vionet_tx(struct virtio_dev *dev)
>  			}
>  		}
>  
> +		memset(&th, 0, sizeof(th));
> +
> +		/*
> +		 * If we look at more of vhp we might need to copy
> +		 * it so it's aligned properly.
> +		 */
> +		vhdr2thdr(vhp, &th, iov_tx + 1, iov_cnt - 1);
> +
> +		iov_tx[0].iov_base = &th;
> +		iov_tx[0].iov_len = sizeof(th);
> +
>  		/* Write our packet to the tap(4). */
>  		sz = writev(vionet->data_fd, iov_tx, iov_cnt);
>  		if (sz == -1 && errno != ENOBUFS) {
> @@ -1064,6 +1144,7 @@ handle_io_write(struct viodev_msg *msg, 
>  		break;
>  	case VIRTIO_CONFIG_GUEST_FEATURES:
>  		vionet->cfg.guest_feature = data;
> +		vionet_update_offload(vionet);
>  		break;
>  	case VIRTIO_CONFIG_QUEUE_PFN:
>  		vionet->cfg.queue_pfn = data;
> @@ -1396,4 +1477,149 @@ vionet_deassert_pic_irq(struct virtio_de
>  	    &msg, sizeof(msg), ev_base_main);
>  	if (ret == -1)
>  		log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
> +}
> +
> +static int
> +memcpyv(void *buf, size_t len, size_t off, const struct iovec *iov, int iovcnt)
> +{
> +	uint8_t *dst = buf;
> +	size_t l;
> +
> +	for (;;) {
> +		if (iovcnt == 0)
> +			return (-1);
> +
> +		if (off < iov->iov_len)
> +			break;
> +
> +		off -= iov->iov_len;
> +		iov++;
> +		iovcnt--;
> +	}
> +
> +	l = off + len;
> +	if (l > iov->iov_len)
> +		l = iov->iov_len;
> +	l -= off;
> +
> +	memcpy(dst, (const uint8_t *)iov->iov_base + off, l);
> +	dst += l;
> +	len -= l;
> +
> +	if (len == 0)
> +		return (0);
> +
> +	for (;;) {
> +		if (iovcnt == 0)
> +			return (-1);
> +
> +		l = len;
> +		if (l > iov->iov_len)
> +			l = iov->iov_len;
> +
> +		memcpy(dst, (const uint8_t *)iov->iov_base, l);
> +		dst += l;
> +		len -= l;
> +
> +		if (len == 0)
> +			break;
> +
> +		iov++;
> +		iovcnt--;
> +	}
> +
> +	return (0);
> +}
> +
> +static void
> +hdr_extract(const struct iovec *iov, int iovcnt, size_t *off, uint8_t *proto)
> +{
> +	size_t		offs;
> +	uint16_t	etype;
> +
> +	if (memcpyv(&etype, sizeof(etype),
> +	    offsetof(struct ether_header, ether_type),
> +	    iov, iovcnt) == -1)
> +		return;
> +
> +	*off = sizeof(struct ether_header);
> +
> +	if (etype == htons(ETHERTYPE_VLAN)) {
> +		if (memcpyv(&etype, sizeof(etype),
> +		    offsetof(struct ether_vlan_header, evl_proto),
> +		    iov, iovcnt) == -1)
> +			return;
> +
> +		*off = sizeof(struct ether_vlan_header);
> +	}
> +
> +	if (etype == htons(ETHERTYPE_IP)) {
> +		uint16_t len;
> +
> +		/* Get ipproto field from IP header. */
> +		offs = *off + offsetof(struct ip, ip_p);
> +		if (memcpyv(proto, sizeof(*proto), offs, iov, iovcnt) == -1)
> +			return;
> +
> +		/* Get IP length field from IP header. */
> +		offs = *off + offsetof(struct ip, ip_len);
> +		if (memcpyv(&len, sizeof(len), offs, iov, iovcnt) == -1)
> +			return;
> +
> +		*off += len;
> +	} else if (etype == htons(ETHERTYPE_IPV6)) {
> +		/* Get next header field from IP header. */
> +		offs = *off + offsetof(struct ip6_hdr, ip6_nxt);
> +		if (memcpyv(proto, sizeof(*proto), offs, iov, iovcnt) == -1)
> +			return;
> +
> +		*off += sizeof(struct ip6_hdr);
> +	}
> +}
> +
> +static void
> +vhdr2thdr(struct virtio_net_hdr *vh, struct tun_hdr *th,
> +    const struct iovec *iov, int iovcnt)
> +{
> +	if (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
> +		size_t	off;
> +		uint8_t	proto;
> +
> +		hdr_extract(iov, iovcnt, &off, &proto);
> +
> +		switch (proto) {
> +		case IPPROTO_TCP:
> +			th->th_flags |= TUN_H_TCP_CSUM;
> +			break;
> +
> +		case IPPROTO_UDP:
> +			th->th_flags |= TUN_H_UDP_CSUM;
> +			break;
> +		}
> +	}
> +}
> +
> +static void
> +thdr2vhdr(struct tun_hdr *th, struct virtio_net_hdr *vh,
> +    const struct iovec *iov, int iovcnt)
> +{
> +	size_t	off;
> +	uint8_t	proto;
> +
> +	if (th->th_flags & (TUN_H_TCP_CSUM | TUN_H_UDP_CSUM)) {
> +		hdr_extract(iov, iovcnt, &off, &proto);
> +
> +		vh->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
> +		vh->csum_start = off;
> +
> +		switch (proto) {
> +		case IPPROTO_TCP:
> +			vh->csum_offset = offsetof(struct tcphdr, th_sum);
> +			break;
> +
> +		case IPPROTO_UDP:
> +			vh->csum_offset = offsetof(struct udphdr, uh_sum);
> +			break;
> +		}
> +	}
>  }
> Index: usr.sbin/vmd/virtio.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v
> diff -u -p -r1.124 virtio.c
> --- usr.sbin/vmd/virtio.c	12 May 2025 17:17:42 -0000	1.124
> +++ usr.sbin/vmd/virtio.c	21 May 2025 01:25:04 -0000
> @@ -55,6 +55,8 @@ SLIST_HEAD(virtio_dev_head, virtio_dev) 
>  
>  #define MAXPHYS	(64 * 1024)	/* max raw I/O transfer size */
>  
> +#define VIRTIO_NET_F_CSUM	(1<<0)
> +#define VIRTIO_NET_F_GUEST_CSUM	(1<<1)
>  #define VIRTIO_NET_F_MAC	(1<<5)
>  
>  #define VMMCI_F_TIMESYNC	(1<<0)
> @@ -622,6 +624,10 @@ virtio_init(struct vmd_vm *vm, int child
>  			/* MAC address has been assigned by the parent */
>  			memcpy(&dev->vionet.mac, &vmc->vmc_macs[i], 6);
>  			dev->vionet.cfg.device_feature = VIRTIO_NET_F_MAC;
> +
> +			/* offload checksum to the kernel */
> +			dev->vionet.cfg.device_feature = VIRTIO_NET_F_CSUM;
> +			dev->vionet.cfg.device_feature |= VIRTIO_NET_F_GUEST_CSUM;
>  
>  			dev->vionet.lockedmac =
>  			    vmc->vmc_ifflags[i] & VMIFF_LOCKED ? 1 : 0;
> Index: usr.sbin/vmd/virtio.h
> ===================================================================
> RCS file: /cvs/src/usr.sbin/vmd/virtio.h,v
> diff -u -p -r1.54 virtio.h
> --- usr.sbin/vmd/virtio.h	12 May 2025 17:17:42 -0000	1.54
> +++ usr.sbin/vmd/virtio.h	21 May 2025 01:25:04 -0000
> @@ -307,6 +307,9 @@ struct virtio_net_hdr {
>  /*	uint16_t num_buffers; */
>  };
>  
> +#define VIRTIO_NET_HDR_F_NEEDS_CSUM	1 /* flags */
> +#define VIRTIO_NET_HDR_F_DATA_VALID	2 /* flags */
> +
>  enum vmmci_cmd {
>  	VMMCI_NONE = 0,
>  	VMMCI_SHUTDOWN,