Index | Thread | Search

From:
Jan Klemkow <jan@openbsd.org>
Subject:
vmd: add checksum offload for guests
To:
tech@openbsd.org
Date:
Wed, 21 May 2025 03:38:16 +0200

Download raw body.

Thread
Hi,

This diff uses the tun(4) offload capabilities in the virtio
implementation of vmd(8).  vmd(8) translates the virtio_net_hdr
structures from the guests packets to tun_hdr for tun(4) and vice versa.
The offloading features bits are dynamically negotiated with the guest.
So, legacy guests are still working with this feature.  The pledge part
was discussed with deraadt.

ok?

bye,
Jan

Index: sys/kern/kern_pledge.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_pledge.c,v
diff -u -p -r1.323 kern_pledge.c
--- sys/kern/kern_pledge.c	12 Feb 2025 14:11:26 -0000	1.323
+++ sys/kern/kern_pledge.c	21 May 2025 01:25:04 -0000
@@ -46,6 +46,7 @@
 #include <net/route.h>
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_tun.h>
 #include <netinet/in.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/nd6.h>
@@ -1342,6 +1343,12 @@ pledge_ioctl(struct proc *p, long com, s
 		    (cdevsw[major(vp->v_rdev)].d_open == vmmopen)) {
 			error = pledge_ioctl_vmm(p, com);
 			if (error == 0)
+				return 0;
+		}
+		if ((fp->f_type == DTYPE_VNODE) &&
+		    (vp->v_type == VCHR) &&
+		    (cdevsw[major(vp->v_rdev)].d_open == tapopen)) {
+			if (com == TUNSCAP)
 				return 0;
 		}
 	}
Index: usr.sbin/vmd/vionet.c
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/vionet.c,v
diff -u -p -r1.23 vionet.c
--- usr.sbin/vmd/vionet.c	12 May 2025 17:17:42 -0000	1.23
+++ usr.sbin/vmd/vionet.c	21 May 2025 01:25:04 -0000
@@ -17,12 +17,18 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 #include <sys/types.h>
+#include <sys/ioctl.h>
 
 #include <dev/pci/virtio_pcireg.h>
 #include <dev/pv/virtioreg.h>
 
 #include <net/if.h>
+#include <net/if_tun.h>
 #include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
 #include <netinet/if_ether.h>
 
 #include <errno.h>
@@ -33,11 +39,14 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <stddef.h>
 
 #include "atomicio.h"
 #include "virtio.h"
 #include "vmd.h"
 
+#define VIRTIO_NET_F_CSUM	(1 << 0)
+#define VIRTIO_NET_F_GUEST_CSUM	(1 << 1)
 #define VIRTIO_NET_F_MAC	(1 << 5)
 #define RXQ	0
 #define TXQ	1
@@ -54,7 +63,7 @@ static void *rx_run_loop(void *);
 static void *tx_run_loop(void *);
 static int vionet_rx(struct vionet_dev *, int);
 static ssize_t vionet_rx_copy(struct vionet_dev *, int, const struct iovec *,
-    int, size_t);
+    int, size_t, struct tun_hdr *th);
 static ssize_t vionet_rx_zerocopy(struct vionet_dev *, int,
     const struct iovec *, int);
 static void vionet_rx_event(int, short, void *);
@@ -70,6 +79,10 @@ static void read_pipe_rx(int, short, voi
 static void read_pipe_tx(int, short, void *);
 static void vionet_assert_pic_irq(struct virtio_dev *);
 static void vionet_deassert_pic_irq(struct virtio_dev *);
+static void vhdr2thdr(struct virtio_net_hdr *, struct tun_hdr *,
+    const struct iovec *, int);
+static void thdr2vhdr(struct tun_hdr *, struct virtio_net_hdr *,
+    const struct iovec *, int);
 
 /* Device Globals */
 struct event ev_tap;
@@ -101,6 +114,7 @@ vionet_main(int fd, int fd_vmm)
 	struct vm_create_params	*vcp;
 	ssize_t			 sz;
 	int			 ret;
+	struct tun_capabilities	 tcap;
 
 	/*
 	 * stdio - needed for read/write to disk fds and channels to the vm.
@@ -132,6 +146,14 @@ vionet_main(int fd, int fd_vmm)
 	    ", vmm fd = %d", __func__, vionet->data_fd, dev.sync_fd,
 	    dev.async_fd, fd_vmm);
 
+	/*
+	 * IFCAPs should be tweaked based on feature negotiation
+	 * with the guest.
+	 */
+	tcap.tun_if_capabilities = 0;
+	if (ioctl(vionet->data_fd, TUNSCAP, &tcap) == -1)
+		fatal("tap(4) TUNSCAP");
+
 	/* Receive our vm information from the vm process. */
 	memset(&vm, 0, sizeof(vm));
 	sz = atomicio(read, dev.sync_fd, &vm, sizeof(vm));
@@ -156,7 +178,7 @@ vionet_main(int fd, int fd_vmm)
 	 * We no longer need /dev/vmm access.
 	 */
 	close_fd(fd_vmm);
-	if (pledge("stdio", NULL) == -1)
+	if (pledge("stdio vmm", NULL) == -1)
 		fatal("pledge2");
 
 	/* If we're restoring hardware, re-initialize virtqueue hva's. */
@@ -315,6 +337,25 @@ fail:
 }
 
 /*
+ * Update and sync offload features with tap(4).
+ */
+static void
+vionet_update_offload(struct vionet_dev *dev)
+{
+	struct tun_capabilities	 tcap;
+
+	tcap.tun_if_capabilities = 0;
+
+	if (dev->cfg.guest_feature & VIRTIO_NET_F_GUEST_CSUM) {
+		tcap.tun_if_capabilities |= IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4;
+		tcap.tun_if_capabilities |= IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
+	}
+
+	if (ioctl(dev->data_fd, TUNSCAP, &tcap) == -1)
+		fatal("tap(4) TUNSCAP");
+}
+
+/*
  * Update the gpa and hva of the virtqueue.
  */
 static void
@@ -383,7 +424,9 @@ vionet_rx(struct vionet_dev *dev, int fd
 	struct vring_avail *avail;
 	struct vring_used *used;
 	struct virtio_vq_info *vq_info;
+	struct virtio_net_hdr *vhp;
 	struct iovec *iov;
+	struct tun_hdr th;
 	int notify = 0;
 	ssize_t sz;
 	uint8_t status = 0;
@@ -414,7 +457,7 @@ vionet_rx(struct vionet_dev *dev, int fd
 			goto reset;
 		}
 
-		iov = &iov_rx[0];
+		iov = &iov_rx[1];
 		iov_cnt = 1;
 
 		/*
@@ -436,6 +479,7 @@ vionet_rx(struct vionet_dev *dev, int fd
 		if (iov->iov_base == NULL)
 			goto reset;
 		memset(iov->iov_base, 0, sizeof(struct virtio_net_hdr));
+		vhp = iov->iov_base;
 
 		/* Tweak the iovec to account for the virtio_net_hdr. */
 		iov->iov_len -= sizeof(struct virtio_net_hdr);
@@ -486,14 +530,19 @@ vionet_rx(struct vionet_dev *dev, int fd
 		 */
 		if (dev->lockedmac || fd != dev->data_fd)
 			sz = vionet_rx_copy(dev, fd, iov_rx, iov_cnt,
-			    chain_len);
-		else
+			    chain_len, &th);
+		else {
+			iov_rx[0].iov_base = &th;
+			iov_rx[0].iov_len = sizeof(th);
 			sz = vionet_rx_zerocopy(dev, fd, iov_rx, iov_cnt);
+		}
 		if (sz == -1)
 			goto reset;
 		if (sz == 0)	/* No packets, so bail out for now. */
 			break;
 
+		thdr2vhdr(&th, vhp, iov_rx + 1, iov_cnt - 1);
+
 		/*
 		 * Account for the prefixed header since it wasn't included
 		 * in the copy or zerocopy operations.
@@ -533,9 +582,9 @@ reset:
  */
 ssize_t
 vionet_rx_copy(struct vionet_dev *dev, int fd, const struct iovec *iov,
-    int iov_cnt, size_t chain_len)
+    int iov_cnt, size_t chain_len, struct tun_hdr *th)
 {
-	static uint8_t		 buf[VIONET_HARD_MTU];
+	static uint8_t		 buf[sizeof(struct tun_hdr) + VIONET_HARD_MTU];
 	struct packet		*pkt = NULL;
 	struct ether_header	*eh = NULL;
 	uint8_t			*payload = buf;
@@ -543,9 +592,10 @@ vionet_rx_copy(struct vionet_dev *dev, i
 	ssize_t			 sz;
 
 	/* If reading from the tap(4), try to right-size the read. */
-	if (fd == dev->data_fd)
-		nbytes = MIN(chain_len, VIONET_HARD_MTU);
-	else if (fd == pipe_inject[READ])
+	if (fd == dev->data_fd) {
+		nbytes = sizeof(struct tun_hdr) +
+		    MIN(chain_len, VIONET_HARD_MTU);
+	} else if (fd == pipe_inject[READ])
 		nbytes = sizeof(struct packet);
 	else {
 		log_warnx("%s: invalid fd: %d", __func__, fd);
@@ -564,10 +614,20 @@ vionet_rx_copy(struct vionet_dev *dev, i
 			return (-1);
 		}
 		return (0);
-	} else if (fd == dev->data_fd && sz < VIONET_MIN_TXLEN) {
+	} else if (fd == dev->data_fd) {
+		if ((size_t)sz < sizeof(struct tun_hdr)) {
+			log_warnx("%s: short tun_hdr", __func__);
+			return (0);
+		}
+		memcpy(th, payload, sizeof *th);
+		payload += sizeof(struct tun_hdr);
+		sz -= sizeof(struct tun_hdr);
+
 		/* If reading the tap(4), we should get valid ethernet. */
-		log_warnx("%s: invalid packet size", __func__);
-		return (0);
+		if (sz < VIONET_MIN_TXLEN) {
+			log_warnx("%s: invalid packet size", __func__);
+			return (0);
+		}
 	} else if (fd == pipe_inject[READ] && sz != sizeof(struct packet)) {
 		log_warnx("%s: invalid injected packet object (sz=%ld)",
 		    __func__, sz);
@@ -729,6 +789,8 @@ vionet_tx(struct virtio_dev *dev)
 	struct iovec *iov;
 	struct packet pkt;
 	uint8_t status = 0;
+	struct virtio_net_hdr *vhp;
+	struct tun_hdr th;
 
 	status = vionet->cfg.device_status
 	    & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK;
@@ -756,8 +818,10 @@ vionet_tx(struct virtio_dev *dev)
 			goto reset;
 		}
 
-		iov = &iov_tx[0];
-		iov_cnt = 0;
+		/* the 0th slot will by used by the tun_hdr */
+
+		iov = &iov_tx[1];
+		iov_cnt = 1;
 		chain_len = 0;
 
 		/*
@@ -765,18 +829,23 @@ vionet_tx(struct virtio_dev *dev)
 		 * descriptor sized to the virtio_net_hdr. However, the framing
 		 * is not guaranteed, so check for packet data.
 		 */
-		iov->iov_len = desc->len;
-		if (iov->iov_len < sizeof(struct virtio_net_hdr)) {
+		if (desc->len < sizeof(*vhp)) {
 			log_warnx("%s: invalid descriptor length", __func__);
 			goto reset;
-		} else if (iov->iov_len > sizeof(struct virtio_net_hdr)) {
-			/* Chop off the virtio header, leaving packet data. */
-			iov->iov_len -= sizeof(struct virtio_net_hdr);
-			chain_len += iov->iov_len;
-			iov->iov_base = hvaddr_mem(desc->addr +
-			    sizeof(struct virtio_net_hdr), iov->iov_len);
+		}
+
+		/* Chop the virtio net header off */
+		vhp = hvaddr_mem(desc->addr, sizeof(*vhp));
+		if (vhp == NULL)
+			goto reset;
+
+		iov->iov_len = desc->len - sizeof(*vhp);
+		if (iov->iov_len > 0) {
+			iov->iov_base = hvaddr_mem(desc->addr + sizeof(*vhp),
+			    iov->iov_len);
 			if (iov->iov_base == NULL)
 				goto reset;
+			chain_len += iov->iov_len;
 			iov_cnt++;
 		}
 
@@ -820,7 +889,7 @@ vionet_tx(struct virtio_dev *dev)
 		 * descriptor with packet data contains a large enough buffer
 		 * for this inspection.
 		 */
-		iov = &iov_tx[0];
+		iov = &iov_tx[1];
 		if (vionet->lockedmac) {
 			if (iov->iov_len < ETHER_HDR_LEN) {
 				log_warnx("%s: insufficient header data",
@@ -846,6 +915,17 @@ vionet_tx(struct virtio_dev *dev)
 			}
 		}
 
+		memset(&th, 0, sizeof(th));
+
+		/*
+		 * if we look at more of vhp we might need to copy
+		 * it so it's aligned properly
+		 */
+		vhdr2thdr(vhp, &th, iov_tx + 1, iov_cnt - 1);
+
+		iov_tx[0].iov_base = &th;
+		iov_tx[0].iov_len = sizeof(th);
+
 		/* Write our packet to the tap(4). */
 		sz = writev(vionet->data_fd, iov_tx, iov_cnt);
 		if (sz == -1 && errno != ENOBUFS) {
@@ -1064,6 +1144,7 @@ handle_io_write(struct viodev_msg *msg, 
 		break;
 	case VIRTIO_CONFIG_GUEST_FEATURES:
 		vionet->cfg.guest_feature = data;
+		vionet_update_offload(vionet);
 		break;
 	case VIRTIO_CONFIG_QUEUE_PFN:
 		vionet->cfg.queue_pfn = data;
@@ -1396,4 +1477,149 @@ vionet_deassert_pic_irq(struct virtio_de
 	    &msg, sizeof(msg), ev_base_main);
 	if (ret == -1)
 		log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
+}
+
+static int
+memcpyv(void *buf, size_t len, size_t off, const struct iovec *iov, int iovcnt)
+{
+	uint8_t *dst = buf;
+	size_t l;
+
+	for (;;) {
+		if (iovcnt == 0)
+			return (-1);
+
+		if (off < iov->iov_len)
+			break;
+
+		off -= iov->iov_len;
+		iov++;
+		iovcnt--;
+	}
+
+	l = off + len;
+	if (l > iov->iov_len)
+		l = iov->iov_len;
+	l -= off;
+
+	memcpy(dst, (const uint8_t *)iov->iov_base + off, l);
+	dst += l;
+	len -= l;
+
+	if (len == 0)
+		return (0);
+
+	for (;;) {
+		if (iovcnt == 0)
+			return (-1);
+
+		l = len;
+		if (l > iov->iov_len)
+			l = iov->iov_len;
+
+		memcpy(dst, (const uint8_t *)iov->iov_base, l);
+		dst += l;
+		len -= l;
+
+		if (len == 0)
+			break;
+
+		iov++;
+		iovcnt--;
+	}
+
+	return (0);
+}
+
+static void
+hdr_extract(const struct iovec *iov, int iovcnt, size_t *off, uint8_t *proto)
+{
+	size_t		offs;
+	uint16_t	etype;
+
+	if (memcpyv(&etype, sizeof(etype),
+	    offsetof(struct ether_header, ether_type),
+	    iov, iovcnt) == -1)
+		return;
+
+	*off = sizeof(struct ether_header);
+
+	if (etype == htons(ETHERTYPE_VLAN)) {
+		if (memcpyv(&etype, sizeof(etype),
+		    offsetof(struct ether_vlan_header, evl_proto),
+		    iov, iovcnt) == -1)
+			return;
+
+		*off = sizeof(struct ether_vlan_header);
+	}
+
+	if (etype == htons(ETHERTYPE_IP)) {
+		uint16_t len;
+
+		/* Get ipproto field from IP header. */
+		offs = *off + offsetof(struct ip, ip_p);
+		if (memcpyv(proto, sizeof(*proto), offs, iov, iovcnt) == -1)
+			return;
+
+		/* Get IP length field from IP header. */
+		offs = *off + offsetof(struct ip, ip_len);
+		if (memcpyv(&len, sizeof(len), offs, iov, iovcnt) == -1)
+			return;
+
+		*off += len;
+	} else if (etype == htons(ETHERTYPE_IPV6)) {
+		/* Get next header field from IP header. */
+		offs = *off + offsetof(struct ip6_hdr, ip6_nxt);
+		if (memcpyv(proto, sizeof(*proto), offs, iov, iovcnt) == -1)
+			return;
+
+		*off += sizeof(struct ip6_hdr);
+	}
+}
+
+static void
+vhdr2thdr(struct virtio_net_hdr *vh, struct tun_hdr *th,
+    const struct iovec *iov, int iovcnt)
+{
+	if (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+		size_t	off;
+		uint8_t	proto;
+
+		hdr_extract(iov, iovcnt, &off, &proto);
+
+		switch (proto) {
+		case IPPROTO_TCP:
+			th->th_flags |= TUN_H_TCP_CSUM;
+			break;
+
+		case IPPROTO_UDP:
+			th->th_flags |= TUN_H_UDP_CSUM;
+			break;
+		}
+	}
+}
+
+static void
+thdr2vhdr(struct tun_hdr *th, struct virtio_net_hdr *vh,
+    const struct iovec *iov, int iovcnt)
+{
+	size_t	off;
+	uint8_t	proto;
+
+	if (th->th_flags & (TUN_H_TCP_CSUM | TUN_H_UDP_CSUM)) {
+		hdr_extract(iov, iovcnt, &off, &proto);
+
+		vh->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
+		vh->csum_start = off;
+
+		switch (proto) {
+		case IPPROTO_TCP:
+			vh->csum_offset = offsetof(struct tcphdr, th_sum);
+			break;
+
+		case IPPROTO_UDP:
+			vh->csum_offset = offsetof(struct udphdr, uh_sum);
+			break;
+		}
+	}
 }
Index: usr.sbin/vmd/virtio.c
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v
diff -u -p -r1.124 virtio.c
--- usr.sbin/vmd/virtio.c	12 May 2025 17:17:42 -0000	1.124
+++ usr.sbin/vmd/virtio.c	21 May 2025 01:25:04 -0000
@@ -55,6 +55,8 @@ SLIST_HEAD(virtio_dev_head, virtio_dev) 
 
 #define MAXPHYS	(64 * 1024)	/* max raw I/O transfer size */
 
+#define VIRTIO_NET_F_CSUM	(1<<0)
+#define VIRTIO_NET_F_GUEST_CSUM	(1<<1)
 #define VIRTIO_NET_F_MAC	(1<<5)
 
 #define VMMCI_F_TIMESYNC	(1<<0)
@@ -622,6 +624,10 @@ virtio_init(struct vmd_vm *vm, int child
 			/* MAC address has been assigned by the parent */
 			memcpy(&dev->vionet.mac, &vmc->vmc_macs[i], 6);
 			dev->vionet.cfg.device_feature = VIRTIO_NET_F_MAC;
+
+			/* offload checksum to the kernel */
+			dev->vionet.cfg.device_feature = VIRTIO_NET_F_CSUM;
+			dev->vionet.cfg.device_feature |= VIRTIO_NET_F_GUEST_CSUM;
 
 			dev->vionet.lockedmac =
 			    vmc->vmc_ifflags[i] & VMIFF_LOCKED ? 1 : 0;
Index: usr.sbin/vmd/virtio.h
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/virtio.h,v
diff -u -p -r1.54 virtio.h
--- usr.sbin/vmd/virtio.h	12 May 2025 17:17:42 -0000	1.54
+++ usr.sbin/vmd/virtio.h	21 May 2025 01:25:04 -0000
@@ -307,6 +307,9 @@ struct virtio_net_hdr {
 /*	uint16_t num_buffers; */
 };
 
+#define VIRTIO_NET_HDR_F_NEEDS_CSUM	1 /* flags */
+#define VIRTIO_NET_HDR_F_DATA_VALID	2 /* flags */
+
 enum vmmci_cmd {
 	VMMCI_NONE = 0,
 	VMMCI_SHUTDOWN,