From: Alexander Bluhm
Subject: Re: vmd: add checksum offload for guests
To: Jan Klemkow
Cc: tech@openbsd.org
Date: Thu, 22 May 2025 11:14:12 +0900

On Wed, May 21, 2025 at 03:38:16AM +0200, Jan Klemkow wrote:
> Hi,
>
> This diff uses the tun(4) offload capabilities in the virtio
> implementation of vmd(8).  vmd(8) translates the virtio_net_hdr
> structures from the guests' packets to tun_hdr for tun(4) and vice versa.
> The offloading feature bits are dynamically negotiated with the guest,
> so legacy guests keep working with this feature.  The pledge part
> was discussed with deraadt.
>
> ok?

With this diff, Linux VM guests running on vmd(8) cannot use the
network anymore.

bluhm

> Index: sys/kern/kern_pledge.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_pledge.c,v
> diff -u -p -r1.323 kern_pledge.c
> --- sys/kern/kern_pledge.c	12 Feb 2025 14:11:26 -0000	1.323
> +++ sys/kern/kern_pledge.c	21 May 2025 01:25:04 -0000
> @@ -46,6 +46,7 @@
>  #include
>  #include
>  #include
> +#include
>  #include
>  #include
>  #include
> @@ -1342,6 +1343,12 @@ pledge_ioctl(struct proc *p, long com, s
>  		    (cdevsw[major(vp->v_rdev)].d_open == vmmopen)) {
>  			error = pledge_ioctl_vmm(p, com);
>  			if (error == 0)
> +				return 0;
> +		}
> +		if ((fp->f_type == DTYPE_VNODE) &&
> +		    (vp->v_type == VCHR) &&
> +		    (cdevsw[major(vp->v_rdev)].d_open == tapopen)) {
> +			if (com == TUNSCAP)
>  				return 0;
>  		}
>  	}
> Index: usr.sbin/vmd/vionet.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/vmd/vionet.c,v
> diff -u -p -r1.23 vionet.c
> --- usr.sbin/vmd/vionet.c	12 May 2025 17:17:42 -0000	1.23
> +++ usr.sbin/vmd/vionet.c	21 May 2025 01:25:04 -0000
> @@ -17,12 +17,18 @@
>   * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
>   */
>  #include
> +#include
>
>  #include
>  #include
>
>  #include
> +#include
>  #include
> +#include
> +#include
> +#include
> +#include
>  #include
>
>  #include
> @@ -33,11 +39,14 @@
>  #include
>  #include
>  #include
> +#include
>
>  #include "atomicio.h"
>  #include "virtio.h"
>  #include "vmd.h"
>
> +#define VIRTIO_NET_F_CSUM	(1 << 0)
> +#define VIRTIO_NET_F_GUEST_CSUM	(1 << 1)
>  #define VIRTIO_NET_F_MAC	(1 << 5)
>  #define RXQ	0
>  #define TXQ	1
> @@ -54,7 +63,7 @@ static void *rx_run_loop(void *);
>  static void *tx_run_loop(void *);
>  static int vionet_rx(struct vionet_dev *, int);
>  static ssize_t vionet_rx_copy(struct vionet_dev *, int, const struct iovec *,
> -    int, size_t);
> +    int, size_t, struct tun_hdr *th);
>  static ssize_t vionet_rx_zerocopy(struct vionet_dev *, int,
>      const struct iovec *, int);
>  static void vionet_rx_event(int, short, void *);
> @@ -70,6 +79,10 @@ static void read_pipe_rx(int, short, voi
>  static void read_pipe_tx(int, short, void *);
>  static void vionet_assert_pic_irq(struct virtio_dev *);
>  static void vionet_deassert_pic_irq(struct virtio_dev *);
> +static void vhdr2thdr(struct virtio_net_hdr *, struct tun_hdr *,
> +    const struct iovec *, int);
> +static void thdr2vhdr(struct tun_hdr *, struct virtio_net_hdr *,
> +    const struct iovec *, int);
>
>  /* Device Globals */
>  struct event ev_tap;
> @@ -101,6 +114,7 @@ vionet_main(int fd, int fd_vmm)
>  	struct vm_create_params *vcp;
>  	ssize_t sz;
>  	int ret;
> +	struct tun_capabilities tcap;
>
>  	/*
>  	 * stdio - needed for read/write to disk fds and channels to the vm.
> @@ -132,6 +146,14 @@ vionet_main(int fd, int fd_vmm)
>  	    ", vmm fd = %d", __func__, vionet->data_fd, dev.sync_fd,
>  	    dev.async_fd, fd_vmm);
>
> +	/*
> +	 * IFCAPs should be tweaked based on feature negotiation
> +	 * with the guest.
> +	 */
> +	tcap.tun_if_capabilities = 0;
> +	if (ioctl(vionet->data_fd, TUNSCAP, &tcap) == -1)
> +		fatal("tap(4) TUNSCAP");
> +
>  	/* Receive our vm information from the vm process. */
>  	memset(&vm, 0, sizeof(vm));
>  	sz = atomicio(read, dev.sync_fd, &vm, sizeof(vm));
> @@ -156,7 +178,7 @@ vionet_main(int fd, int fd_vmm)
>  	 * We no longer need /dev/vmm access.
>  	 */
>  	close_fd(fd_vmm);
> -	if (pledge("stdio", NULL) == -1)
> +	if (pledge("stdio vmm", NULL) == -1)
>  		fatal("pledge2");
>
>  	/* If we're restoring hardware, re-initialize virtqueue hva's. */
> @@ -315,6 +337,25 @@ fail:
>  }
>
>  /*
> + * Update and sync offload features with tap(4).
> + */
> +static void
> +vionet_update_offload(struct vionet_dev *dev)
> +{
> +	struct tun_capabilities tcap;
> +
> +	tcap.tun_if_capabilities = 0;
> +
> +	if (dev->cfg.guest_feature & VIRTIO_NET_F_GUEST_CSUM) {
> +		tcap.tun_if_capabilities |= IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4;
> +		tcap.tun_if_capabilities |= IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
> +	}
> +
> +	if (ioctl(dev->data_fd, TUNSCAP, &tcap) == -1)
> +		fatal("tap(4) TUNSCAP");
> +}
> +
> +/*
>   * Update the gpa and hva of the virtqueue.
>   */
>  static void
> @@ -383,7 +424,9 @@ vionet_rx(struct vionet_dev *dev, int fd
>  	struct vring_avail *avail;
>  	struct vring_used *used;
>  	struct virtio_vq_info *vq_info;
> +	struct virtio_net_hdr *vhp;
>  	struct iovec *iov;
> +	struct tun_hdr th;
>  	int notify = 0;
>  	ssize_t sz;
>  	uint8_t status = 0;
> @@ -414,7 +457,7 @@ vionet_rx(struct vionet_dev *dev, int fd
>  		goto reset;
>  	}
>
> -	iov = &iov_rx[0];
> +	iov = &iov_rx[1];
>  	iov_cnt = 1;
>
>  	/*
> @@ -436,6 +479,7 @@ vionet_rx(struct vionet_dev *dev, int fd
>  		if (iov->iov_base == NULL)
>  			goto reset;
>  		memset(iov->iov_base, 0, sizeof(struct virtio_net_hdr));
> +		vhp = iov->iov_base;
>
>  		/* Tweak the iovec to account for the virtio_net_hdr. */
>  		iov->iov_len -= sizeof(struct virtio_net_hdr);
> @@ -486,14 +530,19 @@ vionet_rx(struct vionet_dev *dev, int fd
>  		 */
>  		if (dev->lockedmac || fd != dev->data_fd)
>  			sz = vionet_rx_copy(dev, fd, iov_rx, iov_cnt,
> -			    chain_len);
> -		else
> +			    chain_len, &th);
> +		else {
> +			iov_rx[0].iov_base = &th;
> +			iov_rx[0].iov_len = sizeof(th);
>  			sz = vionet_rx_zerocopy(dev, fd, iov_rx, iov_cnt);
> +		}
>  		if (sz == -1)
>  			goto reset;
>  		if (sz == 0)	/* No packets, so bail out for now. */
>  			break;
>
> +		thdr2vhdr(&th, vhp, iov_rx + 1, iov_cnt - 1);
> +
>  		/*
>  		 * Account for the prefixed header since it wasn't included
>  		 * in the copy or zerocopy operations.
> @@ -533,9 +582,9 @@ reset:
>   */
>  ssize_t
>  vionet_rx_copy(struct vionet_dev *dev, int fd, const struct iovec *iov,
> -    int iov_cnt, size_t chain_len)
> +    int iov_cnt, size_t chain_len, struct tun_hdr *th)
>  {
> -	static uint8_t buf[VIONET_HARD_MTU];
> +	static uint8_t buf[sizeof(struct tun_hdr) + VIONET_HARD_MTU];
>  	struct packet *pkt = NULL;
>  	struct ether_header *eh = NULL;
>  	uint8_t *payload = buf;
> @@ -543,9 +592,10 @@ vionet_rx_copy(struct vionet_dev *dev, i
>  	ssize_t sz;
>
>  	/* If reading from the tap(4), try to right-size the read. */
> -	if (fd == dev->data_fd)
> -		nbytes = MIN(chain_len, VIONET_HARD_MTU);
> -	else if (fd == pipe_inject[READ])
> +	if (fd == dev->data_fd) {
> +		nbytes = sizeof(struct tun_hdr) +
> +		    MIN(chain_len, VIONET_HARD_MTU);
> +	} else if (fd == pipe_inject[READ])
>  		nbytes = sizeof(struct packet);
>  	else {
>  		log_warnx("%s: invalid fd: %d", __func__, fd);
> @@ -564,10 +614,20 @@ vionet_rx_copy(struct vionet_dev *dev, i
>  			return (-1);
>  		}
>  		return (0);
> -	} else if (fd == dev->data_fd && sz < VIONET_MIN_TXLEN) {
> +	} else if (fd == dev->data_fd) {
> +		if ((size_t)sz < sizeof(struct tun_hdr)) {
> +			log_warnx("%s: short tun_hdr", __func__);
> +			return (0);
> +		}
> +		memcpy(th, payload, sizeof *th);
> +		payload += sizeof(struct tun_hdr);
> +		sz -= sizeof(struct tun_hdr);
> +
>  		/* If reading the tap(4), we should get valid ethernet. */
> -		log_warnx("%s: invalid packet size", __func__);
> -		return (0);
> +		if (sz < VIONET_MIN_TXLEN) {
> +			log_warnx("%s: invalid packet size", __func__);
> +			return (0);
> +		}
>  	} else if (fd == pipe_inject[READ] && sz != sizeof(struct packet)) {
>  		log_warnx("%s: invalid injected packet object (sz=%ld)",
>  		    __func__, sz);
> @@ -729,6 +789,8 @@ vionet_tx(struct virtio_dev *dev)
>  	struct iovec *iov;
>  	struct packet pkt;
>  	uint8_t status = 0;
> +	struct virtio_net_hdr *vhp;
> +	struct tun_hdr th;
>
>  	status = vionet->cfg.device_status
>  	    & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK;
> @@ -756,8 +818,10 @@ vionet_tx(struct virtio_dev *dev)
>  		goto reset;
>  	}
>
> -	iov = &iov_tx[0];
> -	iov_cnt = 0;
> +	/* the 0th slot will be used by the tun_hdr */
> +
> +	iov = &iov_tx[1];
> +	iov_cnt = 1;
>  	chain_len = 0;
>
>  	/*
> @@ -765,18 +829,23 @@ vionet_tx(struct virtio_dev *dev)
>  	 * descriptor sized to the virtio_net_hdr. However, the framing
>  	 * is not guaranteed, so check for packet data.
>  	 */
> -	iov->iov_len = desc->len;
> -	if (iov->iov_len < sizeof(struct virtio_net_hdr)) {
> +	if (desc->len < sizeof(*vhp)) {
>  		log_warnx("%s: invalid descriptor length", __func__);
>  		goto reset;
> -	} else if (iov->iov_len > sizeof(struct virtio_net_hdr)) {
> -		/* Chop off the virtio header, leaving packet data. */
> -		iov->iov_len -= sizeof(struct virtio_net_hdr);
> -		chain_len += iov->iov_len;
> -		iov->iov_base = hvaddr_mem(desc->addr +
> -		    sizeof(struct virtio_net_hdr), iov->iov_len);
> +	}
> +
> +	/* Chop the virtio net header off */
> +	vhp = hvaddr_mem(desc->addr, sizeof(*vhp));
> +	if (vhp == NULL)
> +		goto reset;
> +
> +	iov->iov_len = desc->len - sizeof(*vhp);
> +	if (iov->iov_len > 0) {
> +		iov->iov_base = hvaddr_mem(desc->addr + sizeof(*vhp),
> +		    iov->iov_len);
>  		if (iov->iov_base == NULL)
>  			goto reset;
> +		chain_len += iov->iov_len;
>  		iov_cnt++;
>  	}
>
> @@ -820,7 +889,7 @@ vionet_tx(struct virtio_dev *dev)
>  	 * descriptor with packet data contains a large enough buffer
>  	 * for this inspection.
>  	 */
> -	iov = &iov_tx[0];
> +	iov = &iov_tx[1];
>  	if (vionet->lockedmac) {
>  		if (iov->iov_len < ETHER_HDR_LEN) {
>  			log_warnx("%s: insufficient header data",
> @@ -846,6 +915,17 @@ vionet_tx(struct virtio_dev *dev)
>  		}
>  	}
>
> +	memset(&th, 0, sizeof(th));
> +
> +	/*
> +	 * if we look at more of vhp we might need to copy
> +	 * it so it's aligned properly
> +	 */
> +	vhdr2thdr(vhp, &th, iov_tx + 1, iov_cnt - 1);
> +
> +	iov_tx[0].iov_base = &th;
> +	iov_tx[0].iov_len = sizeof(th);
> +
>  	/* Write our packet to the tap(4). */
>  	sz = writev(vionet->data_fd, iov_tx, iov_cnt);
>  	if (sz == -1 && errno != ENOBUFS) {
> @@ -1064,6 +1144,7 @@ handle_io_write(struct viodev_msg *msg,
>  		break;
>  	case VIRTIO_CONFIG_GUEST_FEATURES:
>  		vionet->cfg.guest_feature = data;
> +		vionet_update_offload(vionet);
>  		break;
>  	case VIRTIO_CONFIG_QUEUE_PFN:
>  		vionet->cfg.queue_pfn = data;
> @@ -1396,4 +1477,149 @@ vionet_deassert_pic_irq(struct virtio_de
>  	    &msg, sizeof(msg), ev_base_main);
>  	if (ret == -1)
>  		log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
> +}
> +
> +static int
> +memcpyv(void *buf, size_t len, size_t off, const struct iovec *iov, int iovcnt)
> +{
> +	uint8_t *dst = buf;
> +	size_t l;
> +
> +	for (;;) {
> +		if (iovcnt == 0)
> +			return (-1);
> +
> +		if (off < iov->iov_len)
> +			break;
> +
> +		off -= iov->iov_len;
> +		iov++;
> +		iovcnt--;
> +	}
> +
> +	l = off + len;
> +	if (l > iov->iov_len)
> +		l = iov->iov_len;
> +	l -= off;
> +
> +	memcpy(dst, (const uint8_t *)iov->iov_base + off, l);
> +	dst += l;
> +	len -= l;
> +
> +	if (len == 0)
> +		return (0);
> +
> +	for (;;) {
> +		if (iovcnt == 0)
> +			return (-1);
> +
> +		l = len;
> +		if (l > iov->iov_len)
> +			l = iov->iov_len;
> +
> +		memcpy(dst, (const uint8_t *)iov->iov_base, l);
> +		dst += l;
> +		len -= l;
> +
> +		if (len == 0)
> +			break;
> +
> +		iov++;
> +		iovcnt--;
> +	}
> +
> +	return (0);
> +}
> +
> +static void
> +hdr_extract(const struct iovec *iov, int iovcnt, size_t *off, uint8_t *proto)
> +{
> +	size_t offs;
> +	uint16_t etype;
> +
> +	if (memcpyv(&etype, sizeof(etype),
> +	    offsetof(struct ether_header, ether_type),
> +	    iov, iovcnt) == -1)
> +		return;
> +
> +	*off = sizeof(struct ether_header);
> +
> +	if (etype == htons(ETHERTYPE_VLAN)) {
> +		if (memcpyv(&etype, sizeof(etype),
> +		    offsetof(struct ether_vlan_header, evl_proto),
> +		    iov, iovcnt) == -1)
> +			return;
> +
> +		*off = sizeof(struct ether_vlan_header);
> +	}
> +
> +	if (etype == htons(ETHERTYPE_IP)) {
> +		uint16_t len;
> +
> +		/* Get ipproto field from IP header. */
> +		offs = *off + offsetof(struct ip, ip_p);
> +		if (memcpyv(proto, sizeof(*proto), offs, iov, iovcnt) == -1)
> +			return;
> +
> +		/* Get IP length field from IP header. */
> +		offs = *off + offsetof(struct ip, ip_len);
> +		if (memcpyv(&len, sizeof(len), offs, iov, iovcnt) == -1)
> +			return;
> +
> +		*off += len;
> +	} else if (etype == htons(ETHERTYPE_IPV6)) {
> +		/* Get next header field from IP header. */
> +		offs = *off + offsetof(struct ip6_hdr, ip6_nxt);
> +		if (memcpyv(proto, sizeof(*proto), offs, iov, iovcnt) == -1)
> +			return;
> +
> +		*off += sizeof(struct ip6_hdr);
> +	}
> +}
> +
> +static void
> +vhdr2thdr(struct virtio_net_hdr *vh, struct tun_hdr *th,
> +    const struct iovec *iov, int iovcnt)
> +{
> +	if (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
> +		size_t off;
> +		uint8_t proto;
> +
> +		hdr_extract(iov, iovcnt, &off, &proto);
> +
> +		switch (proto) {
> +		case IPPROTO_TCP:
> +			th->th_flags |= TUN_H_TCP_CSUM;
> +			break;
> +
> +		case IPPROTO_UDP:
> +			th->th_flags |= TUN_H_UDP_CSUM;
> +			break;
> +		}
> +	}
> +}
> +
> +static void
> +thdr2vhdr(struct tun_hdr *th, struct virtio_net_hdr *vh,
> +    const struct iovec *iov, int iovcnt)
> +{
> +	size_t off;
> +	uint8_t proto;
> +
> +	if (th->th_flags & (TUN_H_TCP_CSUM | TUN_H_UDP_CSUM)) {
> +		hdr_extract(iov, iovcnt, &off, &proto);
> +
> +		vh->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
> +		vh->csum_start = off;
> +
> +		switch (proto) {
> +		case IPPROTO_TCP:
> +			vh->csum_offset = offsetof(struct tcphdr, th_sum);
> +			break;
> +
> +		case IPPROTO_UDP:
> +			vh->csum_offset = offsetof(struct udphdr, uh_sum);
> +			break;
> +		}
> +	}
>  }
> Index: usr.sbin/vmd/virtio.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v
> diff -u -p -r1.124 virtio.c
> --- usr.sbin/vmd/virtio.c	12 May 2025 17:17:42 -0000	1.124
> +++ usr.sbin/vmd/virtio.c	21 May 2025 01:25:04 -0000
> @@ -55,6 +55,8 @@ SLIST_HEAD(virtio_dev_head, virtio_dev)
>
>  #define MAXPHYS	(64 * 1024)	/* max raw I/O transfer size */
>
> +#define VIRTIO_NET_F_CSUM	(1<<0)
> +#define VIRTIO_NET_F_GUEST_CSUM	(1<<1)
>  #define VIRTIO_NET_F_MAC	(1<<5)
>
>  #define VMMCI_F_TIMESYNC	(1<<0)
> @@ -622,6 +624,10 @@ virtio_init(struct vmd_vm *vm, int child
>  		/* MAC address has been assigned by the parent */
>  		memcpy(&dev->vionet.mac, &vmc->vmc_macs[i], 6);
>  		dev->vionet.cfg.device_feature = VIRTIO_NET_F_MAC;
> +
> +		/* offload checksum to the kernel */
> +		dev->vionet.cfg.device_feature = VIRTIO_NET_F_CSUM;
> +		dev->vionet.cfg.device_feature |= VIRTIO_NET_F_GUEST_CSUM;
>
>  		dev->vionet.lockedmac =
>  		    vmc->vmc_ifflags[i] & VMIFF_LOCKED ? 1 : 0;
> Index: usr.sbin/vmd/virtio.h
> ===================================================================
> RCS file: /cvs/src/usr.sbin/vmd/virtio.h,v
> diff -u -p -r1.54 virtio.h
> --- usr.sbin/vmd/virtio.h	12 May 2025 17:17:42 -0000	1.54
> +++ usr.sbin/vmd/virtio.h	21 May 2025 01:25:04 -0000
> @@ -307,6 +307,9 @@ struct virtio_net_hdr {
>  	/* uint16_t num_buffers; */
>  };
>
> +#define VIRTIO_NET_HDR_F_NEEDS_CSUM	1	/* flags */
> +#define VIRTIO_NET_HDR_F_DATA_VALID	2	/* flags */
> +
>  enum vmmci_cmd {
>  	VMMCI_NONE = 0,
>  	VMMCI_SHUTDOWN,
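
As an aside, the core of the translation in the diff's vhdr2thdr() boils
down to mapping the guest's "needs checksum" bit onto the matching tun(4)
offload flag for the transport protocol in the packet.  A reduced,
self-contained sketch of just that mapping follows; the struct layouts and
the TUN_H_* values below are simplified placeholders for illustration, not
the real net/if_tun.h definitions, and only the checksum bits handled by
the diff are shown.

	#include <sys/types.h>
	#include <netinet/in.h>		/* IPPROTO_TCP, IPPROTO_UDP */
	#include <stdint.h>

	#define VIRTIO_NET_HDR_F_NEEDS_CSUM	1	/* as in virtio.h above */

	/* placeholder values; the authoritative ones live in net/if_tun.h */
	#define TUN_H_TCP_CSUM			0x01
	#define TUN_H_UDP_CSUM			0x02

	struct virtio_net_hdr_sketch {		/* only the field used here */
		uint8_t		flags;
	};

	struct tun_hdr_sketch {			/* only the field used here */
		uint16_t	th_flags;
	};

	/*
	 * If the guest asked the host to fill in the checksum
	 * (VIRTIO_NET_HDR_F_NEEDS_CSUM), request the corresponding
	 * tun(4) offload for the given transport protocol.
	 */
	static void
	sketch_vhdr2thdr(const struct virtio_net_hdr_sketch *vh,
	    struct tun_hdr_sketch *th, uint8_t ipproto)
	{
		if ((vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) == 0)
			return;

		switch (ipproto) {
		case IPPROTO_TCP:
			th->th_flags |= TUN_H_TCP_CSUM;
			break;
		case IPPROTO_UDP:
			th->th_flags |= TUN_H_UDP_CSUM;
			break;
		}
	}

The receive path (thdr2vhdr() in the diff) is the inverse of this mapping,
with csum_start/csum_offset filled in from the parsed headers.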