Download raw body.
vmd: add checksum offload for guests
On Sat, Jan 17, 2026 at 11:38:50AM -0500, Dave Voutila wrote:
> Mike Larkin <mlarkin@nested.page> writes:
>
> > On Fri, Jan 16, 2026 at 07:38:16PM +0100, Jan Klemkow wrote:
> >> On Thu, Jan 15, 2026 at 02:08:43PM -0800, Mike Larkin wrote:
> >> > Does this "just work" no matter what guests I run? That's really all I care
> >> > about.
> >>
> >> Here is my current diff for checksum offloading in vmd(8).
> >>
> >> I tested the following combination of features:
> >>
> >> - Debian/Linux and OpenBSD-current guests
> >> - OpenBSD-current vio(4) w/o all offloading features
> >> - Linux, OpenBSD and Hostsystem via veb(4) and vlan(4)
> >> - IPv4 and IPv6 with tcpbench(1)
> >> - local interface locked lladdr
> >> - local interface dhcp
> >>
> >> Further tests are welcome!
> >>
> >> ok?
> >>
> >> Thanks,
> >> Jan
> >>
> >
> > Not sure about dv@, but I can't really review this. it's hundreds of lines
> > of changes in vmd vionet that require a level of understanding of tap(4) and
> > in virtio/vionet (and the network stack in general) that I don't have.
> > When I did the original vionet in vmd years ago it was pretty straightforward
> > since the spec (for *all* virtio) was only like 20 pages. I was able to write
> > that code in a weekend. now that we have bolted on all this other stuff, I
> > don't feel comfortable giving oks in this area anymore since there is no way
> > I can look at this and know if it's right or not. I think you need a network
> > stack person to ok this, *and* explain what the ramifications are for vmd
> > in general. It looks like vmd is doing inspection of every packet now? I
> > dont think we want that.
>
> I've spent time digging into this and better understand it now. I'm also
> happy now with how the current diff isn't expanding pledges for vionet.
>
> It feels overkill to have to poke every packet, but I do manage to see a
> small improvement in the one test I did using iperf3 sending from host
> to guest. It's only about 1-2% gain in throughput on my Intel x1c gen10
> and less than 1% on my newer Ryzen AI 350 machine. (This was using a
> -current snapshot for the guest.)
>
> I did this both with the "local interface" (where we already inspect
> each packet to intercept DHCP packets) and one added to a veb(4) device
> with an accompanying host-side vport(4).
>
> My hypothesis is the gain is mostly due to offloading work from the
> single-vcpu guest to the host vionet tx or rx threads.
>
> Is it worth it? Especially knowing we're technically shortcutting the
> actual spec as written by attesting for every packet checksum being
> good? /shrug
>
> Does someone have a better benchmark showing this moves the needle?
This version of the diff reduces the code complexity for the whole
system. dlg@ is right, that the work is shifted from vmd(8) to tun(4).
But, the kernel already has proven infrastructure for this with
ether_extract_headers(). So, tun(4) can do this in just a few lines.
Without introducing packet parsing up to the TCP layer in vmd(8).
From the feature perspective we just lost VLAN tagging and IP header
checksum offloading for using virtio_net_hdr in favor of tun_hdr. But,
both have a non-measurable impact on performance.
This diff contains just the checksum part so you can truly compare it to
my last diff. The next will bring a PoC version of TSO to measure if
it's worth it.
Index: sys/kern/kern_pledge.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_pledge.c,v
diff -u -p -r1.335 kern_pledge.c
--- sys/kern/kern_pledge.c 13 Nov 2025 20:59:14 -0000 1.335
+++ sys/kern/kern_pledge.c 6 Feb 2026 16:29:28 -0000
@@ -46,6 +46,7 @@
#include <net/route.h>
#include <net/if.h>
#include <net/if_var.h>
+#include <net/if_tun.h>
#include <netinet/in.h>
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
@@ -1337,6 +1338,12 @@ pledge_ioctl(struct proc *p, long com, s
cdevsw[major(vp->v_rdev)].d_open == vmmopen) {
error = pledge_ioctl_vmm(p, com);
if (error == 0)
+ return 0;
+ }
+ if ((fp->f_type == DTYPE_VNODE) &&
+ (vp->v_type == VCHR) &&
+ (cdevsw[major(vp->v_rdev)].d_open == tapopen)) {
+ if (com == TUNSCAP)
return 0;
}
}
Index: sys/net/if_tun.c
===================================================================
RCS file: /cvs/src/sys/net/if_tun.c,v
diff -u -p -r1.256 if_tun.c
--- sys/net/if_tun.c 14 Dec 2025 01:51:26 -0000 1.256
+++ sys/net/if_tun.c 6 Feb 2026 22:42:52 -0000
@@ -57,6 +57,7 @@
#include <net/rtable.h>
#include <netinet/in.h>
+#include <netinet/udp.h>
#include <netinet/if_ether.h>
/* for tun_input_process */
@@ -87,11 +88,11 @@ struct tun_softc {
struct sigio_ref sc_sigio; /* async I/O registration */
unsigned int sc_flags; /* misc flags */
#define TUN_DEAD (1 << 16)
-#define TUN_HDR (1 << 17)
dev_t sc_dev;
struct refcnt sc_refs;
unsigned int sc_reading;
+ struct tun_capabilities sc_cap;
};
#ifdef TUN_DEBUG
@@ -105,10 +106,8 @@ int tundebug = TUN_DEBUG;
#define TUN_IFF_FLAGS (IFF_POINTOPOINT|IFF_MULTICAST|IFF_BROADCAST)
#define TUN_IF_CAPS ( \
- IFCAP_CSUM_IPv4 | \
- IFCAP_CSUM_TCPv4|IFCAP_CSUM_UDPv4|IFCAP_CSUM_TCPv6|IFCAP_CSUM_UDPv6 | \
- IFCAP_VLAN_MTU|IFCAP_VLAN_HWTAGGING|IFCAP_VLAN_HWOFFLOAD | \
- IFCAP_TSOv4|IFCAP_TSOv6|IFCAP_LRO \
+ VIRTIO_NET_F_CSUM | \
+ VIRTIO_NET_F_GUEST_CSUM \
)
void tunattach(int);
@@ -167,6 +166,25 @@ struct if_clone tun_cloner =
struct if_clone tap_cloner =
IF_CLONE_INITIALIZER("tap", tap_clone_create, tun_clone_destroy);
+/* Packet header structure */
+struct virtio_net_hdr {
+ uint8_t flags;
+ uint8_t gso_type;
+ uint16_t hdr_len;
+ uint16_t gso_size;
+ uint16_t csum_start;
+ uint16_t csum_offset;
+
+ /* only present if VIRTIO_NET_F_MRG_RXBUF is negotiated */
+ uint16_t num_buffers;
+} __packed;
+
+#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* flags */
+#define VIRTIO_NET_HDR_F_DATA_VALID 2 /* flags */
+
+#define VIRTIO_NET_F_CSUM (1ULL<<0)
+#define VIRTIO_NET_F_GUEST_CSUM (1ULL<<1)
+
void
tunattach(int n)
{
@@ -505,11 +523,11 @@ tun_dev_close(dev_t dev, struct proc *p)
*/
NET_LOCK();
CLR(ifp->if_flags, IFF_UP | IFF_RUNNING);
- CLR(ifp->if_capabilities, TUN_IF_CAPS);
+ ifp->if_capabilities = 0;
NET_UNLOCK();
ifq_purge(&ifp->if_snd);
- CLR(sc->sc_flags, TUN_ASYNC|TUN_HDR);
+ CLR(sc->sc_flags, TUN_ASYNC);
sigio_free(&sc->sc_sigio);
if (!ISSET(sc->sc_flags, TUN_DEAD)) {
@@ -640,44 +658,51 @@ tapioctl(dev_t dev, u_long cmd, caddr_t
static int
tun_set_capabilities(struct tun_softc *sc, const struct tun_capabilities *cap)
{
- if (ISSET(cap->tun_if_capabilities, ~TUN_IF_CAPS))
+ if (cap->vnet_hdr_size != 0 &&
+ cap->vnet_hdr_size != sizeof(struct virtio_net_hdr))
return (EINVAL);
- KERNEL_ASSERT_LOCKED();
- SET(sc->sc_flags, TUN_HDR);
+ if (cap->vnet_hdr_size == 0 && cap->tun_if_capabilities != 0)
+ return (EINVAL);
+
+ if (ISSET(cap->tun_if_capabilities, ~TUN_IF_CAPS))
+ return (EINVAL);
NET_LOCK();
- CLR(sc->sc_if.if_capabilities, TUN_IF_CAPS);
- SET(sc->sc_if.if_capabilities, cap->tun_if_capabilities);
+ sc->sc_cap.tun_if_capabilities = cap->tun_if_capabilities;
+ sc->sc_cap.vnet_hdr_size = cap->vnet_hdr_size;
+
+ if (ISSET(sc->sc_cap.tun_if_capabilities, VIRTIO_NET_F_GUEST_CSUM)) {
+ SET(sc->sc_if.if_capabilities, IFCAP_CSUM_TCPv4);
+ SET(sc->sc_if.if_capabilities, IFCAP_CSUM_TCPv6);
+ SET(sc->sc_if.if_capabilities, IFCAP_CSUM_UDPv4);
+ SET(sc->sc_if.if_capabilities, IFCAP_CSUM_UDPv6);
+ }
NET_UNLOCK();
+
return (0);
}
static int
tun_get_capabilities(struct tun_softc *sc, struct tun_capabilities *cap)
{
- int error = 0;
-
NET_LOCK_SHARED();
- if (ISSET(sc->sc_flags, TUN_HDR)) {
- cap->tun_if_capabilities =
- (sc->sc_if.if_capabilities & TUN_IF_CAPS);
- } else
- error = ENODEV;
+ cap->tun_if_capabilities = sc->sc_cap.tun_if_capabilities;
+ cap->vnet_hdr_size = sc->sc_cap.vnet_hdr_size;
NET_UNLOCK_SHARED();
- return (error);
+ return (0);
}
static int
tun_del_capabilities(struct tun_softc *sc)
{
- NET_LOCK();
- CLR(sc->sc_if.if_capabilities, TUN_IF_CAPS);
- NET_UNLOCK();
+ struct tun_capabilities cap = {
+ .tun_if_capabilities = 0,
+ .vnet_hdr_size = 0
+ };
- KERNEL_ASSERT_LOCKED();
- CLR(sc->sc_flags, TUN_HDR);
+ tun_set_capabilities(sc, &cap);
return (0);
}
@@ -689,8 +714,8 @@ tun_hdatalen(struct tun_softc *sc)
int len;
len = ifq_hdatalen(&ifp->if_snd);
- if (len > 0 && ISSET(sc->sc_flags, TUN_HDR))
- len += sizeof(struct tun_hdr);
+ if (len > 0)
+ len += sc->sc_cap.vnet_hdr_size;
return (len);
}
@@ -842,38 +867,37 @@ tun_dev_read(dev_t dev, struct uio *uio,
ifp->if_bpf_mtap(ifp->if_bpf, m0, BPF_DIRECTION_OUT);
#endif
- if (ISSET(sc->sc_flags, TUN_HDR)) {
- struct tun_hdr th;
+ if (sc->sc_cap.vnet_hdr_size != 0) {
+ struct virtio_net_hdr vh;
+
+ memset(&vh, 0, sizeof vh);
KASSERT(ISSET(m0->m_flags, M_PKTHDR));
- th.th_flags = 0;
- if (ISSET(m0->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT))
- SET(th.th_flags, TUN_H_IPV4_CSUM);
- if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT))
- SET(th.th_flags, TUN_H_TCP_CSUM);
- if (ISSET(m0->m_pkthdr.csum_flags, M_UDP_CSUM_OUT))
- SET(th.th_flags, TUN_H_UDP_CSUM);
- if (ISSET(m0->m_pkthdr.csum_flags, M_ICMP_CSUM_OUT))
- SET(th.th_flags, TUN_H_ICMP_CSUM);
-
- th.th_pad = 0;
-
- th.th_vtag = 0;
- if (ISSET(m0->m_flags, M_VLANTAG)) {
- SET(th.th_flags, TUN_H_VTAG);
- th.th_vtag = m0->m_pkthdr.ether_vtag;
- }
-
- th.th_mss = 0;
- if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO)) {
- SET(th.th_flags, TUN_H_TCP_MSS);
- th.th_mss = m0->m_pkthdr.ph_mss;
+ if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT) ||
+ ISSET(m0->m_pkthdr.csum_flags, M_UDP_CSUM_OUT)) {
+ struct ether_extracted ext;
+
+ ether_extract_headers(m0, &ext);
+
+ vh.csum_start = sizeof(*ext.eh);
+ if (ext.evh)
+ vh.csum_start = sizeof(*ext.evh);
+ vh.csum_start += ext.iphlen;
+
+ if (ext.tcp)
+ vh.csum_offset =
+ offsetof(struct tcphdr, th_sum);
+ if (ext.udp)
+ vh.csum_offset =
+ offsetof(struct udphdr, uh_sum);
+
+ SET(vh.flags, VIRTIO_NET_HDR_F_NEEDS_CSUM);
}
- len = ulmin(uio->uio_resid, sizeof(th));
+ len = ulmin(uio->uio_resid, sizeof(vh));
if (len > 0) {
- error = uiomove(&th, len, uio);
+ error = uiomove(&vh, len, uio);
if (error != 0)
goto free;
}
@@ -925,7 +949,7 @@ tun_dev_write(dev_t dev, struct uio *uio
int error = 0;
size_t len, alen, mlen;
size_t hlen;
- struct tun_hdr th;
+ struct virtio_net_hdr vh;
sc = tun_get(dev);
if (sc == NULL)
@@ -934,8 +958,8 @@ tun_dev_write(dev_t dev, struct uio *uio
ifp = &sc->sc_if;
hlen = ifp->if_hdrlen;
- if (ISSET(sc->sc_flags, TUN_HDR))
- hlen += sizeof(th);
+ if (sc->sc_cap.vnet_hdr_size != 0)
+ hlen += sizeof(vh);
if (uio->uio_resid < hlen ||
uio->uio_resid > (hlen + MAXMCLBYTES)) {
error = EMSGSIZE;
@@ -948,50 +972,10 @@ tun_dev_write(dev_t dev, struct uio *uio
goto put;
}
- if (ISSET(sc->sc_flags, TUN_HDR)) {
- error = uiomove(&th, sizeof(th), uio);
+ if (sc->sc_cap.vnet_hdr_size != 0) {
+ error = uiomove(&vh, sizeof(vh), uio);
if (error != 0)
goto drop;
-
- if (ISSET(th.th_flags, TUN_H_IPV4_CSUM)) {
- SET(m0->m_pkthdr.csum_flags,
- M_IPV4_CSUM_OUT | M_IPV4_CSUM_IN_OK);
- }
-
- switch (th.th_flags &
- (TUN_H_TCP_CSUM|TUN_H_UDP_CSUM|TUN_H_ICMP_CSUM)) {
- case 0:
- break;
- case TUN_H_TCP_CSUM:
- SET(m0->m_pkthdr.csum_flags,
- M_TCP_CSUM_OUT | M_TCP_CSUM_IN_OK);
- break;
- case TUN_H_UDP_CSUM:
- SET(m0->m_pkthdr.csum_flags,
- M_UDP_CSUM_OUT | M_UDP_CSUM_IN_OK);
- break;
- case TUN_H_ICMP_CSUM:
- SET(m0->m_pkthdr.csum_flags,
- M_ICMP_CSUM_OUT | M_ICMP_CSUM_IN_OK);
- break;
- default:
- error = EINVAL;
- goto drop;
- }
-
- if (ISSET(th.th_flags, TUN_H_VTAG)) {
- if (!ISSET(sc->sc_flags, TUN_LAYER2)) {
- error = EINVAL;
- goto drop;
- }
- SET(m0->m_flags, M_VLANTAG);
- m0->m_pkthdr.ether_vtag = th.th_vtag;
- }
-
- if (ISSET(th.th_flags, TUN_H_TCP_MSS)) {
- SET(m0->m_pkthdr.csum_flags, M_TCP_TSO);
- m0->m_pkthdr.ph_mss = th.th_mss;
- }
}
align += roundup(max_linkhdr, sizeof(long));
@@ -1039,6 +1023,22 @@ tun_dev_write(dev_t dev, struct uio *uio
m->m_next = n;
m = n;
+ }
+
+ if (sc->sc_cap.vnet_hdr_size != 0) {
+ if (ISSET(vh.flags, VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
+ struct ether_extracted ext;
+
+ ether_extract_headers(m0, &ext);
+
+ if (ext.tcp) {
+ SET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
+ SET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK);
+ } else if (ext.udp) {
+ SET(m0->m_pkthdr.csum_flags, M_UDP_CSUM_OUT);
+ SET(m0->m_pkthdr.csum_flags, M_UDP_CSUM_IN_OK);
+ }
+ }
}
tun_input_process(ifp, m0);
Index: sys/net/if_tun.h
===================================================================
RCS file: /cvs/src/sys/net/if_tun.h,v
diff -u -p -r1.18 if_tun.h
--- sys/net/if_tun.h 17 Nov 2024 00:25:07 -0000 1.18
+++ sys/net/if_tun.h 6 Feb 2026 16:29:30 -0000
@@ -113,6 +113,7 @@ struct tuninfo {
struct tun_capabilities {
uint32_t tun_if_capabilities; /* IFCAP_* from net/if.h */
+ int vnet_hdr_size;
};
#define TUNSCAP _IOW('t', 196, struct tun_capabilities)
Index: usr.sbin/vmd/vionet.c
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/vionet.c,v
diff -u -p -r1.29 vionet.c
--- usr.sbin/vmd/vionet.c 14 Jan 2026 03:09:05 -0000 1.29
+++ usr.sbin/vmd/vionet.c 6 Feb 2026 22:42:52 -0000
@@ -50,17 +50,11 @@
#define VIRTIO_NET_CONFIG_MAC 0 /* 8 bit x 6 byte */
-#define VIRTIO_NET_F_MAC (1 << 5)
#define RXQ 0
#define TXQ 1
extern struct vmd_vm *current_vm;
-struct packet {
- uint8_t *buf;
- size_t len;
-};
-
static void *rx_run_loop(void *);
static void *tx_run_loop(void *);
static int vionet_rx(struct virtio_dev *, int);
@@ -300,6 +294,27 @@ fail:
}
/*
+ * Update and sync offload features with tap(4).
+ */
+static void
+vionet_update_offload(struct virtio_dev *dev)
+{
+ struct viodev_msg msg;
+ int ret;
+
+ memset(&msg, 0, sizeof(msg));
+ msg.irq = dev->irq;
+ msg.type = VIODEV_MSG_TUNSCAP;
+ msg.data = dev->driver_feature &
+ (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_GUEST_CSUM);
+
+ ret = imsg_compose_event2(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
+ &msg, sizeof(msg), ev_base_main);
+ if (ret == -1)
+ log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
+}
+
+/*
* vionet_rx
*
* Pull packet from the provided fd and fill the receive-side virtqueue. We
@@ -365,22 +380,11 @@ vionet_rx(struct virtio_dev *dev, int fd
goto reset;
}
- /*
- * Insert the virtio_net_hdr and adjust len/base. We do the
- * pointer math here before it's a void*.
- */
iov->iov_base = hvaddr_mem(desc->addr, iov->iov_len);
if (iov->iov_base == NULL)
goto reset;
hdr = iov->iov_base;
- memset(hdr, 0, sizeof(struct virtio_net_hdr));
- /* Tweak the iovec to account for the virtio_net_hdr. */
- iov->iov_len -= sizeof(struct virtio_net_hdr);
- iov->iov_base = hvaddr_mem(desc->addr +
- sizeof(struct virtio_net_hdr), iov->iov_len);
- if (iov->iov_base == NULL)
- goto reset;
chain_len = iov->iov_len;
/*
@@ -434,12 +438,6 @@ vionet_rx(struct virtio_dev *dev, int fd
if (sz == 0) /* No packets, so bail out for now. */
break;
- /*
- * Account for the prefixed header since it wasn't included
- * in the copy or zerocopy operations.
- */
- sz += sizeof(struct virtio_net_hdr);
-
/* Mark our buffers as used. */
used->ring[used->idx & vq_info->mask].id = hdr_idx;
used->ring[used->idx & vq_info->mask].len = sz;
@@ -475,23 +473,21 @@ ssize_t
vionet_rx_copy(struct vionet_dev *dev, int fd, const struct iovec *iov,
int iov_cnt, size_t chain_len)
{
- static uint8_t buf[VIONET_HARD_MTU];
- struct packet *pkt = NULL;
+ static uint8_t buf[sizeof(struct virtio_net_hdr)
+ + VIONET_HARD_MTU];
struct ether_header *eh = NULL;
uint8_t *payload = buf;
size_t i, chunk, nbytes, copied = 0;
ssize_t sz;
- /* If reading from the tap(4), try to right-size the read. */
- if (fd == dev->data_fd)
- nbytes = MIN(chain_len, VIONET_HARD_MTU);
- else if (fd == pipe_inject[READ])
- nbytes = sizeof(struct packet);
- else {
+ if (fd != dev->data_fd && fd != pipe_inject[READ]) {
log_warnx("%s: invalid fd: %d", __func__, fd);
return (-1);
}
+ /* If reading from the tap(4), try to right-size the read. */
+ nbytes = MIN(chain_len, VIONET_HARD_MTU);
+
/*
* Try to pull a packet. The fd should be non-blocking and we don't
* care if we under-read (i.e. sz != nbytes) as we may not have a
@@ -504,35 +500,16 @@ vionet_rx_copy(struct vionet_dev *dev, i
return (-1);
}
return (0);
- } else if (fd == dev->data_fd && sz < VIONET_MIN_TXLEN) {
+ } else if (fd == dev->data_fd && (size_t)sz < VIONET_MIN_TXLEN) {
/* If reading the tap(4), we should get valid ethernet. */
log_warnx("%s: invalid packet size", __func__);
return (0);
- } else if (fd == pipe_inject[READ] && sz != sizeof(struct packet)) {
- log_warnx("%s: invalid injected packet object (sz=%ld)",
- __func__, sz);
- return (0);
- }
-
- /* Decompose an injected packet, if that's what we're working with. */
- if (fd == pipe_inject[READ]) {
- pkt = (struct packet *)buf;
- if (pkt->buf == NULL) {
- log_warnx("%s: invalid injected packet, no buffer",
- __func__);
- return (0);
- }
- if (sz < VIONET_MIN_TXLEN || sz > VIONET_MAX_TXLEN) {
- log_warnx("%s: invalid injected packet size", __func__);
- goto drop;
- }
- payload = pkt->buf;
- sz = (ssize_t)pkt->len;
}
/* Validate the ethernet header, if required. */
if (dev->lockedmac) {
- eh = (struct ether_header *)(payload);
+ eh = (struct ether_header *)(payload
+ + sizeof(struct virtio_net_hdr));
if (!ETHER_IS_MULTICAST(eh->ether_dhost) &&
memcmp(eh->ether_dhost, dev->mac,
sizeof(eh->ether_dhost)) != 0)
@@ -553,10 +530,6 @@ vionet_rx_copy(struct vionet_dev *dev, i
}
drop:
- /* Free any injected packet buffer. */
- if (pkt != NULL)
- free(pkt->buf);
-
return (copied);
}
@@ -585,6 +558,10 @@ vionet_rx_zerocopy(struct vionet_dev *de
sz = readv(fd, iov, iov_cnt);
if (sz == -1 && errno == EAGAIN)
return (0);
+
+ if ((size_t)sz < sizeof(struct virtio_net_hdr))
+ return (0);
+
return (sz);
}
@@ -664,7 +641,6 @@ vionet_tx(struct virtio_dev *dev)
struct virtio_vq_info *vq_info;
struct ether_header *eh;
struct iovec *iov;
- struct packet pkt;
uint8_t status = 0;
status = dev->status & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK;
@@ -685,6 +661,8 @@ vionet_tx(struct virtio_dev *dev)
used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
while (idx != avail->idx) {
+ size_t off = 0;
+
hdr_idx = avail->ring[idx & vq_info->mask];
desc = &table[hdr_idx & vq_info->mask];
if (DESC_WRITABLE(desc)) {
@@ -704,19 +682,14 @@ vionet_tx(struct virtio_dev *dev)
log_warnx("%s: invalid descriptor length", __func__);
goto reset;
}
- iov->iov_len = desc->len;
- if (iov->iov_len > sizeof(struct virtio_net_hdr)) {
- /* Chop off the virtio header, leaving packet data. */
- iov->iov_len -= sizeof(struct virtio_net_hdr);
- iov->iov_base = hvaddr_mem(desc->addr +
- sizeof(struct virtio_net_hdr), iov->iov_len);
- if (iov->iov_base == NULL)
- goto reset;
-
- chain_len += iov->iov_len;
- iov_cnt++;
- }
+ /* Handle the first descriptor. */
+ iov->iov_len = desc->len;
+ iov->iov_base = hvaddr_mem(desc->addr, iov->iov_len);
+ if (iov->iov_base == NULL)
+ goto reset;
+ chain_len += iov->iov_len;
+ iov_cnt++;
/*
* Walk the chain and collect remaining addresses and lengths.
@@ -758,14 +731,19 @@ vionet_tx(struct virtio_dev *dev)
* descriptor with packet data contains a large enough buffer
* for this inspection.
*/
+
iov = &iov_tx[0];
+ if (iov->iov_len == sizeof(struct virtio_net_hdr))
+ iov = &iov_tx[1];
+ else
+ off = sizeof(struct virtio_net_hdr);
if (vionet->lockedmac) {
- if (iov->iov_len < ETHER_HDR_LEN) {
+ if (iov->iov_len - off < ETHER_HDR_LEN) {
log_warnx("%s: insufficient header data",
__func__);
goto drop;
}
- eh = (struct ether_header *)iov->iov_base;
+ eh = (struct ether_header *)((char *)iov->iov_base + off);
if (memcmp(eh->ether_shost, vionet->mac,
sizeof(eh->ether_shost)) != 0) {
log_warnx("%s: bad source address %s",
@@ -775,8 +753,8 @@ vionet_tx(struct virtio_dev *dev)
}
}
if (vionet->local) {
- dhcpsz = dhcp_request(dev, iov->iov_base, iov->iov_len,
- &dhcppkt);
+ dhcpsz = dhcp_request(dev, (char *)iov->iov_base + off,
+ iov->iov_len - off, &dhcppkt);
if (dhcpsz > 0) {
log_debug("%s: detected dhcp request of %zu bytes",
__func__, dhcpsz);
@@ -790,7 +768,6 @@ vionet_tx(struct virtio_dev *dev)
log_warn("%s", __func__);
goto reset;
}
- chain_len += sizeof(struct virtio_net_hdr);
drop:
used->ring[used->idx & vq_info->mask].id = hdr_idx;
used->ring[used->idx & vq_info->mask].len = chain_len;
@@ -800,20 +777,21 @@ drop:
/* Facilitate DHCP reply injection, if needed. */
if (dhcpsz > 0) {
- pkt.buf = dhcppkt;
- pkt.len = dhcpsz;
- sz = write(pipe_inject[WRITE], &pkt, sizeof(pkt));
+ struct virtio_net_hdr vhdr = { 0 };
+ struct iovec iovec[2] = {
+ { .iov_base = &vhdr, .iov_len = sizeof(vhdr) },
+ { .iov_base = dhcppkt, .iov_len = dhcpsz }
+ };
+ sz = writev(pipe_inject[WRITE], iovec, 2);
if (sz == -1 && errno != EAGAIN) {
log_warn("%s: packet injection", __func__);
- free(pkt.buf);
} else if (sz == -1 && errno == EAGAIN) {
log_debug("%s: dropping dhcp reply", __func__);
- free(pkt.buf);
- } else if (sz != sizeof(pkt)) {
+ } else if (sz != (ssize_t)(sizeof(vhdr) + dhcpsz)) {
log_warnx("%s: failed packet injection",
__func__);
- free(pkt.buf);
}
+ free(dhcppkt);
}
}
@@ -1114,6 +1092,7 @@ vionet_cfg_write(struct virtio_dev *dev,
dev->driver_feature &= dev->device_feature;
DPRINTF("%s: driver features 0x%llx", __func__,
dev->driver_feature);
+ vionet_update_offload(dev);
break;
case VIO1_PCI_CONFIG_MSIX_VECTOR:
/* Ignore until we support MSIX. */
Index: usr.sbin/vmd/virtio.c
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v
diff -u -p -r1.134 virtio.c
--- usr.sbin/vmd/virtio.c 14 Jan 2026 03:09:05 -0000 1.134
+++ usr.sbin/vmd/virtio.c 6 Feb 2026 22:42:52 -0000
@@ -19,6 +19,7 @@
#include <sys/param.h> /* PAGE_SIZE */
#include <sys/socket.h>
#include <sys/wait.h>
+#include <sys/ioctl.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcidevs.h>
@@ -28,6 +29,7 @@
#include <dev/vmm/vmm.h>
#include <net/if.h>
+#include <net/if_tun.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
@@ -64,8 +66,6 @@ SLIST_HEAD(virtio_dev_head, virtio_dev)
#define MAXPHYS (64 * 1024) /* max raw I/O transfer size */
-#define VIRTIO_NET_F_MAC (1<<5)
-
#define VMMCI_F_TIMESYNC (1<<0)
#define VMMCI_F_ACK (1<<1)
#define VMMCI_F_SYNCRTC (1<<2)
@@ -1020,6 +1020,8 @@ virtio_init(struct vmd_vm *vm, int child
/* Virtio 1.x Network Devices */
if (vmc->vmc_nnics > 0) {
for (i = 0; i < vmc->vmc_nnics; i++) {
+ struct tun_capabilities tcap;
+
dev = malloc(sizeof(struct virtio_dev));
if (dev == NULL) {
log_warn("calloc failure allocating vionet");
@@ -1034,7 +1036,8 @@ virtio_init(struct vmd_vm *vm, int child
}
virtio_dev_init(vm, dev, id, VIONET_QUEUE_SIZE_DEFAULT,
VIRTIO_NET_QUEUES,
- (VIRTIO_NET_F_MAC | VIRTIO_F_VERSION_1));
+ (VIRTIO_NET_F_MAC | VIRTIO_NET_F_CSUM |
+ VIRTIO_NET_F_GUEST_CSUM | VIRTIO_F_VERSION_1));
if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
dev) == -1) {
@@ -1056,6 +1059,15 @@ virtio_init(struct vmd_vm *vm, int child
dev->vmm_id = vm->vm_vmmid;
dev->vionet.data_fd = child_taps[i];
+ /*
+ * IFCAPs are tweaked after feature negotiation with
+ * the guest later.
+ */
+ memset(&tcap, 0, sizeof(tcap));
+ tcap.vnet_hdr_size = sizeof(struct virtio_net_hdr);
+ if (ioctl(dev->vionet.data_fd, TUNSCAP, &tcap) == -1)
+ fatal("tap(4) TUNSCAP");
+
/* MAC address has been assigned by the parent */
memcpy(&dev->vionet.mac, &vmc->vmc_macs[i], 6);
dev->vionet.lockedmac =
@@ -1532,10 +1544,12 @@ virtio_dev_launch(struct vmd_vm *vm, str
}
/* Close data fds. Only the child device needs them now. */
- if (virtio_dev_closefds(dev) == -1) {
- log_warnx("%s: failed to close device data fds",
- __func__);
- goto err;
+ if (dev->dev_type != VMD_DEVTYPE_NET) {
+ if (virtio_dev_closefds(dev) == -1) {
+ log_warnx("%s: failed to close device data fds",
+ __func__);
+ goto err;
+ }
}
/* 2. Send over details on the VM (including memory fds). */
@@ -1758,6 +1772,19 @@ handle_dev_msg(struct viodev_msg *msg, s
case VIODEV_MSG_ERROR:
log_warnx("%s: device reported error", __func__);
break;
+ case VIODEV_MSG_TUNSCAP:
+ {
+ struct tun_capabilities tcap;
+
+ memset(&tcap, 0, sizeof(tcap));
+ tcap.tun_if_capabilities = msg->data;
+ tcap.vnet_hdr_size = sizeof(struct virtio_net_hdr);
+
+ if (ioctl(gdev->vionet.data_fd, TUNSCAP, &tcap) == -1)
+ fatal("%s: tap(4) TUNSCAP", __func__);
+
+ break;
+ }
case VIODEV_MSG_INVALID:
case VIODEV_MSG_IO_READ:
case VIODEV_MSG_IO_WRITE:
Index: usr.sbin/vmd/virtio.h
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/virtio.h,v
diff -u -p -r1.60 virtio.h
--- usr.sbin/vmd/virtio.h 14 Jan 2026 03:09:05 -0000 1.60
+++ usr.sbin/vmd/virtio.h 6 Feb 2026 22:42:52 -0000
@@ -84,7 +84,7 @@
/* Virtio network device is backed by tap(4), so inherit limits */
#define VIONET_HARD_MTU TUNMRU
-#define VIONET_MIN_TXLEN ETHER_HDR_LEN
+#define VIONET_MIN_TXLEN (sizeof(struct virtio_net_hdr) + ETHER_HDR_LEN)
#define VIONET_MAX_TXLEN VIONET_HARD_MTU + ETHER_HDR_LEN
/* VMM Control Interface shutdown timeout (in seconds) */
@@ -134,6 +134,7 @@ struct viodev_msg {
#define VIODEV_MSG_IO_WRITE 5
#define VIODEV_MSG_DUMP 6
#define VIODEV_MSG_SHUTDOWN 7
+#define VIODEV_MSG_TUNSCAP 8
uint16_t reg; /* VirtIO register */
uint8_t io_sz; /* IO instruction size */
@@ -309,6 +310,13 @@ struct virtio_net_hdr {
uint16_t padding_reserved;
*/
};
+
+#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* flags */
+#define VIRTIO_NET_HDR_F_DATA_VALID 2 /* flags */
+
+#define VIRTIO_NET_F_CSUM (1<<0)
+#define VIRTIO_NET_F_GUEST_CSUM (1<<1)
+#define VIRTIO_NET_F_MAC (1<<5)
enum vmmci_cmd {
VMMCI_NONE = 0,
vmd: add checksum offload for guests