Index | Thread | Search

From:
David Gwynne <david@gwynne.id.au>
Subject:
Re: vmd(8): TCP/UDP checksum offloading for guests
To:
Jan Klemkow <jan@openbsd.org>
Cc:
tech@openbsd.org
Date:
Fri, 23 Aug 2024 11:05:43 +1000

Download raw body.

Thread
On Fri, Aug 16, 2024 at 02:51:06PM +0200, Jan Klemkow wrote:
> Hi,
> 
> This diff introduces the transmit side of TCP/UDP checksum offloading
> for vmd(8) guests.  It uses a similar implementation as FreeBSD and
> Linux.  vmd(8) tell tap(4) via a new ioctl to expect virtio net header
> structs.  These are used to transport offloading feature bits between
> guest and tap(4) device.  The tap(4) translates the feature bits between
> virtio net header and our mbufs.
> 
> This diff just contains the sending part, to keep it short.
> 
> When both parts are implemented, we never calculate a checksum of
> packets between our guests or the host system.  As its described in the
> virtio-network specification:
> 
> 	A network packet transported between two guests on the same
> 	system might not need checksumming at all, nor segmentation,
> 	if both guests are amenable.
> 
> This behavior is similar to our loopback traffic, as we also don't
> calculate checksums of packets between two processes here.
> 
> Tests and any kind of feedback is welcome!

i love the idea, but i really dislike how tun and virtio get tied
together by this diff.

ideally tun and only tun would get extended to be able to communicate
this info between userland and the kernel in a way that makes sense for
our stack. separately, vmd would be responsible for mapping the virtio
semantics to our tun driver semantics.

i understand that tun/tap is kind of a psuedo standard interface
between a bunch of different systems, and we're catching up to some of
those systems in this area, but making struct virtio_net_hdr part
of the tun api seems a bit much. virtio can tweak this struct when
it makes sense for a guest talking to a hv via a memory mapped ring
and visa versa, but automatically inheriting the change for userland
talking to the kernel on an fd is annoying.

will any ports make use of this? would this be used outside vmd? i
have vague memories of tailscale omgoptimising a wireguard
implementation to do offloads which probably uses this, but nothing
else springs to mind.

> 
> bye,
> Jan
> 
> Index: sys/dev/pv/if_vio.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pv/if_vio.c,v
> diff -u -p -r1.45 if_vio.c
> --- sys/dev/pv/if_vio.c	1 Aug 2024 11:13:19 -0000	1.45
> +++ sys/dev/pv/if_vio.c	16 Aug 2024 08:47:15 -0000
> @@ -59,154 +59,7 @@
>  #define DPRINTF(x...)
>  #endif
>  
> -/*
> - * if_vioreg.h:
> - */
> -/* Configuration registers */
> -#define VIRTIO_NET_CONFIG_MAC		0 /* 8bit x 6byte */
> -#define VIRTIO_NET_CONFIG_STATUS	6 /* 16bit */
> -
> -/* Feature bits */
> -#define VIRTIO_NET_F_CSUM			(1ULL<<0)
> -#define VIRTIO_NET_F_GUEST_CSUM			(1ULL<<1)
> -#define VIRTIO_NET_F_CTRL_GUEST_OFFLOADS        (1ULL<<2)
> -#define VIRTIO_NET_F_MTU                        (1ULL<<3)
> -#define VIRTIO_NET_F_MAC			(1ULL<<5)
> -#define VIRTIO_NET_F_GSO			(1ULL<<6)
> -#define VIRTIO_NET_F_GUEST_TSO4			(1ULL<<7)
> -#define VIRTIO_NET_F_GUEST_TSO6			(1ULL<<8)
> -#define VIRTIO_NET_F_GUEST_ECN			(1ULL<<9)
> -#define VIRTIO_NET_F_GUEST_UFO			(1ULL<<10)
> -#define VIRTIO_NET_F_HOST_TSO4			(1ULL<<11)
> -#define VIRTIO_NET_F_HOST_TSO6			(1ULL<<12)
> -#define VIRTIO_NET_F_HOST_ECN			(1ULL<<13)
> -#define VIRTIO_NET_F_HOST_UFO			(1ULL<<14)
> -#define VIRTIO_NET_F_MRG_RXBUF			(1ULL<<15)
> -#define VIRTIO_NET_F_STATUS			(1ULL<<16)
> -#define VIRTIO_NET_F_CTRL_VQ			(1ULL<<17)
> -#define VIRTIO_NET_F_CTRL_RX			(1ULL<<18)
> -#define VIRTIO_NET_F_CTRL_VLAN			(1ULL<<19)
> -#define VIRTIO_NET_F_CTRL_RX_EXTRA		(1ULL<<20)
> -#define VIRTIO_NET_F_GUEST_ANNOUNCE		(1ULL<<21)
> -#define VIRTIO_NET_F_MQ				(1ULL<<22)
> -#define VIRTIO_NET_F_CTRL_MAC_ADDR		(1ULL<<23)
> -#define VIRTIO_NET_F_HOST_USO			(1ULL<<56)
> -#define VIRTIO_NET_F_HASH_REPORT		(1ULL<<57)
> -#define VIRTIO_NET_F_GUEST_HDRLEN		(1ULL<<59)
> -#define VIRTIO_NET_F_RSS			(1ULL<<60)
> -#define VIRTIO_NET_F_RSC_EXT			(1ULL<<61)
> -#define VIRTIO_NET_F_STANDBY			(1ULL<<62)
> -#define VIRTIO_NET_F_SPEED_DUPLEX		(1ULL<<63)
> -/*
> - * Config(8) flags. The lowest byte is reserved for generic virtio stuff.
> - */
> -
> -/* Workaround for vlan related bug in qemu < version 2.0 */
> -#define CONFFLAG_QEMU_VLAN_BUG		(1<<8)
> -
> -static const struct virtio_feature_name virtio_net_feature_names[] = {
> -#if VIRTIO_DEBUG
> -	{ VIRTIO_NET_F_CSUM,			"CSum" },
> -	{ VIRTIO_NET_F_GUEST_CSUM,		"GuestCSum" },
> -	{ VIRTIO_NET_F_CTRL_GUEST_OFFLOADS,	"CtrlGuestOffl" },
> -	{ VIRTIO_NET_F_MTU,			"MTU", },
> -	{ VIRTIO_NET_F_MAC,			"MAC" },
> -	{ VIRTIO_NET_F_GSO,			"GSO" },
> -	{ VIRTIO_NET_F_GUEST_TSO4,		"GuestTSO4" },
> -	{ VIRTIO_NET_F_GUEST_TSO6,		"GuestTSO6" },
> -	{ VIRTIO_NET_F_GUEST_ECN,		"GuestECN" },
> -	{ VIRTIO_NET_F_GUEST_UFO,		"GuestUFO" },
> -	{ VIRTIO_NET_F_HOST_TSO4,		"HostTSO4" },
> -	{ VIRTIO_NET_F_HOST_TSO6,		"HostTSO6" },
> -	{ VIRTIO_NET_F_HOST_ECN,		"HostECN" },
> -	{ VIRTIO_NET_F_HOST_UFO,		"HostUFO" },
> -	{ VIRTIO_NET_F_MRG_RXBUF,		"MrgRXBuf" },
> -	{ VIRTIO_NET_F_STATUS,			"Status" },
> -	{ VIRTIO_NET_F_CTRL_VQ,			"CtrlVQ" },
> -	{ VIRTIO_NET_F_CTRL_RX,			"CtrlRX" },
> -	{ VIRTIO_NET_F_CTRL_VLAN,		"CtrlVLAN" },
> -	{ VIRTIO_NET_F_CTRL_RX_EXTRA,		"CtrlRXExtra" },
> -	{ VIRTIO_NET_F_GUEST_ANNOUNCE,		"GuestAnnounce" },
> -	{ VIRTIO_NET_F_MQ,			"MQ" },
> -	{ VIRTIO_NET_F_CTRL_MAC_ADDR,		"CtrlMAC" },
> -	{ VIRTIO_NET_F_HOST_USO,		"HostUso" },
> -	{ VIRTIO_NET_F_HASH_REPORT,		"HashRpt" },
> -	{ VIRTIO_NET_F_GUEST_HDRLEN,		"GuestHdrlen" },
> -	{ VIRTIO_NET_F_RSS,			"RSS" },
> -	{ VIRTIO_NET_F_RSC_EXT,			"RSSExt" },
> -	{ VIRTIO_NET_F_STANDBY,			"Stdby" },
> -	{ VIRTIO_NET_F_SPEED_DUPLEX,		"SpdDplx" },
> -#endif
> -	{ 0,					NULL }
> -};
> -
> -/* Status */
> -#define VIRTIO_NET_S_LINK_UP	1
> -
> -/* Packet header structure */
> -struct virtio_net_hdr {
> -	uint8_t		flags;
> -	uint8_t		gso_type;
> -	uint16_t	hdr_len;
> -	uint16_t	gso_size;
> -	uint16_t	csum_start;
> -	uint16_t	csum_offset;
> -
> -	/* only present if VIRTIO_NET_F_MRG_RXBUF is negotiated */
> -	uint16_t	num_buffers;
> -} __packed;
> -
> -#define VIRTIO_NET_HDR_F_NEEDS_CSUM	1 /* flags */
> -#define VIRTIO_NET_HDR_F_DATA_VALID	2 /* flags */
> -#define VIRTIO_NET_HDR_GSO_NONE		0 /* gso_type */
> -#define VIRTIO_NET_HDR_GSO_TCPV4	1 /* gso_type */
> -#define VIRTIO_NET_HDR_GSO_UDP		3 /* gso_type */
> -#define VIRTIO_NET_HDR_GSO_TCPV6	4 /* gso_type */
> -#define VIRTIO_NET_HDR_GSO_ECN		0x80 /* gso_type, |'ed */
> -
> -#define VIRTIO_NET_MAX_GSO_LEN		(65536+ETHER_HDR_LEN)
> -
> -/* Control virtqueue */
> -struct virtio_net_ctrl_cmd {
> -	uint8_t	class;
> -	uint8_t	command;
> -} __packed;
> -#define VIRTIO_NET_CTRL_RX		0
> -# define VIRTIO_NET_CTRL_RX_PROMISC	0
> -# define VIRTIO_NET_CTRL_RX_ALLMULTI	1
> -
> -#define VIRTIO_NET_CTRL_MAC		1
> -# define VIRTIO_NET_CTRL_MAC_TABLE_SET	0
> -
> -#define VIRTIO_NET_CTRL_VLAN		2
> -# define VIRTIO_NET_CTRL_VLAN_ADD	0
> -# define VIRTIO_NET_CTRL_VLAN_DEL	1
> -
> -#define VIRTIO_NET_CTRL_GUEST_OFFLOADS	5
> -# define VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET	0
> -
> -struct virtio_net_ctrl_status {
> -	uint8_t	ack;
> -} __packed;
> -#define VIRTIO_NET_OK			0
> -#define VIRTIO_NET_ERR			1
> -
> -struct virtio_net_ctrl_rx {
> -	uint8_t	onoff;
> -} __packed;
> -
> -struct virtio_net_ctrl_guest_offloads {
> -	uint64_t offloads;
> -} __packed;
> -
> -struct virtio_net_ctrl_mac_tbl {
> -	uint32_t nentries;
> -	uint8_t macs[][ETHER_ADDR_LEN];
> -} __packed;
> -
> -struct virtio_net_ctrl_vlan {
> -	uint16_t id;
> -} __packed;
> +#include "if_vioreg.h"
>  
>  /*
>   * if_viovar.h:
> Index: sys/dev/pv/if_vioreg.h
> ===================================================================
> RCS file: sys/dev/pv/if_vioreg.h
> diff -N sys/dev/pv/if_vioreg.h
> --- /dev/null	1 Jan 1970 00:00:00 -0000
> +++ sys/dev/pv/if_vioreg.h	16 Aug 2024 08:47:15 -0000
> @@ -0,0 +1,182 @@
> +/*	$OpenBSD$	*/
> +
> +/*
> + * Copyright (c) 2012 Stefan Fritsch, Alexander Fiveg.
> + * Copyright (c) 2010 Minoura Makoto.
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
> + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
> + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
> + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
> + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
> + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +/* Configuration registers */
> +#define VIRTIO_NET_CONFIG_MAC		0 /* 8bit x 6byte */
> +#define VIRTIO_NET_CONFIG_STATUS	6 /* 16bit */
> +
> +/* Feature bits */
> +#define VIRTIO_NET_F_CSUM			(1ULL<<0)
> +#define VIRTIO_NET_F_GUEST_CSUM			(1ULL<<1)
> +#define VIRTIO_NET_F_CTRL_GUEST_OFFLOADS        (1ULL<<2)
> +#define VIRTIO_NET_F_MTU                        (1ULL<<3)
> +#define VIRTIO_NET_F_MAC			(1ULL<<5)
> +#define VIRTIO_NET_F_GSO			(1ULL<<6)
> +#define VIRTIO_NET_F_GUEST_TSO4			(1ULL<<7)
> +#define VIRTIO_NET_F_GUEST_TSO6			(1ULL<<8)
> +#define VIRTIO_NET_F_GUEST_ECN			(1ULL<<9)
> +#define VIRTIO_NET_F_GUEST_UFO			(1ULL<<10)
> +#define VIRTIO_NET_F_HOST_TSO4			(1ULL<<11)
> +#define VIRTIO_NET_F_HOST_TSO6			(1ULL<<12)
> +#define VIRTIO_NET_F_HOST_ECN			(1ULL<<13)
> +#define VIRTIO_NET_F_HOST_UFO			(1ULL<<14)
> +#define VIRTIO_NET_F_MRG_RXBUF			(1ULL<<15)
> +#define VIRTIO_NET_F_STATUS			(1ULL<<16)
> +#define VIRTIO_NET_F_CTRL_VQ			(1ULL<<17)
> +#define VIRTIO_NET_F_CTRL_RX			(1ULL<<18)
> +#define VIRTIO_NET_F_CTRL_VLAN			(1ULL<<19)
> +#define VIRTIO_NET_F_CTRL_RX_EXTRA		(1ULL<<20)
> +#define VIRTIO_NET_F_GUEST_ANNOUNCE		(1ULL<<21)
> +#define VIRTIO_NET_F_MQ				(1ULL<<22)
> +#define VIRTIO_NET_F_CTRL_MAC_ADDR		(1ULL<<23)
> +#define VIRTIO_NET_F_HOST_USO			(1ULL<<56)
> +#define VIRTIO_NET_F_HASH_REPORT		(1ULL<<57)
> +#define VIRTIO_NET_F_GUEST_HDRLEN		(1ULL<<59)
> +#define VIRTIO_NET_F_RSS			(1ULL<<60)
> +#define VIRTIO_NET_F_RSC_EXT			(1ULL<<61)
> +#define VIRTIO_NET_F_STANDBY			(1ULL<<62)
> +#define VIRTIO_NET_F_SPEED_DUPLEX		(1ULL<<63)
> +/*
> + * Config(8) flags. The lowest byte is reserved for generic virtio stuff.
> + */
> +
> +/* Workaround for vlan related bug in qemu < version 2.0 */
> +#define CONFFLAG_QEMU_VLAN_BUG		(1<<8)
> +
> +static const struct virtio_feature_name virtio_net_feature_names[] = {
> +#if VIRTIO_DEBUG
> +	{ VIRTIO_NET_F_CSUM,			"CSum" },
> +	{ VIRTIO_NET_F_GUEST_CSUM,		"GuestCSum" },
> +	{ VIRTIO_NET_F_CTRL_GUEST_OFFLOADS,	"CtrlGuestOffl" },
> +	{ VIRTIO_NET_F_MTU,			"MTU", },
> +	{ VIRTIO_NET_F_MAC,			"MAC" },
> +	{ VIRTIO_NET_F_GSO,			"GSO" },
> +	{ VIRTIO_NET_F_GUEST_TSO4,		"GuestTSO4" },
> +	{ VIRTIO_NET_F_GUEST_TSO6,		"GuestTSO6" },
> +	{ VIRTIO_NET_F_GUEST_ECN,		"GuestECN" },
> +	{ VIRTIO_NET_F_GUEST_UFO,		"GuestUFO" },
> +	{ VIRTIO_NET_F_HOST_TSO4,		"HostTSO4" },
> +	{ VIRTIO_NET_F_HOST_TSO6,		"HostTSO6" },
> +	{ VIRTIO_NET_F_HOST_ECN,		"HostECN" },
> +	{ VIRTIO_NET_F_HOST_UFO,		"HostUFO" },
> +	{ VIRTIO_NET_F_MRG_RXBUF,		"MrgRXBuf" },
> +	{ VIRTIO_NET_F_STATUS,			"Status" },
> +	{ VIRTIO_NET_F_CTRL_VQ,			"CtrlVQ" },
> +	{ VIRTIO_NET_F_CTRL_RX,			"CtrlRX" },
> +	{ VIRTIO_NET_F_CTRL_VLAN,		"CtrlVLAN" },
> +	{ VIRTIO_NET_F_CTRL_RX_EXTRA,		"CtrlRXExtra" },
> +	{ VIRTIO_NET_F_GUEST_ANNOUNCE,		"GuestAnnounce" },
> +	{ VIRTIO_NET_F_MQ,			"MQ" },
> +	{ VIRTIO_NET_F_CTRL_MAC_ADDR,		"CtrlMAC" },
> +	{ VIRTIO_NET_F_HOST_USO,		"HostUso" },
> +	{ VIRTIO_NET_F_HASH_REPORT,		"HashRpt" },
> +	{ VIRTIO_NET_F_GUEST_HDRLEN,		"GuestHdrlen" },
> +	{ VIRTIO_NET_F_RSS,			"RSS" },
> +	{ VIRTIO_NET_F_RSC_EXT,			"RSSExt" },
> +	{ VIRTIO_NET_F_STANDBY,			"Stdby" },
> +	{ VIRTIO_NET_F_SPEED_DUPLEX,		"SpdDplx" },
> +#endif
> +	{ 0,					NULL }
> +};
> +
> +/* Status */
> +#define VIRTIO_NET_S_LINK_UP	1
> +
> +/* Packet header structure */
> +struct virtio_net_hdr {
> +	uint8_t		flags;
> +	uint8_t		gso_type;
> +	uint16_t	hdr_len;
> +	uint16_t	gso_size;
> +	uint16_t	csum_start;
> +	uint16_t	csum_offset;
> +
> +	/* only present if VIRTIO_NET_F_MRG_RXBUF is negotiated */
> +	uint16_t	num_buffers;
> +} __packed;
> +
> +struct virtio_net_hdr_legacy {
> +	uint8_t		flags;
> +	uint8_t		gso_type;
> +	uint16_t	hdr_len;
> +	uint16_t	gso_size;
> +	uint16_t	csum_start;
> +	uint16_t	csum_offset;
> +} __packed;
> +
> +#define VIRTIO_NET_HDR_F_NEEDS_CSUM	1 /* flags */
> +#define VIRTIO_NET_HDR_F_DATA_VALID	2 /* flags */
> +#define VIRTIO_NET_HDR_GSO_NONE		0 /* gso_type */
> +#define VIRTIO_NET_HDR_GSO_TCPV4	1 /* gso_type */
> +#define VIRTIO_NET_HDR_GSO_UDP		3 /* gso_type */
> +#define VIRTIO_NET_HDR_GSO_TCPV6	4 /* gso_type */
> +#define VIRTIO_NET_HDR_GSO_ECN		0x80 /* gso_type, |'ed */
> +
> +#define VIRTIO_NET_MAX_GSO_LEN		(65536+ETHER_HDR_LEN)
> +
> +/* Control virtqueue */
> +struct virtio_net_ctrl_cmd {
> +	uint8_t	class;
> +	uint8_t	command;
> +} __packed;
> +#define VIRTIO_NET_CTRL_RX		0
> +# define VIRTIO_NET_CTRL_RX_PROMISC	0
> +# define VIRTIO_NET_CTRL_RX_ALLMULTI	1
> +
> +#define VIRTIO_NET_CTRL_MAC		1
> +# define VIRTIO_NET_CTRL_MAC_TABLE_SET	0
> +
> +#define VIRTIO_NET_CTRL_VLAN		2
> +# define VIRTIO_NET_CTRL_VLAN_ADD	0
> +# define VIRTIO_NET_CTRL_VLAN_DEL	1
> +
> +#define VIRTIO_NET_CTRL_GUEST_OFFLOADS	5
> +# define VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET	0
> +
> +struct virtio_net_ctrl_status {
> +	uint8_t	ack;
> +} __packed;
> +#define VIRTIO_NET_OK			0
> +#define VIRTIO_NET_ERR			1
> +
> +struct virtio_net_ctrl_rx {
> +	uint8_t	onoff;
> +} __packed;
> +
> +struct virtio_net_ctrl_guest_offloads {
> +	uint64_t offloads;
> +} __packed;
> +
> +struct virtio_net_ctrl_mac_tbl {
> +	uint32_t nentries;
> +	uint8_t macs[][ETHER_ADDR_LEN];
> +} __packed;
> +
> +struct virtio_net_ctrl_vlan {
> +	uint16_t id;
> +} __packed;
> Index: sys/net/if_tun.c
> ===================================================================
> RCS file: /cvs/src/sys/net/if_tun.c,v
> diff -u -p -r1.240 if_tun.c
> --- sys/net/if_tun.c	23 Dec 2023 10:52:54 -0000	1.240
> +++ sys/net/if_tun.c	16 Aug 2024 12:00:46 -0000
> @@ -63,8 +63,15 @@
>  #include <net/rtable.h>
>  
>  #include <netinet/in.h>
> +#include <netinet/ip.h>
> +#include <netinet/ip6.h>
> +#include <netinet/tcp.h>
> +#include <netinet/udp.h>
>  #include <netinet/if_ether.h>
>  
> +#include <dev/pv/virtiovar.h>
> +#include <dev/pv/if_vioreg.h>
> +
>  #include "bpfilter.h"
>  #if NBPFILTER > 0
>  #include <net/bpf.h>
> @@ -92,6 +99,7 @@ struct tun_softc {
>  	dev_t			sc_dev;
>  	struct refcnt		sc_refs;
>  	unsigned int		sc_reading;
> +	int			sc_vhdrlen;
>  };
>  
>  #ifdef	TUN_DEBUG
> @@ -779,6 +787,17 @@ tun_dev_ioctl(dev_t dev, u_long cmd, voi
>  		bcopy(data, sc->sc_ac.ac_enaddr,
>  		    sizeof(sc->sc_ac.ac_enaddr));
>  		break;
> +	case TAPSVNETHDR:
> +		if (*(int *)data != 0 &&
> +		    *(int *)data != sizeof(struct virtio_net_hdr) &&
> +		    *(int *)data != sizeof(struct virtio_net_hdr_legacy)) {
> +			error = EINVAL;
> +			break;
> +		}
> +
> +		sc->sc_vhdrlen = *(int *)data;
> +
> +		break;
>  	default:
>  		error = ENOTTY;
>  		break;
> @@ -828,6 +847,17 @@ tun_dev_read(dev_t dev, struct uio *uio,
>  		bpf_mtap(ifp->if_bpf, m0, BPF_DIRECTION_OUT);
>  #endif
>  
> +	size_t len = ulmin(uio->uio_resid, sc->sc_vhdrlen);
> +	if (len > 0) {
> +		struct virtio_net_hdr vhdr;
> +
> +		bzero(&vhdr, sizeof(vhdr));
> +
> +		error = uiomove(&vhdr, len, uio);
> +		if (error != 0)
> +			goto free;
> +	}
> +
>  	m = m0;
>  	while (uio->uio_resid > 0) {
>  		size_t len = ulmin(uio->uio_resid, m->m_len);
> @@ -841,7 +871,7 @@ tun_dev_read(dev_t dev, struct uio *uio,
>  		if (m == NULL)
>  			break;
>  	}
> -
> +free:
>  	m_freem(m0);
>  
>  put:
> @@ -870,6 +900,7 @@ tun_dev_write(dev_t dev, struct uio *uio
>  	struct tun_softc	*sc;
>  	struct ifnet		*ifp;
>  	struct mbuf		*m0;
> +	struct virtio_net_hdr	vhdr;
>  	int			error = 0;
>  	size_t			mlen;
>  
> @@ -905,9 +936,31 @@ tun_dev_write(dev_t dev, struct uio *uio
>  	m0->m_pkthdr.len = m0->m_len = mlen;
>  	m_adj(m0, align);
>  
> +	if (sc->sc_vhdrlen > 0) {
> +		error = uiomove(&vhdr, sc->sc_vhdrlen, uio);
> +		if (error != 0)
> +			goto drop;
> +	}
> +
>  	error = uiomove(mtod(m0, void *), m0->m_len, uio);
>  	if (error != 0)
>  		goto drop;
> +
> +	if (sc->sc_vhdrlen > 0) {
> +		if (ISSET(vhdr.flags, VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
> +			struct ether_extracted ext;
> +
> +			ether_extract_headers(m0, &ext);
> +
> +			if (ext.tcp) {
> +				SET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_IN_OK);
> +				SET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
> +			} else if (ext.udp) {
> +				SET(m0->m_pkthdr.csum_flags, M_UDP_CSUM_IN_OK);
> +				SET(m0->m_pkthdr.csum_flags, M_UDP_CSUM_OUT);
> +			}
> +		}
> +	}
>  
>  	NET_LOCK();
>  	if_vinput(ifp, m0);
> Index: sys/net/if_tun.h
> ===================================================================
> RCS file: /cvs/src/sys/net/if_tun.h,v
> diff -u -p -r1.15 if_tun.h
> --- sys/net/if_tun.h	6 Feb 2007 10:49:40 -0000	1.15
> +++ sys/net/if_tun.h	16 Aug 2024 12:00:43 -0000
> @@ -68,4 +68,7 @@ struct tuninfo {
>  #define	TUNSDEBUG	_IOW('t', 94, int)
>  #define	TUNGDEBUG	_IOR('t', 95, int)
>  
> +/* ioctl to set the virtio-net header size */
> +#define TAPSVNETHDR	_IOW('t', 96, int)
> +
>  #endif /* _NET_IF_TUN_H_ */
> Index: usr.sbin/vmd/vionet.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/vmd/vionet.c,v
> diff -u -p -r1.16 vionet.c
> --- usr.sbin/vmd/vionet.c	12 Jul 2024 14:34:08 -0000	1.16
> +++ usr.sbin/vmd/vionet.c	16 Aug 2024 12:00:43 -0000
> @@ -18,6 +18,7 @@
>   */
>  #include <sys/socket.h>
>  #include <sys/types.h>
> +#include <sys/ioctl.h>
>  
>  #include <dev/pci/virtio_pcireg.h>
>  #include <dev/pv/virtioreg.h>
> @@ -46,11 +47,6 @@
>  extern char *__progname;
>  extern struct vmd_vm *current_vm;
>  
> -struct packet {
> -	uint8_t	*buf;
> -	size_t	 len;
> -};
> -
>  static void *rx_run_loop(void *);
>  static void *tx_run_loop(void *);
>  static int vionet_rx(struct vionet_dev *, int);
> @@ -102,13 +98,7 @@ vionet_main(int fd, int fd_vmm)
>  	struct vm_create_params	*vcp;
>  	ssize_t			 sz;
>  	int			 ret;
> -
> -	/*
> -	 * stdio - needed for read/write to disk fds and channels to the vm.
> -	 * vmm + proc - needed to create shared vm mappings.
> -	 */
> -	if (pledge("stdio vmm proc", NULL) == -1)
> -		fatal("pledge");
> +	int			 vhdrlen = sizeof(struct virtio_net_hdr);
>  
>  	/* Initialize iovec arrays. */
>  	memset(iov_rx, 0, sizeof(iov_rx));
> @@ -133,6 +123,17 @@ vionet_main(int fd, int fd_vmm)
>  	    ", vmm fd = %d", __func__, vionet->data_fd, dev.sync_fd,
>  	    dev.async_fd, fd_vmm);
>  
> +	/* Activate virtio_net_hdr for tap(4) device. */
> +	if (ioctl(vionet->data_fd, TAPSVNETHDR, &vhdrlen) == -1)
> +		fatal("%s: TAPSVNETHDR option on tap(4) failed", __func__);
> +
> +	/*
> +	 * stdio - needed for read/write to disk fds and channels to the vm.
> +	 * vmm + proc - needed to create shared vm mappings.
> +	 */
> +	if (pledge("stdio vmm proc", NULL) == -1)
> +		fatal("pledge");
> +
>  	/* Receive our vm information from the vm process. */
>  	memset(&vm, 0, sizeof(vm));
>  	sz = atomicio(read, dev.sync_fd, &vm, sizeof(vm));
> @@ -413,6 +414,7 @@ vionet_rx(struct vionet_dev *dev, int fd
>  
>  		iov = &iov_rx[0];
>  		iov_cnt = 1;
> +		chain_len = 0;
>  
>  		/*
>  		 * First descriptor should be at least as large as the
> @@ -425,22 +427,9 @@ vionet_rx(struct vionet_dev *dev, int fd
>  			goto reset;
>  		}
>  
> -		/*
> -		 * Insert the virtio_net_hdr and adjust len/base. We do the
> -		 * pointer math here before it's a void*.
> -		 */
>  		iov->iov_base = hvaddr_mem(desc->addr, iov->iov_len);
>  		if (iov->iov_base == NULL)
>  			goto reset;
> -		memset(iov->iov_base, 0, sizeof(struct virtio_net_hdr));
> -
> -		/* Tweak the iovec to account for the virtio_net_hdr. */
> -		iov->iov_len -= sizeof(struct virtio_net_hdr);
> -		iov->iov_base = hvaddr_mem(desc->addr +
> -		    sizeof(struct virtio_net_hdr), iov->iov_len);
> -		if (iov->iov_base == NULL)
> -			goto reset;
> -		chain_len = iov->iov_len;
>  
>  		/*
>  		 * Walk the remaining chain and collect remaining addresses
> @@ -491,12 +480,6 @@ vionet_rx(struct vionet_dev *dev, int fd
>  		if (sz == 0)	/* No packets, so bail out for now. */
>  			break;
>  
> -		/*
> -		 * Account for the prefixed header since it wasn't included
> -		 * in the copy or zerocopy operations.
> -		 */
> -		sz += sizeof(struct virtio_net_hdr);
> -
>  		/* Mark our buffers as used. */
>  		used->ring[used->idx & VIONET_QUEUE_MASK].id = hdr_idx;
>  		used->ring[used->idx & VIONET_QUEUE_MASK].len = sz;
> @@ -533,22 +516,19 @@ vionet_rx_copy(struct vionet_dev *dev, i
>      int iov_cnt, size_t chain_len)
>  {
>  	static uint8_t		 buf[VIONET_HARD_MTU];
> -	struct packet		*pkt = NULL;
>  	struct ether_header	*eh = NULL;
>  	uint8_t			*payload = buf;
>  	size_t			 i, chunk, nbytes, copied = 0;
>  	ssize_t			 sz;
>  
> -	/* If reading from the tap(4), try to right-size the read. */
> -	if (fd == dev->data_fd)
> -		nbytes = MIN(chain_len, VIONET_HARD_MTU);
> -	else if (fd == pipe_inject[READ])
> -		nbytes = sizeof(struct packet);
> -	else {
> +	if (fd != dev->data_fd && fd != pipe_inject[READ]) {
>  		log_warnx("%s: invalid fd: %d", __func__, fd);
>  		return (-1);
>  	}
>  
> +	/* If reading from the tap(4), try to right-size the read. */
> +	nbytes = MIN(chain_len, VIONET_HARD_MTU);
> +
>  	/*
>  	 * Try to pull a packet. The fd should be non-blocking and we don't
>  	 * care if we under-read (i.e. sz != nbytes) as we may not have a
> @@ -560,36 +540,18 @@ vionet_rx_copy(struct vionet_dev *dev, i
>  			log_warn("%s: error reading packet", __func__);
>  			return (-1);
>  		}
> +		log_warn("%s: EAGAIN", __func__);
>  		return (0);
> -	} else if (fd == dev->data_fd && sz < VIONET_MIN_TXLEN) {
> +	} else if (fd == dev->data_fd && (size_t)sz < VIONET_MIN_TXLEN) {
>  		/* If reading the tap(4), we should get valid ethernet. */
>  		log_warnx("%s: invalid packet size", __func__);
>  		return (0);
> -	} else if (fd == pipe_inject[READ] && sz != sizeof(struct packet)) {
> -		log_warnx("%s: invalid injected packet object (sz=%ld)",
> -		    __func__, sz);
> -		return (0);
> -	}
> -
> -	/* Decompose an injected packet, if that's what we're working with. */
> -	if (fd == pipe_inject[READ]) {
> -		pkt = (struct packet *)buf;
> -		if (pkt->buf == NULL) {
> -			log_warnx("%s: invalid injected packet, no buffer",
> -			    __func__);
> -			return (0);
> -		}
> -		if (sz < VIONET_MIN_TXLEN || sz > VIONET_MAX_TXLEN) {
> -			log_warnx("%s: invalid injected packet size", __func__);
> -			goto drop;
> -		}
> -		payload = pkt->buf;
> -		sz = (ssize_t)pkt->len;
>  	}
>  
>  	/* Validate the ethernet header, if required. */
>  	if (dev->lockedmac) {
> -		eh = (struct ether_header *)(payload);
> +		eh = (struct ether_header *)(payload
> +		    + sizeof(struct virtio_net_hdr));
>  		if (!ETHER_IS_MULTICAST(eh->ether_dhost) &&
>  		    memcmp(eh->ether_dhost, dev->mac,
>  		    sizeof(eh->ether_dhost)) != 0)
> @@ -610,10 +572,6 @@ vionet_rx_copy(struct vionet_dev *dev, i
>  	}
>  
>  drop:
> -	/* Free any injected packet buffer. */
> -	if (pkt != NULL)
> -		free(pkt->buf);
> -
>  	return (copied);
>  }
>  
> @@ -724,7 +682,6 @@ vionet_tx(struct virtio_dev *dev)
>  	struct virtio_vq_info *vq_info;
>  	struct ether_header *eh;
>  	struct iovec *iov;
> -	struct packet pkt;
>  	uint8_t status = 0;
>  
>  	status = vionet->cfg.device_status
> @@ -766,17 +723,15 @@ vionet_tx(struct virtio_dev *dev)
>  		if (iov->iov_len < sizeof(struct virtio_net_hdr)) {
>  			log_warnx("%s: invalid descriptor length", __func__);
>  			goto reset;
> -		} else if (iov->iov_len > sizeof(struct virtio_net_hdr)) {
> -			/* Chop off the virtio header, leaving packet data. */
> -			iov->iov_len -= sizeof(struct virtio_net_hdr);
> -			chain_len += iov->iov_len;
> -			iov->iov_base = hvaddr_mem(desc->addr +
> -			    sizeof(struct virtio_net_hdr), iov->iov_len);
> -			if (iov->iov_base == NULL)
> -				goto reset;
> -			iov_cnt++;
>  		}
>  
> +		/* Handel the first descriptor. */
> +		chain_len += iov->iov_len;
> +		iov->iov_base = hvaddr_mem(desc->addr, iov->iov_len);
> +		if (iov->iov_base == NULL)
> +			goto reset;
> +		iov_cnt++;
> +
>  		/*
>  		 * Walk the chain and collect remaining addresses and lengths.
>  		 */
> @@ -818,6 +773,8 @@ vionet_tx(struct virtio_dev *dev)
>  		 * for this inspection.
>  		 */
>  		iov = &iov_tx[0];
> +		if (iov->iov_len == sizeof(struct virtio_net_hdr))
> +			iov = &iov_tx[1];
>  		if (vionet->lockedmac) {
>  			if (iov->iov_len < ETHER_HDR_LEN) {
>  				log_warnx("%s: insufficient header data",
> @@ -839,6 +796,7 @@ vionet_tx(struct virtio_dev *dev)
>  			if (dhcpsz > 0) {
>  				log_debug("%s: detected dhcp request of %zu bytes",
>  				    __func__, dhcpsz);
> +				sz = chain_len;
>  				goto drop;
>  			}
>  		}
> @@ -859,20 +817,23 @@ drop:
>  
>  		/* Facilitate DHCP reply injection, if needed. */
>  		if (dhcpsz > 0) {
> -			pkt.buf = dhcppkt;
> -			pkt.len = dhcpsz;
> -			sz = write(pipe_inject[WRITE], &pkt, sizeof(pkt));
> -			if (sz == -1 && errno != EAGAIN) {
> +			struct virtio_net_hdr vhdr = { 0 };
> +			struct iovec iovec[2] = {
> +				{ .iov_base = &vhdr, .iov_len = sizeof(vhdr) },
> +				{ .iov_base = dhcppkt, .iov_len = dhcpsz }
> +			};
> +
> +			sz = writev(pipe_inject[WRITE], iovec, 2);
> +
> +			if (sz == -1 && errno != EAGAIN)
>  				log_warn("%s: packet injection", __func__);
> -				free(pkt.buf);
> -			} else if (sz == -1 && errno == EAGAIN) {
> +			else if (sz == -1 && errno == EAGAIN)
>  				log_debug("%s: dropping dhcp reply", __func__);
> -				free(pkt.buf);
> -			} else if (sz != sizeof(pkt)) {
> +			else if (sz != (ssize_t)(sizeof(vhdr) + dhcpsz))
>  				log_warnx("%s: failed packet injection",
>  				    __func__);
> -				free(pkt.buf);
> -			}
> +
> +			free(dhcppkt);
>  			log_debug("%s: injected dhcp reply with %ld bytes",
>  			    __func__, sz);
>  		}
> Index: usr.sbin/vmd/virtio.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v
> diff -u -p -r1.115 virtio.c
> --- usr.sbin/vmd/virtio.c	10 Jul 2024 09:27:33 -0000	1.115
> +++ usr.sbin/vmd/virtio.c	16 Aug 2024 12:00:46 -0000
> @@ -60,6 +60,7 @@ SLIST_HEAD(virtio_dev_head, virtio_dev) 
>  
>  #define MAXPHYS	(64 * 1024)	/* max raw I/O transfer size */
>  
> +#define VIRTIO_NET_F_CSUM	(1<<0)
>  #define VIRTIO_NET_F_MAC	(1<<5)
>  
>  #define VMMCI_F_TIMESYNC	(1<<0)
> @@ -584,6 +585,9 @@ virtio_init(struct vmd_vm *vm, int child
>  			/* MAC address has been assigned by the parent */
>  			memcpy(&dev->vionet.mac, &vmc->vmc_macs[i], 6);
>  			dev->vionet.cfg.device_feature = VIRTIO_NET_F_MAC;
> +
> +			/* enable checksum offloading */
> +			dev->vionet.cfg.device_feature |= VIRTIO_NET_F_CSUM;
>  
>  			dev->vionet.lockedmac =
>  			    vmc->vmc_ifflags[i] & VMIFF_LOCKED ? 1 : 0;
> Index: usr.sbin/vmd/virtio.h
> ===================================================================
> RCS file: /cvs/src/usr.sbin/vmd/virtio.h,v
> diff -u -p -r1.52 virtio.h
> --- usr.sbin/vmd/virtio.h	10 Jul 2024 09:27:33 -0000	1.52
> +++ usr.sbin/vmd/virtio.h	16 Aug 2024 12:00:43 -0000
> @@ -49,7 +49,7 @@
>  
>  /* Virtio network device is backed by tap(4), so inherit limits */
>  #define VIONET_HARD_MTU		TUNMRU
> -#define VIONET_MIN_TXLEN	ETHER_HDR_LEN
> +#define VIONET_MIN_TXLEN	(sizeof(struct virtio_net_hdr) + ETHER_HDR_LEN)
>  #define VIONET_MAX_TXLEN	VIONET_HARD_MTU + ETHER_HDR_LEN
>  
>  /* VMM Control Interface shutdown timeout (in seconds) */
>