From: Alexander Bluhm <bluhm@openbsd.org>
Subject: Re: ice(4) TSO support
To: tech@openbsd.org
Date: Wed, 18 Jun 2025 10:19:44 +0200

On Tue, Jun 17, 2025 at 11:16:58AM +0200, Stefan Sperling wrote:
> The patch below adds TSO (TCP Segmentation Offload) support to ice(4).
> 
> TSO has been tested both with and without ice-ddp firmware loaded.
> On my test device TSO works in both cases.
> 
> # ifconfig ice0 mtu 9710
> 
> $ iperf -c 192.168.99.1 --mss 512 
> 
> [  1] local 192.168.99.2 port 20750 connected with 192.168.99.1 port 5001
> [ ID] Interval       Transfer     Bandwidth
> [  1] 0.00-10.02 sec  11.5 GBytes  9.89 Gbits/sec
> 
> $ iperf -c 2001:db8::1 --mss 512 
> 
> [  1] local 2001:db8:: port 11682 connected with 2001:db8::1 port 5001
> [ ID] Interval       Transfer     Bandwidth
> [  1] 0.00-10.02 sec  11.5 GBytes  9.84 Gbits/sec
> 
> (This 25Gb E810 device is connected to a 10Gb switch.)
> 
> $ netstat -s | grep TSO
>                 0 output TSO large packets chopped in software
>                 5434855 output TSO large packets to device
>                 13327052 output TSO generated packets sent to network
>                 0 bad TSO packets dropped
> 
> The known limitation of ixl(4) where packets spanning more than 8 DMA
> segments cannot be processed by the device also affects ice(4).
> ice(4) will drop such packets rather than passing them to hardware.
> This avoids the following error after which the device becomes non-functional:
> 
>   ice0: Malicious Driver Detection Tx Descriptor check event 'Packet too small or too big' on Tx queue 0 PF# 0 VF# 0
>   ice_request_stack_reinit: not implemented
> 
> As far as I know, a general fix to make the forwarding path deal with
> this limitation is still in progress.
> Disabling TSO on routers seems to be a good idea in general anyway.
> 
> 
> ok?

I have tested it; we now reach 10 Gbit line speed when sending TCP.

Do we actually need ice_tso_detect_sparse()?  In theory
bus_dmamap_load_mbuf() should detect an unsupported mbuf layout and
return EFBIG, and the m_defrag() call in ice_load_mbuf() would then
fix it up.
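The pattern I have in mind is roughly the following; this is only a
sketch, the exact flags and error handling in ice_load_mbuf() may
differ:

int
ice_load_mbuf(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m)
{
	int error;

	/* First attempt; EFBIG means the chain has too many segments. */
	error = bus_dmamap_load_mbuf(dmat, map, m,
	    BUS_DMA_STREAMING | BUS_DMA_NOWAIT);
	if (error != EFBIG)
		return (error);

	/* Compact the mbuf chain into fewer segments and retry once. */
	error = m_defrag(m, M_DONTWAIT);
	if (error != 0)
		return (error);

	return (bus_dmamap_load_mbuf(dmat, map, m,
	    BUS_DMA_STREAMING | BUS_DMA_NOWAIT));
}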

We see problems in ixl(4) that jan@ and I do not fully understand
yet.  Does that approach also not work with ice(4)?  Have you seen
bad behavior?

Is the ICE_MAX_TSO_HDR_SEGS check stricter than _bus_dmamap_load_buffer()?

Is it possible to call m_defrag() to send the packet instead of
dropping it?
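
Untested sketch of what I mean for ice_start(), instead of the
unconditional drop; the existing bus_dmamap_sync() call would have to
move below this block because the map may get reloaded:

		if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) &&
		    ice_tso_detect_sparse(m, &ext, map)) {
			/* Try to compact the chain once before giving up. */
			bus_dmamap_unload(sc->sc_dmat, map);
			if (m_defrag(m, M_DONTWAIT) != 0 ||
			    bus_dmamap_load_mbuf(sc->sc_dmat, map, m,
			    BUS_DMA_STREAMING | BUS_DMA_NOWAIT) != 0) {
				tcpstat_inc(tcps_outbadtso);
				ifq->ifq_errors++;
				m_freem(m);
				continue;
			}
			if (ice_tso_detect_sparse(m, &ext, map)) {
				/* Still too many segments per burst; drop. */
				bus_dmamap_unload(sc->sc_dmat, map);
				tcpstat_inc(tcps_outbadtso);
				ifq->ifq_errors++;
				m_freem(m);
				continue;
			}
		}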

bluhm

> M  sys/dev/pci/if_ice.c  |  168+  22-
> 
> 1 file changed, 168 insertions(+), 22 deletions(-)
> 
> commit - dce1da27fcc7e851ab497aa79c511b8be816a365
> commit + 6f6738a01020d1372d5b357faef2b01a533c9c33
> blob - 21fbff1b1dbd5c169175759b4454ba7dd4bf7536
> blob + 3de2bd9d21f6e06772735a706cd3f5256fac7c78
> --- sys/dev/pci/if_ice.c
> +++ sys/dev/pci/if_ice.c
> @@ -84,6 +84,9 @@
>  
>  #include <netinet/in.h>
>  #include <netinet/if_ether.h>
> +#include <netinet/tcp.h>
> +#include <netinet/tcp_timer.h>
> +#include <netinet/tcp_var.h>
>  #include <netinet/udp.h>
>  
>  #define STRUCT_HACK_VAR_LEN
> @@ -102,6 +105,12 @@
>  #include "if_icereg.h"
>  #include "if_icevar.h"
>  
> +/*
> + * Our network stack cannot handle packets greater than MAXMCLBYTES.
> + * This interface cannot handle packets greater than ICE_TSO_SIZE.
> + */
> +CTASSERT(MAXMCLBYTES < ICE_TSO_SIZE);
> +
>  /**
>   * @var ice_driver_version
>   * @brief driver version string
> @@ -13641,11 +13650,97 @@ ice_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
>  	return error;
>  }
>  
> +/**
> + * ice_tso_detect_sparse - detect TSO packets with too many segments
> + *
> + * Hardware only transmits packets with a maximum of 8 descriptors. For TSO
> + * packets, hardware needs to be able to build the split packets using 8 or
> + * fewer descriptors. Additionally, the header must be contained within at
> + * most 3 descriptors.
> + *
> + * To verify this, we walk the headers to find out how many descriptors the
> + * headers require (usually 1). Then we ensure that, for each TSO segment, its
> + * data plus the headers are contained within 8 or fewer descriptors.
> + */
> +int
> +ice_tso_detect_sparse(struct mbuf *m, struct ether_extracted *ext,
> +    bus_dmamap_t map)
> +{
> +	int count, curseg, i, hlen, segsz, seglen, hdrs, maxsegs;
> +	bus_dma_segment_t *segs;
> +	uint64_t paylen, outlen, nsegs;
> +
> +	curseg = hdrs = 0;
> +
> +	hlen = ETHER_HDR_LEN + ext->iphlen + ext->tcphlen;
> +	outlen = MIN(9668, MAX(64, m->m_pkthdr.ph_mss));
> +	paylen = m->m_pkthdr.len - hlen;
> +	nsegs = (paylen + outlen - 1) / outlen;
> +
> +	segs = map->dm_segs;
> +
> +	/* First, count the number of descriptors for the header.
> +	 * Additionally, make sure it does not span more than 3 segments.
> +	 */
> +	i = 0;
> +	curseg = segs[0].ds_len;
> +	while (hlen > 0) {
> +		hdrs++;
> +		if (hdrs > ICE_MAX_TSO_HDR_SEGS)
> +			return (1);
> +		if (curseg == 0) {
> +			i++;
> +			if (i == nsegs)
> +				return (1);
> +
> +			curseg = segs[i].ds_len;
> +		}
> +		seglen = MIN(curseg, hlen);
> +		curseg -= seglen;
> +		hlen -= seglen;
> +	}
> +
> +	maxsegs = ICE_MAX_TX_SEGS - hdrs;
> +
> +	/* We must count the headers, in order to verify that they take up
> +	 * 3 or fewer descriptors. However, we don't need to check the data
> +	 * if the total segments is small.
> +	 */
> +	if (nsegs <= maxsegs)
> +		return (0);
> +
> +	count = 0;
> +
> +	/* Now check the data to make sure that each TSO segment is made up of
> +	 * no more than maxsegs descriptors. This ensures that hardware will
> +	 * be capable of performing TSO offload.
> +	 */
> +	while (paylen > 0) {
> +		segsz = m->m_pkthdr.ph_mss;
> +		while (segsz > 0 && paylen != 0) {
> +			count++;
> +			if (count > maxsegs)
> +				return (1);
> +			if (curseg == 0) {
> +				i++;
> +				if (i == nsegs)
> +					return (1);
> +				curseg = segs[i].ds_len;
> +			}
> +			seglen = MIN(curseg, segsz);
> +			segsz -= seglen;
> +			curseg -= seglen;
> +			paylen -= seglen;
> +		}
> +		count = 0;
> +	}
> +
> +	return (0);
> +}
> +
>  uint64_t
> -ice_tx_setup_offload(struct mbuf *m0, struct ice_tx_queue *txq,
> -    unsigned int prod)
> +ice_tx_setup_offload(struct mbuf *m0, struct ether_extracted *ext)
>  {
> -	struct ether_extracted ext;
>  	uint64_t offload = 0, hlen;
>  
>  #if NVLAN > 0
> @@ -13659,17 +13754,16 @@ ice_tx_setup_offload(struct mbuf *m0, struct ice_tx_qu
>  	    M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT|M_TCP_TSO))
>  		return offload;
>  
> -	ether_extract_headers(m0, &ext);
> -	hlen = ext.iphlen;
> +	hlen = ext->iphlen;
>  
> -	if (ext.ip4) {
> +	if (ext->ip4) {
>  		if (ISSET(m0->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT))
>  			offload |= ICE_TX_DESC_CMD_IIPT_IPV4_CSUM <<
>  			    ICE_TXD_QW1_CMD_S;
>  		else
>  			offload |= ICE_TX_DESC_CMD_IIPT_IPV4 <<
>  			    ICE_TXD_QW1_CMD_S;
> -	} else if (ext.ip6)
> +	} else if (ext->ip6)
>  		offload |= ICE_TX_DESC_CMD_IIPT_IPV6 << ICE_TXD_QW1_CMD_S;
>  	else
>  		return offload;
> @@ -13679,13 +13773,13 @@ ice_tx_setup_offload(struct mbuf *m0, struct ice_tx_qu
>  	offload |= ((hlen >> 2) << ICE_TX_DESC_LEN_IPLEN_S) <<
>  	    ICE_TXD_QW1_OFFSET_S;
>  
> -	if (ext.tcp && ISSET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT)) {
> +	if (ext->tcp && ISSET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT)) {
>  		offload |= ICE_TX_DESC_CMD_L4T_EOFT_TCP << ICE_TXD_QW1_CMD_S;
> -		offload |= ((uint64_t)(ext.tcphlen >> 2) <<
> +		offload |= ((uint64_t)(ext->tcphlen >> 2) <<
>  		    ICE_TX_DESC_LEN_L4_LEN_S) << ICE_TXD_QW1_OFFSET_S;
> -	} else if (ext.udp && ISSET(m0->m_pkthdr.csum_flags, M_UDP_CSUM_OUT)) {
> +	} else if (ext->udp && ISSET(m0->m_pkthdr.csum_flags, M_UDP_CSUM_OUT)) {
>  		offload |= ICE_TX_DESC_CMD_L4T_EOFT_UDP << ICE_TXD_QW1_CMD_S;
> -		offload |= ((uint64_t)(sizeof(*ext.udp) >> 2) <<
> +		offload |= ((uint64_t)(sizeof(*ext->udp) >> 2) <<
>  		    ICE_TX_DESC_LEN_L4_LEN_S) << ICE_TXD_QW1_OFFSET_S;
>  	}
>  
> @@ -13711,6 +13805,36 @@ ice_load_mbuf(bus_dma_tag_t dmat, bus_dmamap_t map, st
>  }
>  
>  void
> +ice_set_tso_context(struct mbuf *m0, struct ice_tx_queue *txq,
> +    unsigned int prod, struct ether_extracted *ext)
> +{
> +	struct ice_tx_desc *ring;
> +	struct ice_tx_ctx_desc *txd;
> +	uint64_t qword1 = 0, paylen, outlen;
> +
> +	/*
> +	 * The MSS should not be set to a lower value than 64.
> +	 */
> +	outlen = MAX(64, m0->m_pkthdr.ph_mss);
> +	paylen = m0->m_pkthdr.len - ETHER_HDR_LEN - ext->iphlen - ext->tcphlen;
> +
> +	ring = ICE_DMA_KVA(&txq->tx_desc_mem);
> +	txd = (struct ice_tx_ctx_desc *)&ring[prod];
> +
> +	qword1 |= ICE_TX_DESC_DTYPE_CTX;
> +	qword1 |= ICE_TX_CTX_DESC_TSO << ICE_TXD_CTX_QW1_CMD_S;
> +	qword1 |= paylen << ICE_TXD_CTX_QW1_TSO_LEN_S;
> +	qword1 |= outlen << ICE_TXD_CTX_QW1_MSS_S;
> +
> +	htolem32(&txd->tunneling_params, 0);
> +	htolem16(&txd->l2tag2, 0);
> +	htolem16(&txd->rsvd, 0);
> +	htolem64(&txd->qw1, qword1);
> +
> +	tcpstat_add(tcps_outpkttso, (paylen + outlen - 1) / outlen);
> +}
> +
> +void
>  ice_start(struct ifqueue *ifq)
>  {
>  	struct ifnet *ifp = ifq->ifq_if;
> @@ -13727,6 +13851,7 @@ ice_start(struct ifqueue *ifq)
>  	uint64_t offload;
>  	uint64_t paddr;
>  	uint64_t seglen;
> +	struct ether_extracted ext;
>  #if NBPFILTER > 0
>  	caddr_t if_bpf;
>  #endif
> @@ -13759,17 +13884,22 @@ ice_start(struct ifqueue *ifq)
>  		if (m == NULL)
>  			break;
>  
> -		offload = ice_tx_setup_offload(m, txq, prod);
> +		ether_extract_headers(m, &ext);
> +		offload = ice_tx_setup_offload(m, &ext);
>  
> +		if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO)) {
> +			if (ext.tcp == NULL || m->m_pkthdr.ph_mss == 0 ||
> +			    m->m_pkthdr.ph_mss > ICE_TXD_CTX_MAX_MSS) {
> +				tcpstat_inc(tcps_outbadtso);
> +				ifq->ifq_errors++;
> +				m_freem(m);
> +				continue;
> +			}
> +		}
> +
>  		txm = &txq->tx_map[prod];
>  		map = txm->txm_map;
> -#if 0
> -		if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO)) {
> -			prod++;
> -			prod &= mask;
> -			free--;
> -		}
> -#endif
> +
>  		if (ice_load_mbuf(sc->sc_dmat, map, m) != 0) {
>  			ifq->ifq_errors++;
>  			m_freem(m);
> @@ -13779,6 +13909,21 @@ ice_start(struct ifqueue *ifq)
>  		bus_dmamap_sync(sc->sc_dmat, map, 0,
>  		    map->dm_mapsize, BUS_DMASYNC_PREWRITE);
>  
> +		if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO)) {
> +			if (ice_tso_detect_sparse(m, &ext, map)) {
> +				bus_dmamap_unload(sc->sc_dmat, map);
> +				tcpstat_inc(tcps_outbadtso);
> +				ifq->ifq_errors++;
> +				m_freem(m);
> +				continue;
> +			}
> +
> +			ice_set_tso_context(m, txq, prod, &ext);
> +			prod++;
> +			prod &= mask;
> +			free--;
> +		}
> +
>  		for (i = 0; i < map->dm_nsegs; i++) {
>  			txd = &ring[prod];
>  
> @@ -29345,8 +29490,8 @@ ice_tx_queues_alloc(struct ice_softc *sc)
>  
>  		for (j = 0; j < sc->isc_ntxd[i]; j++) {
>  			map = &txq->tx_map[j];
> -			if (bus_dmamap_create(sc->sc_dmat, ICE_MAX_FRAME_SIZE,
> -			    ICE_MAX_TX_SEGS, ICE_MAX_FRAME_SIZE, 0,
> +			if (bus_dmamap_create(sc->sc_dmat, MAXMCLBYTES,
> +			    ICE_MAX_TX_SEGS, ICE_MAX_DMA_SEG_SIZE, 0,
>  			    BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
>  			    &map->txm_map) != 0) {
>  				printf("%s: could not allocate Tx DMA map\n",
> @@ -30276,7 +30421,8 @@ ice_attach_hook(struct device *self)
>  #endif
>  	ifp->if_capabilities |= IFCAP_CSUM_IPv4 |
>  	    IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
> -	    IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
> +	    IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6 |
> +	    IFCAP_TSOv4 | IFCAP_TSOv6;
>  
>  	if_attach(ifp);
>  	ether_ifattach(ifp);
> 
>